pull/314/merge
ben 4 months ago committed by GitHub
commit 8d1d5af1aa

@ -23,11 +23,11 @@ poetry run 01 --profile <profile_name>
### Standard Profiles
`default.py` is the default profile that is used when no profile is specified. The default TTS is OpenAI.
`default.py` is the default profile that is used when no profile is specified. The default TTS service is Elevenlabs.
`fast.py` uses elevenlabs and groq, which are the fastest providers.
`fast.py` uses Cartesia for TTS and Cerebras Llama3.1-8b, which are the fastest providers.
`local.py` uses coqui TTS and runs the --local explorer from Open Interpreter.
`local.py` requires additional setup to be used with LiveKit. Uses faster-whisper for STT, ollama/codestral for LLM (default), and piper for TTS (default).
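Under the hood, the start script loads the selected profile as a plain Python module and reads its `interpreter` object. The sketch below mirrors that importlib mechanism from the start script; the paths and profile name are illustrative.
```python
# Minimal sketch of how a profile file is picked up (mirrors the importlib
# logic in the start script). Paths and the profile name are illustrative.
import importlib.util
import os

profiles_dir = os.path.join("software", "source", "server", "profiles")
profile_path = os.path.join(profiles_dir, "fast.py")  # e.g. from --profile fast

spec = importlib.util.spec_from_file_location("profile", profile_path)
profile_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(profile_module)

# Every profile is expected to expose a configured `interpreter` object.
interpreter = profile_module.interpreter
print(interpreter.tts)
```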
### Custom Profiles
@ -46,38 +46,16 @@ poetry run 01 --profile <profile_name>
### Example Profile
````python
from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
from interpreter import Interpreter
interpreter = Interpreter()
# This is an Open Interpreter compatible profile.
# Visit https://01.openinterpreter.com/profile for all options.
# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
# 01 supports OpenAI, ElevenLabs, Cartesia, and Coqui (Local) TTS providers
# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
interpreter.tts = "openai"
# Connect your 01 to a language model
interpreter.llm.model = "gpt-4o"
interpreter.llm.context_window = 100000
interpreter.llm.max_tokens = 4096
# interpreter.llm.api_key = "<your_openai_api_key_here>"
# Tell your 01 where to find and save skills
interpreter.computer.skills.path = "./skills"
# Extra settings
interpreter.computer.import_computer_api = True
interpreter.computer.import_skills = True
interpreter.computer.run("python", "computer") # This will trigger those imports
interpreter.auto_run = True
interpreter.loop = True
interpreter.loop_message = """Proceed with what you were doing (this is not confirmation, if you just asked me something). You CAN run code on my machine. If you want to run code, start your message with "```"! If the entire task is done, say exactly 'The task is done.' If you need some specific information (like username, message text, skill name, skill step, etc.) say EXACTLY 'Please provide more information.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going. CRITICAL: REMEMBER TO FOLLOW ALL PREVIOUS INSTRUCTIONS. If I'm teaching you something, remember to run the related `computer.skills.new_skill` function."""
interpreter.loop_breakers = [
"The task is done.",
"The task is impossible.",
"Let me know what you'd like to do next.",
"Please provide more information.",
]
interpreter.tts = "elevenlabs"
interpreter.stt = "deepgram"
# Set the identity and personality of your 01
interpreter.system_message = """
@ -89,17 +67,37 @@ You can install new packages.
Be concise. Your messages are being read aloud to the user. DO NOT MAKE PLANS. RUN CODE QUICKLY.
Try to spread complex tasks over multiple code blocks. Don't try to do complex tasks in one go.
Manually summarize text."""
# Add additional instructions for the 01
interpreter.instructions = "Be very concise in your responses."
# Connect your 01 to a language model
interpreter.model = "claude-3-5-sonnet-20240620"
interpreter.provider = "anthropic"
interpreter.max_tokens = 4096
interpreter.temperature = 0
interpreter.api_key = "<your_anthropic_api_key_here>"
# Extra settings
interpreter.tools = ["interpreter", "editor"] # Enabled tool modules
interpreter.auto_run = True # Whether to auto-run tools without confirmation
interpreter.tool_calling = True # Whether to allow tool/function calling
interpreter.allowed_paths = [] # List of allowed paths
interpreter.allowed_commands = [] # List of allowed commands
````
### Hosted LLMs
The default LLM for 01 is GPT-4-Turbo. You can find this in the default profile in `software/source/server/profiles/default.py`.
The default LLM for 01 is Claude 3.5 Sonnet. You can find this in the default profile in `software/source/server/profiles/default.py`.
The fast profile uses Llama3-8b served by Groq. You can find this in the fast profile in `software/source/server/profiles/fast.py`.
The fast profile uses Llama3.1-8b served by Cerebras. You can find this in the fast profile in `software/source/server/profiles/fast.py`.
```python
# Set your profile with a hosted LLM
interpreter.llm.model = "gpt-4o"
interpreter.model = "claude-3-5-sonnet-20240620"
interpreter.provider = "anthropic"
```
### Local LLMs
@ -110,7 +108,7 @@ Using the local profile launches the Local Explorer where you can select your in
```python
# Set your profile with a local LLM
interpreter.llm.model = "ollama/codestral"
interpreter.model = "ollama/codestral"
# You can also use the Local Explorer to interactively select your model
interpreter.local_setup()
@ -118,26 +116,30 @@ interpreter.local_setup()
### Hosted TTS
01 supports OpenAI and Elevenlabs for hosted TTS.
01 supports OpenAI, Elevenlabs, and Cartesia for hosted TTS.
```python
# Set your profile with a hosted TTS service
interpreter.tts = "elevenlabs"
```
### Local TTS
### Local TTS and STT with LiveKit
For local TTS, Coqui is used.
We recommend having Docker installed for the easiest setup. Local TTS and STT rely on the [openedai-speech](https://github.com/matatonic/openedai-speech?tab=readme-ov-file) and [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) repositories, respectively.
#### Local TTS
1. Clone the [openedai-speech](https://github.com/matatonic/openedai-speech?tab=readme-ov-file) repository
2. Follow the Docker Image instructions for your system. By default, run `docker compose -f docker-compose.min.yml up --publish 9001:8000` in the repository root.
3. Set your profile to use the local TTS service:
```python
# Set your profile with a local TTS service
interpreter.tts = "coqui"
interpreter.tts = "local"
```
<Note>
When using the Livekit server, the interpreter.tts setting in your profile
will be ignored. The Livekit server currently only works with Deepgram for
speech recognition and Eleven Labs for text-to-speech. We are working on
introducing all-local functionality for the Livekit server as soon as
possible.
</Note>
#### Local STT
1. Clone the [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) repository
2. Follow the Docker Compose Quick Start instructions for your system.
3. Run `docker run --publish 9002:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER__MODEL=Systran/faster-whisper-small --detach fedirz/faster-whisper-server:latest-cpu` to publish the server on port 9002 instead of the default 8000 (to avoid clashing with the TTS service).
4. Set your profile to use the local STT service:
```python
interpreter.stt = "local"
```
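Before launching 01 with the local setup, it can help to confirm that both containers are actually listening. The sketch below only checks that the host ports used in the steps above (9001 for TTS, 9002 for STT) accept a TCP connection; adjust the ports if you published the containers elsewhere.
```python
# Sanity-check that the local TTS/STT containers are reachable.
# Ports follow the docker commands above: 9001 = openedai-speech (TTS),
# 9002 = faster-whisper-server (STT).
import socket

def port_open(host: str, port: int, timeout: float = 2.0) -> bool:
    """Return True if a TCP connection to host:port succeeds."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

for name, port in [("openedai-speech (TTS)", 9001), ("faster-whisper-server (STT)", 9002)]:
    status = "reachable" if port_open("localhost", port) else "NOT reachable"
    print(f"{name} on port {port}: {status}")
```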

@ -69,6 +69,18 @@ Replace the placeholders with your actual API keys.
### Starting the Server
**To use the mobile app, run the following command:**
```bash
poetry run 01 --server livekit --qr --expose
```
To customize the profile, append the `--profile` flag followed by the profile name:
```bash
poetry run 01 --server livekit --qr --expose --profile fast
```
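With `--qr`, the start script encodes the LiveKit URL and a participant token as JSON and renders a terminal QR code for the mobile app to scan. A minimal sketch of that step follows; the URL is a placeholder, and `devkey`/`secret` are the LiveKit dev-mode credentials the script uses.
```python
# Sketch of the QR payload the mobile app scans (mirrors the start script).
import json

import segno
from livekit import api

lk_url = "http://192.168.1.10:10101"  # placeholder; the script detects or forwards this

token = (
    api.AccessToken("devkey", "secret")       # dev-mode credentials
    .with_identity("Participant")
    .with_name("You")
    .with_grants(api.VideoGrants(room_join=True, room="my-room"))
    .to_jwt()
)

content = json.dumps({"livekit_server": lk_url, "token": token})
segno.make(content).terminal(compact=True)  # prints the QR code to the terminal
```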
To start the Livekit server, run the following command:
```bash
@ -87,18 +99,10 @@ To expose over the internet via ngrok
poetry run 01 --server livekit --expose
```
To use the mobile app over the web, use both flags:
```bash
poetry run 01 --server livekit --qr --expose
```
<Note>
Currently, our Livekit server only works with Deepgram and Eleven Labs. We are
working to introduce all-local functionality as soon as possible. By setting
your profile (see [Configure Your Profile](/software/configure)), you can
still change your LLM to be a local LLM, but the `interpreter.tts` value will
be ignored for the Livekit server.
The Livekit server now supports local STT and TTS for a fully local pipeline.
Setup instructions are provided in the [configuring your 01](/server/configure#local-tts-and-stt-with-livekit) section.
</Note>
## Livekit vs. Light Server

@ -1,62 +1,68 @@
from yaspin import yaspin
spinner = yaspin()
spinner.start()
import subprocess
import time
import os
import typer
import ngrok
import platform
import threading
import os
import importlib
from source.server.server import start_server
import subprocess
import webview
import socket
import json
import webbrowser
import psutil
import ngrok
import segno
import json
from pathlib import Path
from livekit import api
import time
from dotenv import load_dotenv
import signal
from source.server.livekit.worker import main as worker_main
from source.server.livekit.multimodal import main as multimodal_main
import warnings
import requests
load_dotenv()
from dotenv import load_dotenv
system_type = platform.system()
load_dotenv()
system_type = platform.system()
app = typer.Typer()
ROOM_NAME = "my-room"
def pre_clean_process(port):
"""Find and kill process running on specified port"""
for proc in psutil.process_iter(['pid', 'name', 'connections']):
try:
for conn in proc.connections():
if conn.laddr.port == port:
print(f"Killing process {proc.pid} ({proc.name()}) on port {port}")
proc.terminate()
proc.wait()
return True
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
return False
def cleanup_processes(processes):
for process in processes:
if process.poll() is None: # if process is still running
process.terminate()
process.wait() # wait for process to terminate
@app.command()
def run(
server: str = typer.Option(
None,
"--server",
help="Run server (accepts `livekit` or `light`)",
),
server_host: str = typer.Option(
lk_host: str = typer.Option(
"0.0.0.0",
"--server-host",
help="Specify the server host where the server will deploy",
"--lk-host",
help="Specify the server host where the livekit server will deploy. For other devices on your network to connect to it, keep it on default `0.0.0.0`",
),
server_port: int = typer.Option(
lk_port: int = typer.Option(
10101,
"--server-port",
help="Specify the server port where the server will deploy",
),
expose: bool = typer.Option(False, "--expose", help="Expose server over the internet"),
domain: str = typer.Option(None, "--domain", help="Use `--expose` with a custom ngrok domain"),
client: str = typer.Option(None, "--client", help="Run client of a particular type. Accepts `light-python`, defaults to `light-python`"),
server_url: str = typer.Option(
None,
"--server-url",
help="Specify the server URL that the --client should expect. Defaults to server-host and server-port",
),
qr: bool = typer.Option(
False, "--qr", help="Display QR code containing the server connection information (will be ngrok url if `--expose` is used)"
"--lk-port",
help="Specify the server port where the livekit server will deploy",
),
domain: str = typer.Option(None, "--domain", help="Pass in a custom ngrok domain to expose the livekit server over the internet"),
client: str = typer.Option(None, "--client", help="Run client of a particular type. Accepts `meet` or `mobile`, defaults to `meet`"),
profiles: bool = typer.Option(
False,
"--profiles",
@ -76,19 +82,12 @@ def run(
False,
"--multimodal",
help="Run the multimodal agent",
),
)
):
threads = []
# Handle `01` with no arguments, which should start server + client
if not server and not client:
server = "light"
client = "light-python"
### PROFILES
profiles_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "source", "server", "profiles")
# preprocess ports
ports = [10101, 8000, 3000]
for port in ports:
pre_clean_process(port)
if profiles:
if platform.system() == "Windows":
@ -101,6 +100,8 @@ def run(
subprocess.Popen(['open', profiles_dir])
exit(0)
profiles_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "source", "server", "profiles")
if profile:
if not os.path.isfile(profile):
profile = os.path.join(profiles_dir, profile)
@ -110,192 +111,66 @@ def run(
print(f"Invalid profile path: {profile}")
exit(1)
# Load the profile module from the provided path
spec = importlib.util.spec_from_file_location("profile", profile)
profile_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(profile_module)
# Get the interpreter from the profile
interpreter = profile_module.interpreter
### SERVER
if system_type == "Windows":
server_host = "localhost"
if not server_url:
server_url = f"{server_host}:{server_port}"
if server:
### LIGHT SERVER (required by livekit)
if server == "light":
light_server_port = server_port
light_server_host = server_host
voice = True # The light server will support voice
elif server == "livekit":
# The light server should run at a different port if we want to run a livekit server
spinner.stop()
print(f"Starting light server (required for livekit server) on localhost, on the port before `--server-port` (port {server_port-1}), unless the `AN_OPEN_PORT` env var is set.")
print(f"The livekit server will be started on port {server_port}.")
light_server_port = os.getenv('AN_OPEN_PORT', server_port-1)
light_server_host = "localhost"
voice = False # The light server will NOT support voice. It will just run Open Interpreter. The Livekit server will handle voice
server_thread = threading.Thread(
target=start_server,
args=(
light_server_host,
light_server_port,
interpreter,
voice,
debug
),
)
spinner.stop()
print("Starting server...")
server_thread.start()
threads.append(server_thread)
if server == "livekit":
### LIVEKIT SERVER
OI_CMD = f"interpreter --serve --profile {profile}"
oi_server = subprocess.Popen(OI_CMD, shell=True)
print("Interpreter server started")
def run_command(command):
subprocess.run(command, shell=True, check=True)
# Start the livekit server
print("Starting livekit server...")
if debug:
command = f'livekit-server --dev --bind "{server_host}" --port {server_port}'
else:
command = f'livekit-server --dev --bind "{server_host}" --port {server_port} > /dev/null 2>&1'
livekit_thread = threading.Thread(
target=run_command, args=(command,)
)
time.sleep(7)
livekit_thread.start()
threads.append(livekit_thread)
local_livekit_url = f"ws://{server_host}:{server_port}"
if expose:
### EXPOSE OVER INTERNET
listener = ngrok.forward(f"{server_host}:{server_port}", authtoken_from_env=True, domain=domain)
url = listener.url()
LK_CMD = f"livekit-server --dev --bind {lk_host} --port {lk_port}"
else:
LK_CMD = f"livekit-server --dev --bind {lk_host} --port {lk_port} > /dev/null 2>&1"
### GET LOCAL URL
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect(("8.8.8.8", 80))
ip_address = s.getsockname()[0]
s.close()
url = f"http://{ip_address}:{server_port}"
if server == "livekit":
print("Livekit server will run at:", url)
lk_server = subprocess.Popen(LK_CMD, shell=True)
print("Livekit server started")
time.sleep(2)
lk_url = f"http://{lk_host}:{lk_port}"
participant_token = str(api.AccessToken('devkey', 'secret') \
.with_identity("Participant") \
.with_name("You") \
.with_grants(api.VideoGrants(
room_join=True,
room=ROOM_NAME,))
.to_jwt())
### CLIENT
if client:
module = importlib.import_module(
f".clients.{client}.client", package="source"
)
client_thread = threading.Thread(target=module.run, args=[server_url, debug])
spinner.stop()
print("Starting client...")
client_thread.start()
threads.append(client_thread)
### WAIT FOR THREADS TO FINISH, HANDLE CTRL-C
# Signal handler for termination signals
def signal_handler(sig, frame):
print("Termination signal received. Shutting down...")
for thread in threads:
if thread.is_alive():
# Kill subprocess associated with thread
subprocess.run(f"pkill -P {os.getpid()}", shell=True)
os._exit(0)
# Register signal handler for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
processes = [lk_server, oi_server]
# Verify the server is running
for attempt in range(10):
try:
response = requests.get(url)
status = "OK" if response.status_code == 200 else "Not OK"
if status == "OK":
break
except requests.RequestException:
pass
time.sleep(1)
else:
raise Exception(f"Server at {url} failed to respond after 10 attempts")
if client == 'mobile':
listener = ngrok.forward(f"{lk_host}:{lk_port}", authtoken_from_env=True, domain=domain)
lk_url = listener.url()
print(f"Livekit server forwarded to: {lk_url}")
### DISPLAY QR CODE
if qr:
def display_qr_code():
time.sleep(10)
content = json.dumps({"livekit_server": url})
print("Scan the QR code below with your mobile app to connect to the livekit server.")
content = json.dumps({"livekit_server": lk_url, "token": participant_token})
qr_code = segno.make(content)
qr_code.terminal(compact=True)
else: # meet client
# Get the path to the meet client directory
meet_client_path = Path(__file__).parent / "source" / "clients" / "meet"
qr_thread = threading.Thread(target=display_qr_code)
qr_thread.start()
threads.append(qr_thread)
print("Starting Next.js dev server...")
next_server = subprocess.Popen(["pnpm", "dev"], cwd=meet_client_path,)
print("Next.js dev server started")
### START LIVEKIT WORKER
if server == "livekit":
time.sleep(1)
# These are needed to communicate with the worker's entrypoint
os.environ['INTERPRETER_SERVER_HOST'] = light_server_host
os.environ['INTERPRETER_SERVER_PORT'] = str(light_server_port)
os.environ['01_TTS'] = interpreter.tts
os.environ['01_STT'] = interpreter.stt
time.sleep(2)
meet_url = f'http://localhost:3000/custom?liveKitUrl={lk_url.replace("http", "ws")}&token={participant_token}'
print(f"\nOpening meet interface at: {meet_url}")
webbrowser.open(meet_url)
token = str(api.AccessToken('devkey', 'secret') \
.with_identity("identity") \
.with_name("my name") \
.with_grants(api.VideoGrants(
room_join=True,
room="my-room",
)).to_jwt())
# meet_url = f'http://localhost:3000/custom?liveKitUrl={url.replace("http", "ws")}&token={token}\n\n'
meet_url = f'https://meet.livekit.io/custom?liveKitUrl={url.replace("http", "ws")}&token={token}\n\n'
print("\n")
print("For debugging, you can join a video call with your assistant. Click the link below, then send a chat message that says {CONTEXT_MODE_OFF}, then begin speaking:")
print(meet_url)
processes.append(next_server)
for attempt in range(30):
try:
print("Starting worker...")
if multimodal:
multimodal_main(local_livekit_url)
multimodal_main(lk_url)
else:
worker_main(local_livekit_url)
except KeyboardInterrupt:
print("Exiting.")
raise
except Exception as e:
print(f"Error occurred: {e}")
print("Retrying...")
time.sleep(1)
# Wait for all threads to complete
for thread in threads:
thread.join()
worker_main(lk_url)
print("Worker started")
except KeyboardInterrupt:
# On KeyboardInterrupt, send SIGINT to self
os.kill(os.getpid(), signal.SIGINT)
print("\nReceived interrupt signal, shutting down...")
finally:
print("Cleaning up processes...")
cleanup_processes(processes)

software/poetry.lock (generated): file diff suppressed because one or more lines are too long

@ -13,20 +13,21 @@ readme = "../README.md"
[tool.poetry.dependencies]
python = ">=3.10,<3.12"
livekit = "^0.17.2"
livekit-agents = "^0.10.0"
livekit-agents = "^0.12.0"
livekit-plugins-deepgram = "^0.6.7"
livekit-plugins-openai = "^0.10.1"
livekit-plugins-silero = "^0.7.1"
livekit-plugins-elevenlabs = "^0.7.5"
livekit-plugins-cartesia = "^0.4.2"
segno = "^1.6.1"
open-interpreter = {extras = ["os", "server"], version = "^0.3.12"} # You should add a "browser" extra, so selenium isn't in the main package
open-interpreter = { git = "https://github.com/OpenInterpreter/open-interpreter.git", branch = "development" }
ngrok = "^1.4.0"
realtimetts = {extras = ["all"], version = "^0.4.5"}
realtimestt = "^0.2.41"
pynput = "^1.7.7"
yaspin = "^3.0.2"
pywebview = "^5.2"
livekit-plugins-cartesia = "^0.4.2"
resampy = "^0.4.2"
[build-system]
requires = ["poetry-core"]

@ -0,0 +1,30 @@
# 1. Copy this file and rename it to .env.local
# 2. Update the environment variables below.
# REQUIRED SETTINGS
# #################
# If you are using LiveKit Cloud, the API key and secret can be generated from the Cloud Dashboard.
LIVEKIT_API_KEY=
LIVEKIT_API_SECRET=
# URL pointing to the LiveKit server. (example: `wss://my-livekit-project.livekit.cloud`)
LIVEKIT_URL=http://localhost:10101
# OPTIONAL SETTINGS
# #################
# Recording
# S3_KEY_ID=
# S3_KEY_SECRET=
# S3_ENDPOINT=
# S3_BUCKET=
# S3_REGION=
# PUBLIC
# Uncomment the settings menu when using LiveKit Cloud; it enables Krisp noise filters.
# NEXT_PUBLIC_SHOW_SETTINGS_MENU=true
# NEXT_PUBLIC_LK_RECORD_ENDPOINT=/api/record
# Optional, to pipe logs to datadog
# NEXT_PUBLIC_DATADOG_CLIENT_TOKEN=client-token
# NEXT_PUBLIC_DATADOG_SITE=datadog-site
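
As a quick sanity check before starting the Meet client, you can confirm that the required values above are filled in. A minimal sketch, assuming `python-dotenv` is available (the Python side of this repo already uses it):
```python
# Check that the required LiveKit settings in .env.local are present.
from dotenv import dotenv_values

config = dotenv_values(".env.local")  # path is relative to the meet client directory
required = ["LIVEKIT_API_KEY", "LIVEKIT_API_SECRET", "LIVEKIT_URL"]

missing = [key for key in required if not config.get(key)]
if missing:
    raise SystemExit(f"Missing required settings in .env.local: {', '.join(missing)}")
print("LiveKit settings look complete.")
```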

@ -0,0 +1,3 @@
{
"extends": "next/core-web-vitals"
}

Binary file added (not shown): 832 B

Binary file added (not shown): 100 KiB

@ -0,0 +1,31 @@
<svg width="270" height="151" viewBox="0 0 270 151" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect width="270" height="151" fill="#070707"/>
<rect x="8.5" y="8.5" width="192" height="134" rx="1.5" fill="#131313"/>
<rect x="8.5" y="8.5" width="192" height="134" rx="1.5" stroke="#1F1F1F"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M101.167 71.4998C101.167 69.6589 102.66 68.1665 104.501 68.1665C106.342 68.1665 107.834 69.6589 107.834 71.4998C107.834 73.3408 106.342 74.8332 104.501 74.8332C102.66 74.8332 101.167 73.3408 101.167 71.4998Z" fill="#666666"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M97.834 82.1665C97.834 78.4846 100.819 75.4998 104.501 75.4998C108.183 75.4998 111.167 78.4846 111.167 82.1665V82.8332H97.834V82.1665Z" fill="#666666"/>
<rect x="209.5" y="8.5" width="52" height="38.6667" rx="1.5" fill="#131313"/>
<rect x="209.5" y="8.5" width="52" height="38.6667" rx="1.5" stroke="#1F1F1F"/>
<g clip-path="url(#clip0_834_19648)">
<path fill-rule="evenodd" clip-rule="evenodd" d="M232.167 23.8333C232.167 21.9924 233.66 20.5 235.501 20.5C237.342 20.5 238.834 21.9924 238.834 23.8333C238.834 25.6743 237.342 27.1667 235.501 27.1667C233.66 27.1667 232.167 25.6743 232.167 23.8333Z" fill="#666666"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M228.834 34.5C228.834 30.8181 231.819 27.8333 235.501 27.8333C239.183 27.8333 242.167 30.8181 242.167 34.5V35.1667H228.834V34.5Z" fill="#666666"/>
</g>
<rect x="209.5" y="56.1665" width="52" height="38.6667" rx="1.5" fill="#131313"/>
<rect x="209.5" y="56.1665" width="52" height="38.6667" rx="1.5" stroke="#CCCCCC"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M232.167 71.4998C232.167 69.6589 233.66 68.1665 235.501 68.1665C237.342 68.1665 238.834 69.6589 238.834 71.4998C238.834 73.3408 237.342 74.8332 235.501 74.8332C233.66 74.8332 232.167 73.3408 232.167 71.4998Z" fill="#CCCCCC"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M228.834 82.1665C228.834 78.4846 231.819 75.4998 235.501 75.4998C239.183 75.4998 242.167 78.4846 242.167 82.1665V82.8332H228.834V82.1665Z" fill="#CCCCCC"/>
<rect x="209.5" y="103.833" width="52" height="38.6667" rx="1.5" fill="#131313"/>
<rect x="209.5" y="103.833" width="52" height="38.6667" rx="1.5" stroke="#1F1F1F"/>
<g clip-path="url(#clip1_834_19648)">
<path fill-rule="evenodd" clip-rule="evenodd" d="M232.167 119.167C232.167 117.326 233.66 115.833 235.501 115.833C237.342 115.833 238.834 117.326 238.834 119.167C238.834 121.008 237.342 122.5 235.501 122.5C233.66 122.5 232.167 121.008 232.167 119.167Z" fill="#666666"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M228.834 129.833C228.834 126.152 231.819 123.167 235.501 123.167C239.183 123.167 242.167 126.152 242.167 129.833V130.5H228.834V129.833Z" fill="#666666"/>
</g>
<defs>
<clipPath id="clip0_834_19648">
<rect width="16" height="16" fill="white" transform="translate(227.5 19.8335)"/>
</clipPath>
<clipPath id="clip1_834_19648">
<rect width="16" height="16" fill="white" transform="translate(227.5 115.167)"/>
</clipPath>
</defs>
</svg>


@ -0,0 +1,33 @@
name: Sync main to sandbox-production
on:
push:
branches:
- main
permissions:
contents: write
pull-requests: write
jobs:
sync:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch all history so we can force push
- name: Set up Git
run: |
git config --global user.name 'github-actions[bot]'
git config --global user.email 'github-actions[bot]@livekit.io'
- name: Sync to sandbox-production
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
git checkout sandbox-production || git checkout -b sandbox-production
git merge --strategy-option theirs main
git push origin sandbox-production

@ -0,0 +1,38 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.js
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# local env files
.env.local
.env.development.local
.env.test.local
.env.production.local
# vercel
.vercel
# typescript
*.tsbuildinfo

@ -0,0 +1,3 @@
.github/
.next/
node_modules/

@ -0,0 +1,7 @@
{
"singleQuote": true,
"trailingComma": "all",
"semi": true,
"tabWidth": 2,
"printWidth": 100
}

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@ -0,0 +1,16 @@
# LiveKit Meet Client
This is a clone of the LiveKit Meet open source video conferencing app built on [LiveKit Components](https://github.com/livekit/components-js), [LiveKit Cloud](https://livekit.io/cloud), and Next.js. It serves as a simple web interface to the 01 with screen share and camera functionality, and it can be run with a fully local model, STT, and TTS.
## Usage
Run the following command in the software directory to open a Meet instance.
```
poetry run 01 --server livekit --meet
```
## Setup
Ensure that you're in the meet directory. Then run `pnpm install` to install all dependencies. You're now all set to get up and running!

@ -0,0 +1,81 @@
import { randomString } from '@/lib/client-utils';
import { ConnectionDetails } from '@/lib/types';
import { AccessToken, AccessTokenOptions, VideoGrant } from 'livekit-server-sdk';
import { NextRequest, NextResponse } from 'next/server';
const API_KEY = process.env.LIVEKIT_API_KEY;
const API_SECRET = process.env.LIVEKIT_API_SECRET;
const LIVEKIT_URL = process.env.LIVEKIT_URL;
export async function GET(request: NextRequest) {
try {
// Parse query parameters
const roomName = request.nextUrl.searchParams.get('roomName');
const participantName = request.nextUrl.searchParams.get('participantName');
const metadata = request.nextUrl.searchParams.get('metadata') ?? '';
const region = request.nextUrl.searchParams.get('region');
const livekitServerUrl = region ? getLiveKitURL(region) : LIVEKIT_URL;
if (livekitServerUrl === undefined) {
throw new Error('Invalid region');
}
if (typeof roomName !== 'string') {
return new NextResponse('Missing required query parameter: roomName', { status: 400 });
}
if (participantName === null) {
return new NextResponse('Missing required query parameter: participantName', { status: 400 });
}
// Generate participant token
const participantToken = await createParticipantToken(
{
identity: `${participantName}__${randomString(4)}`,
name: participantName,
metadata,
},
roomName,
);
// Return connection details
const data: ConnectionDetails = {
serverUrl: livekitServerUrl,
roomName: roomName,
participantToken: participantToken,
participantName: participantName,
};
return NextResponse.json(data);
} catch (error) {
if (error instanceof Error) {
return new NextResponse(error.message, { status: 500 });
}
}
}
function createParticipantToken(userInfo: AccessTokenOptions, roomName: string) {
const at = new AccessToken(API_KEY, API_SECRET, userInfo);
at.ttl = '5m';
const grant: VideoGrant = {
room: roomName,
roomJoin: true,
canPublish: true,
canPublishData: true,
canSubscribe: true,
};
at.addGrant(grant);
return at.toJwt();
}
/**
* Get the LiveKit server URL for the given region.
*/
function getLiveKitURL(region: string | null): string {
let targetKey = 'LIVEKIT_URL';
if (region) {
targetKey = `LIVEKIT_URL_${region}`.toUpperCase();
}
const url = process.env[targetKey];
if (!url) {
throw new Error(`${targetKey} is not defined`);
}
return url;
}

@ -0,0 +1,70 @@
import { EgressClient, EncodedFileOutput, S3Upload } from 'livekit-server-sdk';
import { NextRequest, NextResponse } from 'next/server';
export async function GET(req: NextRequest) {
try {
const roomName = req.nextUrl.searchParams.get('roomName');
/**
* CAUTION:
* for simplicity this implementation does not authenticate users and therefore allows anyone with knowledge of a roomName
* to start/stop recordings for that room.
* DO NOT USE THIS FOR PRODUCTION PURPOSES AS IS
*/
if (roomName === null) {
return new NextResponse('Missing roomName parameter', { status: 403 });
}
const {
LIVEKIT_API_KEY,
LIVEKIT_API_SECRET,
LIVEKIT_URL,
S3_KEY_ID,
S3_KEY_SECRET,
S3_BUCKET,
S3_ENDPOINT,
S3_REGION,
} = process.env;
const hostURL = new URL(LIVEKIT_URL!);
hostURL.protocol = 'https:';
const egressClient = new EgressClient(hostURL.origin, LIVEKIT_API_KEY, LIVEKIT_API_SECRET);
const existingEgresses = await egressClient.listEgress({ roomName });
if (existingEgresses.length > 0 && existingEgresses.some((e) => e.status < 2)) {
return new NextResponse('Meeting is already being recorded', { status: 409 });
}
const fileOutput = new EncodedFileOutput({
filepath: `${new Date(Date.now()).toISOString()}-${roomName}.mp4`,
output: {
case: 's3',
value: new S3Upload({
endpoint: S3_ENDPOINT,
accessKey: S3_KEY_ID,
secret: S3_KEY_SECRET,
region: S3_REGION,
bucket: S3_BUCKET,
}),
},
});
await egressClient.startRoomCompositeEgress(
roomName,
{
file: fileOutput,
},
{
layout: 'speaker',
},
);
return new NextResponse(null, { status: 200 });
} catch (error) {
if (error instanceof Error) {
return new NextResponse(error.message, { status: 500 });
}
}
}

@ -0,0 +1,39 @@
import { EgressClient } from 'livekit-server-sdk';
import { NextRequest, NextResponse } from 'next/server';
export async function GET(req: NextRequest) {
try {
const roomName = req.nextUrl.searchParams.get('roomName');
/**
* CAUTION:
* for simplicity this implementation does not authenticate users and therefore allows anyone with knowledge of a roomName
* to start/stop recordings for that room.
* DO NOT USE THIS FOR PRODUCTION PURPOSES AS IS
*/
if (roomName === null) {
return new NextResponse('Missing roomName parameter', { status: 403 });
}
const { LIVEKIT_API_KEY, LIVEKIT_API_SECRET, LIVEKIT_URL } = process.env;
const hostURL = new URL(LIVEKIT_URL!);
hostURL.protocol = 'https:';
const egressClient = new EgressClient(hostURL.origin, LIVEKIT_API_KEY, LIVEKIT_API_SECRET);
const activeEgresses = (await egressClient.listEgress({ roomName })).filter(
(info) => info.status < 2,
);
if (activeEgresses.length === 0) {
return new NextResponse('No active recording found', { status: 404 });
}
await Promise.all(activeEgresses.map((info) => egressClient.stopEgress(info.egressId)));
return new NextResponse(null, { status: 200 });
} catch (error) {
if (error instanceof Error) {
return new NextResponse(error.message, { status: 500 });
}
}
}

@ -0,0 +1,210 @@
import type {
MessageDecoder,
MessageEncoder,
TrackReferenceOrPlaceholder,
WidgetState,
} from '@livekit/components-react';
import { isTrackReference } from '@livekit/components-react';
import { log } from './logger';
import { isWeb } from './detectMobileBrowser';
import { isEqualTrackRef } from './track-reference';
import { RoomEvent, Track } from 'livekit-client';
import * as React from 'react';
import type { MessageFormatter } from '@livekit/components-react';
import {
CarouselLayout,
ConnectionStateToast,
FocusLayout,
FocusLayoutContainer,
GridLayout,
LayoutContextProvider,
ParticipantTile,
RoomAudioRenderer,
} from '@livekit/components-react';
import { useCreateLayoutContext } from '@livekit/components-react';
import { usePinnedTracks, useTracks } from '@livekit/components-react';
import { ControlBar } from '@livekit/components-react';
import { useWarnAboutMissingStyles } from './useWarnAboutMissingStyles';
import { useLocalParticipant } from '@livekit/components-react';
/**
* @public
*/
export interface VideoConferenceProps extends React.HTMLAttributes<HTMLDivElement> {
chatMessageFormatter?: MessageFormatter;
chatMessageEncoder?: MessageEncoder;
chatMessageDecoder?: MessageDecoder;
/** @alpha */
SettingsComponent?: React.ComponentType;
}
/**
* The `VideoConference` ready-made component is your drop-in solution for a classic video conferencing application.
* It provides functionality such as focusing on one participant, grid view with pagination to handle large numbers
* of participants, basic non-persistent chat, screen sharing, and more.
*
* @remarks
* The component is implemented with other LiveKit components like `FocusContextProvider`,
* `GridLayout`, `ControlBar`, `FocusLayoutContainer` and `FocusLayout`.
* You can use these components as a starting point for your own custom video conferencing application.
*
* @example
* ```tsx
* <LiveKitRoom>
* <VideoConference />
* <LiveKitRoom>
* ```
* @public
*/
export function VideoConference({
chatMessageFormatter,
chatMessageDecoder,
chatMessageEncoder,
SettingsComponent,
...props
}: VideoConferenceProps) {
const [widgetState, setWidgetState] = React.useState<WidgetState>({
showChat: false,
unreadMessages: 0,
showSettings: false,
});
const lastAutoFocusedScreenShareTrack = React.useRef<TrackReferenceOrPlaceholder | null>(null);
const tracks = useTracks(
[
{ source: Track.Source.Camera, withPlaceholder: true },
{ source: Track.Source.ScreenShare, withPlaceholder: false },
],
{ updateOnlyOn: [RoomEvent.ActiveSpeakersChanged], onlySubscribed: false },
);
const widgetUpdate = (state: WidgetState) => {
log.debug('updating widget state', state);
setWidgetState(state);
};
const layoutContext = useCreateLayoutContext();
const screenShareTracks = tracks
.filter(isTrackReference)
.filter((track) => track.publication.source === Track.Source.ScreenShare);
const focusTrack = usePinnedTracks(layoutContext)?.[0];
const carouselTracks = tracks.filter((track) => !isEqualTrackRef(track, focusTrack));
const { localParticipant } = useLocalParticipant();
const [isAlwaysListening, setIsAlwaysListening] = React.useState(false);
const toggleAlwaysListening = () => {
const newValue = !isAlwaysListening;
setIsAlwaysListening(newValue);
handleAlwaysListeningToggle(newValue);
};
const handleAlwaysListeningToggle = (newValue: boolean) => {
if (newValue) {
console.log("SETTING VIDEO CONTEXT ON")
const data = new TextEncoder().encode("{VIDEO_CONTEXT_ON}")
localParticipant.publishData(data, {reliable: true, topic: "video_context"})
} else {
console.log("SETTING VIDEO CONTEXT OFF")
const data = new TextEncoder().encode("{VIDEO_CONTEXT_OFF}")
localParticipant.publishData(data, {reliable: true, topic: "video_context"})
}
}
React.useEffect(() => {
// If screen share tracks are published, and no pin is set explicitly, auto set the screen share.
if (
screenShareTracks.some((track) => track.publication.isSubscribed) &&
lastAutoFocusedScreenShareTrack.current === null
) {
log.debug('Auto set screen share focus:', { newScreenShareTrack: screenShareTracks[0] });
layoutContext.pin.dispatch?.({ msg: 'set_pin', trackReference: screenShareTracks[0] });
lastAutoFocusedScreenShareTrack.current = screenShareTracks[0];
} else if (
lastAutoFocusedScreenShareTrack.current &&
!screenShareTracks.some(
(track) =>
track.publication.trackSid ===
lastAutoFocusedScreenShareTrack.current?.publication?.trackSid,
)
) {
log.debug('Auto clearing screen share focus.');
layoutContext.pin.dispatch?.({ msg: 'clear_pin' });
lastAutoFocusedScreenShareTrack.current = null;
}
if (focusTrack && !isTrackReference(focusTrack)) {
const updatedFocusTrack = tracks.find(
(tr) =>
tr.participant.identity === focusTrack.participant.identity &&
tr.source === focusTrack.source,
);
if (updatedFocusTrack !== focusTrack && isTrackReference(updatedFocusTrack)) {
layoutContext.pin.dispatch?.({ msg: 'set_pin', trackReference: updatedFocusTrack });
}
}
}, [
screenShareTracks
.map((ref) => `${ref.publication.trackSid}_${ref.publication.isSubscribed}`)
.join(),
focusTrack?.publication?.trackSid,
tracks,
]);
useWarnAboutMissingStyles();
return (
<div className="lk-video-conference" {...props}>
{isWeb() && (
<LayoutContextProvider
value={layoutContext}
// onPinChange={handleFocusStateChange}
onWidgetChange={widgetUpdate}
>
<div className="lk-video-conference-inner">
{!focusTrack ? (
<div className="lk-grid-layout-wrapper">
<GridLayout tracks={tracks}>
<ParticipantTile />
</GridLayout>
</div>
) : (
<div className="lk-focus-layout-wrapper">
<FocusLayoutContainer>
<CarouselLayout tracks={carouselTracks}>
<ParticipantTile />
</CarouselLayout>
{focusTrack && <FocusLayout trackRef={focusTrack} />}
</FocusLayoutContainer>
</div>
)}
<ControlBar
controls={{ settings: !!SettingsComponent }}
/>
<button
className={`lk-button ${isAlwaysListening ? 'lk-button-active' : ''}`}
onClick={toggleAlwaysListening}
>
{isAlwaysListening ? 'Stop Listening' : 'Start Listening'}
</button>
</div>
{SettingsComponent && (
<div
className="lk-settings-menu-modal"
style={{ display: widgetState.showSettings ? 'block' : 'none' }}
>
<SettingsComponent />
</div>
)}
</LayoutContextProvider>
)}
<RoomAudioRenderer />
<ConnectionStateToast />
</div>
);
}

@ -0,0 +1,20 @@
/**
* @internal
*/
export function isWeb(): boolean {
return typeof document !== 'undefined';
}
/**
* Mobile browser detection based on `navigator.userAgent` string.
* Defaults to returning `false` if not in a browser.
*
* @remarks
* This should only be used if feature detection or other methods do not work!
*
* @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Browser_detection_using_the_user_agent#mobile_device_detection
*/
export function isMobileBrowser(): boolean {
return isWeb() ? /Mobi/i.test(window.navigator.userAgent) : false;
}

@ -0,0 +1,56 @@
import {
setLogLevel as setClientSdkLogLevel,
setLogExtension as setClientSdkLogExtension,
LogLevel as LogLevelEnum,
} from 'livekit-client';
import loglevel from 'loglevel'
export const log = loglevel.getLogger('lk-components-js');
log.setDefaultLevel('WARN');
type LogLevel = Parameters<typeof setClientSdkLogLevel>[0];
type SetLogLevelOptions = {
liveKitClientLogLevel?: LogLevel;
};
/**
* Set the log level for both the `@livekit/components-react` package and the `@livekit-client` package.
* To set the `@livekit-client` log independently, use the `liveKitClientLogLevel` prop on the `options` object.
* @public
*/
export function setLogLevel(level: LogLevel, options: SetLogLevelOptions = {}): void {
log.setLevel(level);
setClientSdkLogLevel(options.liveKitClientLogLevel ?? level);
}
type LogExtension = (level: LogLevel, msg: string, context?: object) => void;
type SetLogExtensionOptions = {
liveKitClientLogExtension?: LogExtension;
};
/**
* Set the log extension for both the `@livekit/components-react` package and the `@livekit-client` package.
* To set the `@livekit-client` log extension, use the `liveKitClientLogExtension` prop on the `options` object.
* @public
*/
export function setLogExtension(extension: LogExtension, options: SetLogExtensionOptions = {}) {
const originalFactory = log.methodFactory;
log.methodFactory = (methodName, configLevel, loggerName) => {
const rawMethod = originalFactory(methodName, configLevel, loggerName);
const logLevel = LogLevelEnum[methodName];
const needLog = logLevel >= configLevel && logLevel < LogLevelEnum.silent;
return (msg, context?: [msg: string, context: object]) => {
if (context) rawMethod(msg, context);
else rawMethod(msg);
if (needLog) {
extension(logLevel, msg, context);
}
};
};
log.setLevel(log.getLevel()); // Be sure to call setLevel method in order to apply plugin
setClientSdkLogExtension(options.liveKitClientLogExtension ?? extension);
}

@ -0,0 +1,87 @@
/*
* Copyright 2020 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
import clsx from 'clsx';
/**
* Calls all functions in the order they were chained with the same arguments.
* @internal
*/
export function chain(...callbacks: any[]): (...args: any[]) => void {
return (...args: any[]) => {
for (const callback of callbacks) {
if (typeof callback === 'function') {
try {
callback(...args);
} catch (e) {
console.error(e);
}
}
}
};
}
interface Props {
[key: string]: any;
}
// taken from: https://stackoverflow.com/questions/51603250/typescript-3-parameter-list-intersection-type/51604379#51604379
type TupleTypes<T> = { [P in keyof T]: T[P] } extends { [key: number]: infer V } ? V : never;
type UnionToIntersection<U> = (U extends any ? (k: U) => void : never) extends (k: infer I) => void
? I
: never;
/**
* Merges multiple props objects together. Event handlers are chained,
* classNames are combined, and ids are deduplicated - different ids
* will trigger a side-effect and re-render components hooked up with `useId`.
* For all other props, the last prop object overrides all previous ones.
* @param args - Multiple sets of props to merge together.
* @internal
*/
export function mergeProps<T extends Props[]>(...args: T): UnionToIntersection<TupleTypes<T>> {
// Start with a base clone of the first argument. This is a lot faster than starting
// with an empty object and adding properties as we go.
const result: Props = { ...args[0] };
for (let i = 1; i < args.length; i++) {
const props = args[i];
for (const key in props) {
const a = result[key];
const b = props[key];
// Chain events
if (
typeof a === 'function' &&
typeof b === 'function' &&
// This is a lot faster than a regex.
key[0] === 'o' &&
key[1] === 'n' &&
key.charCodeAt(2) >= /* 'A' */ 65 &&
key.charCodeAt(2) <= /* 'Z' */ 90
) {
result[key] = chain(a, b);
// Merge classnames, sometimes classNames are empty string which eval to false, so we just need to do a type check
} else if (
(key === 'className' || key === 'UNSAFE_className') &&
typeof a === 'string' &&
typeof b === 'string'
) {
result[key] = clsx(a, b);
} else {
result[key] = b !== undefined ? b : a;
}
}
}
return result as UnionToIntersection<TupleTypes<T>>;
}

@ -0,0 +1,73 @@
/**
* The TrackReference type is a logical grouping of participant publication and/or subscribed track.
*
*/
import type { Participant, Track, TrackPublication } from 'livekit-client';
// ## TrackReference Types
/** @public */
export type TrackReferencePlaceholder = {
participant: Participant;
publication?: never;
source: Track.Source;
};
/** @public */
export type TrackReference = {
participant: Participant;
publication: TrackPublication;
source: Track.Source;
};
/** @public */
export type TrackReferenceOrPlaceholder = TrackReference | TrackReferencePlaceholder;
// ### TrackReference Type Predicates
/** @internal */
export function isTrackReference(trackReference: unknown): trackReference is TrackReference {
if (typeof trackReference === 'undefined') {
return false;
}
return (
isTrackReferenceSubscribed(trackReference as TrackReference) ||
isTrackReferencePublished(trackReference as TrackReference)
);
}
function isTrackReferenceSubscribed(trackReference?: TrackReferenceOrPlaceholder): boolean {
if (!trackReference) {
return false;
}
return (
trackReference.hasOwnProperty('participant') &&
trackReference.hasOwnProperty('source') &&
trackReference.hasOwnProperty('track') &&
typeof trackReference.publication?.track !== 'undefined'
);
}
function isTrackReferencePublished(trackReference?: TrackReferenceOrPlaceholder): boolean {
if (!trackReference) {
return false;
}
return (
trackReference.hasOwnProperty('participant') &&
trackReference.hasOwnProperty('source') &&
trackReference.hasOwnProperty('publication') &&
typeof trackReference.publication !== 'undefined'
);
}
export function isTrackReferencePlaceholder(
trackReference?: TrackReferenceOrPlaceholder,
): trackReference is TrackReferencePlaceholder {
if (!trackReference) {
return false;
}
return (
trackReference.hasOwnProperty('participant') &&
trackReference.hasOwnProperty('source') &&
typeof trackReference.publication === 'undefined'
);
}

@ -0,0 +1,97 @@
import type { Track } from 'livekit-client';
import type { PinState } from './types';
import type { TrackReferenceOrPlaceholder } from './track-reference-types';
import { isTrackReference, isTrackReferencePlaceholder } from './track-reference-types';
/**
* Returns an id to identify the `TrackReference` or `TrackReferencePlaceholder` based on
* participant, track source and trackSid.
* @remarks
* The id pattern is: `${participantIdentity}_${trackSource}_${trackSid}` for `TrackReference`
* and `${participantIdentity}_${trackSource}_placeholder` for `TrackReferencePlaceholder`.
*/
export function getTrackReferenceId(trackReference: TrackReferenceOrPlaceholder | number) {
if (typeof trackReference === 'string' || typeof trackReference === 'number') {
return `${trackReference}`;
} else if (isTrackReferencePlaceholder(trackReference)) {
return `${trackReference.participant.identity}_${trackReference.source}_placeholder`;
} else if (isTrackReference(trackReference)) {
return `${trackReference.participant.identity}_${trackReference.publication.source}_${trackReference.publication.trackSid}`;
} else {
throw new Error(`Can't generate an id for the given track reference: ${trackReference}`);
}
}
export type TrackReferenceId = ReturnType<typeof getTrackReferenceId>;
/** Returns the Source of the TrackReference. */
export function getTrackReferenceSource(trackReference: TrackReferenceOrPlaceholder): Track.Source {
if (isTrackReference(trackReference)) {
return trackReference.publication.source;
} else {
return trackReference.source;
}
}
export function isEqualTrackRef(
a?: TrackReferenceOrPlaceholder,
b?: TrackReferenceOrPlaceholder,
): boolean {
if (a === undefined || b === undefined) {
return false;
}
if (isTrackReference(a) && isTrackReference(b)) {
return a.publication.trackSid === b.publication.trackSid;
} else {
return getTrackReferenceId(a) === getTrackReferenceId(b);
}
}
/**
* Check if the `TrackReference` is pinned.
*/
export function isTrackReferencePinned(
trackReference: TrackReferenceOrPlaceholder,
pinState: PinState | undefined,
): boolean {
if (typeof pinState === 'undefined') {
return false;
}
if (isTrackReference(trackReference)) {
return pinState.some(
(pinnedTrackReference) =>
pinnedTrackReference.participant.identity === trackReference.participant.identity &&
isTrackReference(pinnedTrackReference) &&
pinnedTrackReference.publication.trackSid === trackReference.publication.trackSid,
);
} else if (isTrackReferencePlaceholder(trackReference)) {
return pinState.some(
(pinnedTrackReference) =>
pinnedTrackReference.participant.identity === trackReference.participant.identity &&
isTrackReferencePlaceholder(pinnedTrackReference) &&
pinnedTrackReference.source === trackReference.source,
);
} else {
return false;
}
}
/**
* Check whether `currentTrackRef` is the placeholder that `nextTrackRef` replaces,
* based on the participant identity and the track source.
* @internal
*/
export function isPlaceholderReplacement(
currentTrackRef: TrackReferenceOrPlaceholder,
nextTrackRef: TrackReferenceOrPlaceholder,
) {
// if (typeof nextTrackRef === 'number' || typeof currentTrackRef === 'number') {
// return false;
// }
return (
isTrackReferencePlaceholder(currentTrackRef) &&
isTrackReference(nextTrackRef) &&
nextTrackRef.participant.identity === currentTrackRef.participant.identity &&
nextTrackRef.source === currentTrackRef.source
);
}

@ -0,0 +1,90 @@
import type { Participant, ParticipantKind, Track, TrackPublication } from 'livekit-client';
import type { TrackReference, TrackReferenceOrPlaceholder } from './track-reference-types';
// ## PinState Type
/** @public */
export type PinState = TrackReferenceOrPlaceholder[];
export const PIN_DEFAULT_STATE: PinState = [];
// ## WidgetState Types
/** @public */
export type WidgetState = {
showChat: boolean;
unreadMessages: number;
showSettings?: boolean;
};
export const WIDGET_DEFAULT_STATE: WidgetState = {
showChat: false,
unreadMessages: 0,
showSettings: false,
};
// ## Track Source Types
export type TrackSourceWithOptions = { source: Track.Source; withPlaceholder: boolean };
export type SourcesArray = Track.Source[] | TrackSourceWithOptions[];
// ### Track Source Type Predicates
export function isSourceWitOptions(source: SourcesArray[number]): source is TrackSourceWithOptions {
return typeof source === 'object';
}
export function isSourcesWithOptions(sources: SourcesArray): sources is TrackSourceWithOptions[] {
return (
Array.isArray(sources) &&
(sources as TrackSourceWithOptions[]).filter(isSourceWitOptions).length > 0
);
}
// ## Loop Filter Types
export type TrackReferenceFilter = Parameters<TrackReferenceOrPlaceholder[]['filter']>['0'];
export type ParticipantFilter = Parameters<Participant[]['filter']>['0'];
// ## Other Types
/** @internal */
export interface ParticipantClickEvent {
participant: Participant;
track?: TrackPublication;
}
export type TrackSource<T extends Track.Source> = RequireAtLeastOne<
{ source: T; name: string; participant: Participant },
'name' | 'source'
>;
export type ParticipantTrackIdentifier = RequireAtLeastOne<
{ sources: Track.Source[]; name: string; kind: Track.Kind },
'sources' | 'name' | 'kind'
>;
/**
* @beta
*/
export type ParticipantIdentifier = RequireAtLeastOne<
{ kind: ParticipantKind; identity: string },
'identity' | 'kind'
>;
/**
* The TrackIdentifier type is used to select Tracks either based on
* - Track.Source and/or name of the track, e.g. `{source: Track.Source.Camera}` or `{name: "my-track"}`
* - TrackReference (participant and publication)
* @internal
*/
export type TrackIdentifier<T extends Track.Source = Track.Source> =
| TrackSource<T>
| TrackReference;
// ## Util Types
type RequireAtLeastOne<T, Keys extends keyof T = keyof T> = Pick<T, Exclude<keyof T, Keys>> &
{
[K in Keys]-?: Required<Pick<T, K>> & Partial<Pick<T, Exclude<Keys, K>>>;
}[Keys];
export type RequireOnlyOne<T, Keys extends keyof T = keyof T> = Pick<T, Exclude<keyof T, Keys>> &
{
[K in Keys]-?: Required<Pick<T, K>> & Partial<Record<Exclude<Keys, K>, undefined>>;
}[Keys];
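// Sketch of how these util types behave (illustrative only): with `ParticipantIdentifier`,
// `{ identity: 'alice' }`, `{ kind }`, and `{ identity: 'alice', kind }` all type-check
// under RequireAtLeastOne, while `{}` does not; RequireOnlyOne additionally rejects
// values that set more than one of the listed keys.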
export type AudioSource = Track.Source.Microphone | Track.Source.ScreenShareAudio;
export type VideoSource = Track.Source.Camera | Track.Source.ScreenShare;

@ -0,0 +1,11 @@
import * as React from 'react';
import { warnAboutMissingStyles } from './utils';
/**
* @internal
*/
export function useWarnAboutMissingStyles() {
React.useEffect(() => {
warnAboutMissingStyles();
}, []);
}

@ -0,0 +1,62 @@
import * as React from 'react';
import { mergeProps as mergePropsReactAria } from './mergeProps';
import { log } from './logger';
import clsx from 'clsx';
/** @internal */
export function isProp<U extends HTMLElement, T extends React.HTMLAttributes<U>>(
prop: T | undefined,
): prop is T {
return prop !== undefined;
}
/** @internal */
export function mergeProps<
U extends HTMLElement,
T extends Array<React.HTMLAttributes<U> | undefined>,
>(...props: T) {
return mergePropsReactAria(...props.filter(isProp));
}
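// Usage sketch (assuming the wrapped mergeProps follows the react-aria convention of
// chaining event handlers and concatenating class names):
// mergeProps({ className: 'lk-button', onClick: onA }, undefined, { className: 'active', onClick: onB })
// // -> { className: 'lk-button active', onClick: <calls onA, then onB> }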
/** @internal */
export function cloneSingleChild(
children: React.ReactNode | React.ReactNode[],
props?: Record<string, any>,
key?: any,
) {
return React.Children.map(children, (child) => {
// Checking isValidElement is the safe way and avoids a typescript
// error too.
if (React.isValidElement(child) && React.Children.only(children)) {
if (child.props.class) {
// make sure we retain classnames of both passed props and child
props ??= {};
props.class = clsx(child.props.class, props.class);
props.style = { ...child.props.style, ...props.style };
}
return React.cloneElement(child, { ...props, key });
}
return child;
});
}
/**
* @internal
*/
export function warnAboutMissingStyles(el?: HTMLElement) {
if (
typeof window !== 'undefined' &&
typeof process !== 'undefined' &&
// eslint-disable-next-line turbo/no-undeclared-env-vars
(process?.env?.NODE_ENV === 'dev' ||
// eslint-disable-next-line turbo/no-undeclared-env-vars
process?.env?.NODE_ENV === 'development')
) {
const target = el ?? document.querySelector('.lk-room-container');
if (target && !getComputedStyle(target).getPropertyValue('--lk-has-imported-styles')) {
log.warn(
"It looks like you're not using the `@livekit/components-styles package`. To render the UI with the default styling, please import it in your layout or page.",
);
}
}
}

@ -0,0 +1,79 @@
'use client';
import { formatChatMessageLinks, LiveKitRoom } from '@livekit/components-react';
import {
ExternalE2EEKeyProvider,
LogLevel,
Room,
RoomConnectOptions,
RoomOptions,
VideoPresets,
type VideoCodec,
} from 'livekit-client';
import { DebugMode } from '@/lib/Debug';
import { useMemo } from 'react';
import { decodePassphrase } from '@/lib/client-utils';
import { SettingsMenu } from '@/lib/SettingsMenu';
import { VideoConference } from '../components/VideoConference';
export function VideoConferenceClientImpl(props: {
liveKitUrl: string;
token: string;
codec: VideoCodec | undefined;
}) {
const worker =
typeof window !== 'undefined' &&
new Worker(new URL('livekit-client/e2ee-worker', import.meta.url));
const keyProvider = new ExternalE2EEKeyProvider();
const e2eePassphrase =
typeof window !== 'undefined' ? decodePassphrase(window.location.hash.substring(1)) : undefined;
const e2eeEnabled = !!(e2eePassphrase && worker);
const roomOptions = useMemo((): RoomOptions => {
return {
publishDefaults: {
videoSimulcastLayers: [VideoPresets.h540, VideoPresets.h216],
red: !e2eeEnabled,
videoCodec: props.codec,
},
adaptiveStream: { pixelDensity: 'screen' },
dynacast: true,
e2ee: e2eeEnabled
? {
keyProvider,
worker,
}
: undefined,
};
}, []);
const room = useMemo(() => new Room(roomOptions), []);
if (e2eeEnabled) {
keyProvider.setKey(e2eePassphrase);
room.setE2EEEnabled(true);
}
const connectOptions = useMemo((): RoomConnectOptions => {
return {
autoSubscribe: true,
};
}, []);
return (
<LiveKitRoom
room={room}
token={props.token}
connectOptions={connectOptions}
serverUrl={props.liveKitUrl}
audio={true}
video={true}
>
<VideoConference
chatMessageFormatter={formatChatMessageLinks}
SettingsComponent={
process.env.NEXT_PUBLIC_SHOW_SETTINGS_MENU === 'true' ? SettingsMenu : undefined
}
/>
<DebugMode logLevel={LogLevel.debug} />
</LiveKitRoom>
);
}

@ -0,0 +1,28 @@
import { videoCodecs } from 'livekit-client';
import { VideoConferenceClientImpl } from './VideoConferenceClientImpl';
import { isVideoCodec } from '@/lib/types';
export default function CustomRoomConnection(props: {
searchParams: {
liveKitUrl?: string;
token?: string;
codec?: string;
};
}) {
const { liveKitUrl, token, codec } = props.searchParams;
if (typeof liveKitUrl !== 'string') {
return <h2>Missing LiveKit URL</h2>;
}
if (typeof token !== 'string') {
return <h2>Missing LiveKit token</h2>;
}
if (codec !== undefined && !isVideoCodec(codec)) {
return <h2>Invalid codec; if set, it must be one of [{videoCodecs.join(', ')}].</h2>;
}
return (
<main data-lk-theme="default" style={{ height: '100%' }}>
<VideoConferenceClientImpl liveKitUrl={liveKitUrl} token={token} codec={codec} />
</main>
);
}
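// Example request shape (hypothetical values):
//   /custom/?liveKitUrl=wss://example.livekit.cloud&token=<participant-jwt>&codec=vp9
// An optional E2EE passphrase may be appended as the URL hash, which
// VideoConferenceClientImpl decodes on the client.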

@ -0,0 +1,56 @@
import '../styles/globals.css';
import '@livekit/components-styles';
import '@livekit/components-styles/prefabs';
import type { Metadata, Viewport } from 'next';
export const metadata: Metadata = {
title: {
default: 'LiveKit Meet | Conference app built with LiveKit open source',
template: '%s',
},
description:
'LiveKit is an open source WebRTC project that gives you everything needed to build scalable and real-time audio and/or video experiences in your applications.',
twitter: {
creator: '@livekitted',
site: '@livekitted',
card: 'summary_large_image',
},
openGraph: {
url: 'https://meet.livekit.io',
images: [
{
url: 'https://meet.livekit.io/images/livekit-meet-open-graph.png',
width: 2000,
height: 1000,
type: 'image/png',
},
],
siteName: 'LiveKit Meet',
},
icons: {
icon: {
rel: 'icon',
url: '/favicon.ico',
},
apple: [
{
rel: 'apple-touch-icon',
url: '/images/livekit-apple-touch.png',
sizes: '180x180',
},
{ rel: 'mask-icon', url: '/images/livekit-safari-pinned-tab.svg', color: '#070707' },
],
},
};
export const viewport: Viewport = {
themeColor: '#070707',
};
export default function RootLayout({ children }: { children: React.ReactNode }) {
return (
<html lang="en">
<body>{children}</body>
</html>
);
}

@ -0,0 +1,201 @@
'use client';
import { useRouter, useSearchParams } from 'next/navigation';
import React, { Suspense, useState } from 'react';
import { encodePassphrase, generateRoomId, randomString } from '@/lib/client-utils';
import styles from '../styles/Home.module.css';
function Tabs(props: React.PropsWithChildren<{}>) {
const searchParams = useSearchParams();
const tabIndex = searchParams?.get('tab') === 'custom' ? 1 : 0;
const router = useRouter();
function onTabSelected(index: number) {
const tab = index === 1 ? 'custom' : 'demo';
router.push(`/?tab=${tab}`);
}
let tabs = React.Children.map(props.children, (child, index) => {
return (
<button
className="lk-button"
onClick={() => {
if (onTabSelected) {
onTabSelected(index);
}
}}
aria-pressed={tabIndex === index}
>
{/* @ts-ignore */}
{child?.props.label}
</button>
);
});
return (
<div className={styles.tabContainer}>
<div className={styles.tabSelect}>{tabs}</div>
{/* @ts-ignore */}
{props.children[tabIndex]}
</div>
);
}
function DemoMeetingTab(props: { label: string }) {
const router = useRouter();
const [e2ee, setE2ee] = useState(false);
const [sharedPassphrase, setSharedPassphrase] = useState(randomString(64));
const startMeeting = () => {
if (e2ee) {
router.push(`/rooms/${generateRoomId()}#${encodePassphrase(sharedPassphrase)}`);
} else {
router.push(`/rooms/${generateRoomId()}`);
}
};
return (
<div className={styles.tabContent}>
<p style={{ margin: 0 }}>Try LiveKit Meet for free with our live demo project.</p>
<button style={{ marginTop: '1rem' }} className="lk-button" onClick={startMeeting}>
Start Meeting
</button>
<div style={{ display: 'flex', flexDirection: 'column', gap: '1rem' }}>
<div style={{ display: 'flex', flexDirection: 'row', gap: '1rem' }}>
<input
id="use-e2ee"
type="checkbox"
checked={e2ee}
onChange={(ev) => setE2ee(ev.target.checked)}
></input>
<label htmlFor="use-e2ee">Enable end-to-end encryption</label>
</div>
{e2ee && (
<div style={{ display: 'flex', flexDirection: 'row', gap: '1rem' }}>
<label htmlFor="passphrase">Passphrase</label>
<input
id="passphrase"
type="password"
value={sharedPassphrase}
onChange={(ev) => setSharedPassphrase(ev.target.value)}
/>
</div>
)}
</div>
</div>
);
}
function CustomConnectionTab(props: { label: string }) {
const router = useRouter();
const [e2ee, setE2ee] = useState(false);
const [sharedPassphrase, setSharedPassphrase] = useState(randomString(64));
const onSubmit: React.FormEventHandler<HTMLFormElement> = (event) => {
event.preventDefault();
const formData = new FormData(event.target as HTMLFormElement);
const serverUrl = formData.get('serverUrl');
const token = formData.get('token');
if (e2ee) {
router.push(
`/custom/?liveKitUrl=${serverUrl}&token=${token}#${encodePassphrase(sharedPassphrase)}`,
);
} else {
router.push(`/custom/?liveKitUrl=${serverUrl}&token=${token}`);
}
};
return (
<form className={styles.tabContent} onSubmit={onSubmit}>
<p style={{ marginTop: 0 }}>
Connect LiveKit Meet with a custom server using LiveKit Cloud or LiveKit Server.
</p>
<input
id="serverUrl"
name="serverUrl"
type="url"
placeholder="LiveKit Server URL: wss://*.livekit.cloud"
required
/>
<textarea
id="token"
name="token"
placeholder="Token"
required
rows={5}
style={{ padding: '1px 2px', fontSize: 'inherit', lineHeight: 'inherit' }}
/>
<div style={{ display: 'flex', flexDirection: 'column', gap: '1rem' }}>
<div style={{ display: 'flex', flexDirection: 'row', gap: '1rem' }}>
<input
id="use-e2ee"
type="checkbox"
checked={e2ee}
onChange={(ev) => setE2ee(ev.target.checked)}
></input>
<label htmlFor="use-e2ee">Enable end-to-end encryption</label>
</div>
{e2ee && (
<div style={{ display: 'flex', flexDirection: 'row', gap: '1rem' }}>
<label htmlFor="passphrase">Passphrase</label>
<input
id="passphrase"
type="password"
value={sharedPassphrase}
onChange={(ev) => setSharedPassphrase(ev.target.value)}
/>
</div>
)}
</div>
<hr
style={{ width: '100%', borderColor: 'rgba(255, 255, 255, 0.15)', marginBlock: '1rem' }}
/>
<button
style={{ paddingInline: '1.25rem', width: '100%' }}
className="lk-button"
type="submit"
>
Connect
</button>
</form>
);
}
export default function Page() {
return (
<>
<main className={styles.main} data-lk-theme="default">
<div className="header">
<img src="/images/livekit-meet-home.svg" alt="LiveKit Meet" width="360" height="45" />
<h2>
Open source video conferencing app built on{' '}
<a href="https://github.com/livekit/components-js?ref=meet" rel="noopener">
LiveKit&nbsp;Components
</a>
,{' '}
<a href="https://livekit.io/cloud?ref=meet" rel="noopener">
LiveKit&nbsp;Cloud
</a>{' '}
and Next.js.
</h2>
</div>
<Suspense fallback="Loading">
<Tabs>
<DemoMeetingTab label="Demo" />
<CustomConnectionTab label="Custom" />
</Tabs>
</Suspense>
</main>
<footer data-lk-theme="default">
Hosted on{' '}
<a href="https://livekit.io/cloud?ref=meet" rel="noopener">
LiveKit Cloud
</a>
. Source code on{' '}
<a href="https://github.com/livekit/meet?ref=meet" rel="noopener">
GitHub
</a>
.
</footer>
</>
);
}

@ -0,0 +1,203 @@
'use client';
import { decodePassphrase } from '@/lib/client-utils';
import { DebugMode } from '@/lib/Debug';
import { RecordingIndicator } from '@/lib/RecordingIndicator';
import { SettingsMenu } from '@/lib/SettingsMenu';
import { ConnectionDetails } from '@/lib/types';
import {
formatChatMessageLinks,
LiveKitRoom,
LocalUserChoices,
PreJoin,
VideoConference,
} from '@livekit/components-react';
import {
ExternalE2EEKeyProvider,
RoomOptions,
VideoCodec,
VideoPresets,
Room,
DeviceUnsupportedError,
RoomConnectOptions,
} from 'livekit-client';
import { useRouter } from 'next/navigation';
import React from 'react';
const CONN_DETAILS_ENDPOINT =
process.env.NEXT_PUBLIC_CONN_DETAILS_ENDPOINT ?? '/api/connection-details';
const SHOW_SETTINGS_MENU = process.env.NEXT_PUBLIC_SHOW_SETTINGS_MENU === 'true';
export function PageClientImpl(props: {
roomName: string;
region?: string;
hq: boolean;
codec: VideoCodec;
}) {
const [preJoinChoices, setPreJoinChoices] = React.useState<LocalUserChoices | undefined>(
undefined,
);
const preJoinDefaults = React.useMemo(() => {
return {
username: '',
videoEnabled: true,
audioEnabled: true,
};
}, []);
const [connectionDetails, setConnectionDetails] = React.useState<ConnectionDetails | undefined>(
undefined,
);
const handlePreJoinSubmit = React.useCallback(async (values: LocalUserChoices) => {
setPreJoinChoices(values);
const url = new URL(CONN_DETAILS_ENDPOINT, window.location.origin);
url.searchParams.append('roomName', props.roomName);
url.searchParams.append('participantName', values.username);
if (props.region) {
url.searchParams.append('region', props.region);
}
const connectionDetailsResp = await fetch(url.toString());
const connectionDetailsData = await connectionDetailsResp.json();
setConnectionDetails(connectionDetailsData);
}, []);
const handlePreJoinError = React.useCallback((e: any) => console.error(e), []);
return (
<main data-lk-theme="default" style={{ height: '100%' }}>
{connectionDetails === undefined || preJoinChoices === undefined ? (
<div style={{ display: 'grid', placeItems: 'center', height: '100%' }}>
<PreJoin
defaults={preJoinDefaults}
onSubmit={handlePreJoinSubmit}
onError={handlePreJoinError}
/>
</div>
) : (
<VideoConferenceComponent
connectionDetails={connectionDetails}
userChoices={preJoinChoices}
options={{ codec: props.codec, hq: props.hq }}
/>
)}
</main>
);
}
function VideoConferenceComponent(props: {
userChoices: LocalUserChoices;
connectionDetails: ConnectionDetails;
options: {
hq: boolean;
codec: VideoCodec;
};
}) {
const e2eePassphrase =
typeof window !== 'undefined' && decodePassphrase(location.hash.substring(1));
const worker =
typeof window !== 'undefined' &&
e2eePassphrase &&
new Worker(new URL('livekit-client/e2ee-worker', import.meta.url));
const e2eeEnabled = !!(e2eePassphrase && worker);
const keyProvider = new ExternalE2EEKeyProvider();
const [e2eeSetupComplete, setE2eeSetupComplete] = React.useState(false);
const roomOptions = React.useMemo((): RoomOptions => {
let videoCodec: VideoCodec | undefined = props.options.codec ? props.options.codec : 'vp9';
if (e2eeEnabled && (videoCodec === 'av1' || videoCodec === 'vp9')) {
videoCodec = undefined;
}
return {
videoCaptureDefaults: {
deviceId: props.userChoices.videoDeviceId ?? undefined,
resolution: props.options.hq ? VideoPresets.h2160 : VideoPresets.h720,
},
publishDefaults: {
dtx: false,
videoSimulcastLayers: props.options.hq
? [VideoPresets.h1080, VideoPresets.h720]
: [VideoPresets.h540, VideoPresets.h216],
red: !e2eeEnabled,
videoCodec,
},
audioCaptureDefaults: {
deviceId: props.userChoices.audioDeviceId ?? undefined,
},
adaptiveStream: { pixelDensity: 'screen' },
dynacast: true,
e2ee: e2eeEnabled
? {
keyProvider,
worker,
}
: undefined,
};
}, [props.userChoices, props.options.hq, props.options.codec]);
const room = React.useMemo(() => new Room(roomOptions), []);
React.useEffect(() => {
if (e2eeEnabled) {
keyProvider
.setKey(decodePassphrase(e2eePassphrase))
.then(() => {
room.setE2EEEnabled(true).catch((e) => {
if (e instanceof DeviceUnsupportedError) {
alert(
`You're trying to join an encrypted meeting, but your browser does not support it. Please update it to the latest version and try again.`,
);
console.error(e);
} else {
throw e;
}
});
})
.then(() => setE2eeSetupComplete(true));
} else {
setE2eeSetupComplete(true);
}
}, [e2eeEnabled, room, e2eePassphrase]);
const connectOptions = React.useMemo((): RoomConnectOptions => {
return {
autoSubscribe: true,
};
}, []);
const router = useRouter();
const handleOnLeave = React.useCallback(() => router.push('/'), [router]);
const handleError = React.useCallback((error: Error) => {
console.error(error);
alert(`Encountered an unexpected error, check the console logs for details: ${error.message}`);
}, []);
const handleEncryptionError = React.useCallback((error: Error) => {
console.error(error);
alert(
`Encountered an unexpected encryption error, check the console logs for details: ${error.message}`,
);
}, []);
return (
<>
<LiveKitRoom
connect={e2eeSetupComplete}
room={room}
token={props.connectionDetails.participantToken}
serverUrl={props.connectionDetails.serverUrl}
connectOptions={connectOptions}
video={props.userChoices.videoEnabled}
audio={props.userChoices.audioEnabled}
onDisconnected={handleOnLeave}
onEncryptionError={handleEncryptionError}
onError={handleError}
>
<VideoConference
chatMessageFormatter={formatChatMessageLinks}
SettingsComponent={SHOW_SETTINGS_MENU ? SettingsMenu : undefined}
/>
<DebugMode />
<RecordingIndicator />
</LiveKitRoom>
</>
);
}

@ -0,0 +1,26 @@
import * as React from 'react';
import { PageClientImpl } from './PageClientImpl';
import { isVideoCodec } from '@/lib/types';
export default function Page({
params,
searchParams,
}: {
params: { roomName: string };
searchParams: {
// FIXME: We should not allow values for regions if in playground mode.
region?: string;
hq?: string;
codec?: string;
};
}) {
const codec =
typeof searchParams.codec === 'string' && isVideoCodec(searchParams.codec)
? searchParams.codec
: 'vp9';
const hq = searchParams.hq === 'true';
return (
<PageClientImpl roomName={params.roomName} region={searchParams.region} hq={hq} codec={codec} />
);
}

@ -0,0 +1,5 @@
/// <reference types="next" />
/// <reference types="next/image-types/global" />
// NOTE: This file should not be edited
// see https://nextjs.org/docs/app/building-your-application/configuring/typescript for more information.

@ -0,0 +1,16 @@
/** @type {import('next').NextConfig} */
const nextConfig = {
reactStrictMode: false,
productionBrowserSourceMaps: true,
webpack: (config, { buildId, dev, isServer, defaultLoaders, nextRuntime, webpack }) => {
// Important: return the modified config
config.module.rules.push({
test: /\.mjs$/,
enforce: 'pre',
use: ['source-map-loader'],
});
return config;
},
};
module.exports = nextConfig;

@ -0,0 +1,37 @@
{
"name": "livekit-meet",
"version": "0.2.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@datadog/browser-logs": "^5.23.3",
"@livekit/components-react": "2.6.9",
"@livekit/components-styles": "1.1.4",
"@livekit/krisp-noise-filter": "0.2.13",
"clsx": "^2.1.1",
"livekit-client": "2.7.3",
"livekit-server-sdk": "2.9.3",
"loglevel": "^1.9.2",
"next": "14.2.18",
"react": "18.3.1",
"react-dom": "18.3.1",
"tinykeys": "^3.0.0"
},
"devDependencies": {
"@types/node": "22.9.0",
"@types/react": "18.3.12",
"@types/react-dom": "18.3.1",
"eslint": "9.14.0",
"eslint-config-next": "14.2.18",
"source-map-loader": "^5.0.0",
"typescript": "5.6.3"
},
"engines": {
"node": ">=18"
}
}

File diff suppressed because it is too large

Binary file not shown.


Binary file not shown.


@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="961" height="121" fill="none"><path fill="#fff" d="M20.2 0H0v118.1h73.4V101H20.2V0ZM106.9 53.8H87.2V118H107V53.8ZM164.6 115.8l-25-81.5H120l26.2 83.8H183l26.2-83.8h-19.8l-24.8 81.5ZM257.8 32.5c-25.4 0-41.6 18-41.6 43.7 0 25.5 15.7 43.8 41.6 43.8 19.8 0 34-8.7 39.4-26.6h-20c-3 8.1-8.4 13-19.2 13-12 0-20.3-8.3-21.9-24.5h62.5c.3-2 .5-4.1.5-6.2 0-26.1-16.3-43.2-41.3-43.2Zm-21.5 35.9c2-15 10-22.2 21.5-22.2 12.1 0 20.3 8.8 21.2 22.2h-42.7ZM413.8 0h-25.5l-49.2 54V0h-20.3v118.1h20.3V58.4l54.3 59.7h25.9L362.5 56l51.3-56ZM447.7 34.3H428v64.4h19.7V34.3ZM87.2 34.3H67.6v19.5h19.6V34.3ZM467.3 98.7h-19.6v19.4h19.6V98.7ZM525.9 98.7h-19.6v19.4h19.6V98.7ZM525.9 53.8V34.3h-19.6V0h-19.7v34.3H467v19.5h19.6v44.9h19.7v-45h19.6Z"/><path fill="#FF6352" d="M589.8 119V.4h-10.7V119h10.7Zm53.9 0L602.3.4H591L632.4 119h11.3Zm12.3 0L697.3.4h-11.2L644.7 119H656Zm53.2 0V.4h-10.6V119h10.6Zm99.4-42.9c0-25.6-16.4-41.8-38.4-41.8-23 0-38.7 17.5-38.7 43.2 0 25.9 15.6 43.2 39.2 43.2 18 0 31.3-8.4 36.2-26h-10.6c-3.6 11.4-11.7 18.2-25.6 18.2-16.4 0-27.8-11.8-28.7-32.7h66.3c.1-1.8.3-2.7.3-4.1Zm-38.4-34c16 0 26.8 12.8 27.8 30.1H742c1.7-18.9 12.4-30.1 28.1-30.1Zm130.4 34c0-25.6-16.4-41.8-38.4-41.8-23 0-38.7 17.5-38.7 43.2 0 25.9 15.6 43.2 39.2 43.2 18 0 31.3-8.4 36.2-26h-10.6c-3.6 11.4-11.7 18.2-25.6 18.2-16.4 0-27.8-11.8-28.7-32.7h66.3c.1-1.8.3-2.7.3-4.1Zm-38.4-34c16 0 26.9 12.8 27.8 30.1H834c1.8-18.9 12.4-30.1 28.1-30.1Zm88.3 69c-8.7 0-13.5-3.5-13.5-13.2V44h22.9v-8h-23V16.4h-10.2V36H908v8h18.6v53.9c0 14.4 9.3 21.1 22.9 21.1H960v-8h-9.5Z"/></svg>


Binary file not shown.


@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="512" height="512" fill="none"><path fill="#000" fill-rule="evenodd" d="M0 0h512v512H0zm288 224h-64v64h64v64H160V96H96v320h192v-64h64v64h64v-64h-64v-64h-64v-64h64v-64h64V96h-64v64h-64z" clip-rule="evenodd"/></svg>


@ -0,0 +1,17 @@
{
"extends": ["config:base"],
"packageRules": [
{
"schedule": "before 6am on the first day of the month",
"matchDepTypes": ["devDependencies"],
"matchUpdateTypes": ["patch", "minor"],
"groupName": "devDependencies (non-major)"
},
{
"matchSourceUrlPrefixes": ["https://github.com/livekit/"],
"rangeStrategy": "replace",
"groupName": "LiveKit dependencies (non-major)",
"automerge": true
}
]
}

@ -0,0 +1,16 @@
.overlay {
position: absolute;
top: 0;
background: rgba(0, 0, 0, 0.6);
padding: 1rem;
max-height: min(100%, 100vh);
overflow-y: auto;
}
.detailsSection {
padding-left: 1rem;
}
.detailsSection > div {
padding-left: 1rem;
}

@ -0,0 +1,40 @@
.main {
position: relative;
display: grid;
gap: 1rem;
justify-content: center;
place-content: center;
justify-items: center;
overflow: auto;
flex-grow: 1;
}
.tabContainer {
width: 100%;
max-width: 500px;
padding-inline: 2rem;
}
.tabSelect {
display: flex;
justify-content: stretch;
gap: 0.125rem;
padding: 0.125rem;
margin: 0 auto 1.5rem;
border: 1px solid rgba(255, 255, 255, 0.15);
border-radius: 0.5rem;
}
.tabSelect > * {
width: 100%;
}
.tabContent {
display: flex;
justify-content: center;
flex-direction: column;
gap: 0.75rem;
padding: 1.5rem;
border: 1px solid rgba(255, 255, 255, 0.15);
border-radius: 0.5rem;
}

@ -0,0 +1,23 @@
.settingsCloseButton {
position: absolute;
right: var(--lk-grid-gap);
bottom: var(--lk-grid-gap);
}
.tabs {
position: relative;
display: flex;
align-content: space-between;
}
.tabs > .tab {
padding: 0.5rem;
border-radius: 0;
padding-bottom: 0.5rem;
border-bottom: 3px solid;
border-color: var(--bg5);
}
.tabs > .tab[aria-pressed='true'] {
border-color: var(--lk-accent-bg);
}

@ -0,0 +1,67 @@
* {
box-sizing: border-box;
}
html {
color-scheme: dark;
background-color: #111;
}
html,
body {
overflow: hidden;
width: 100%;
height: 100%;
margin: 0px;
}
body {
display: flex;
flex-direction: column;
}
.header {
max-width: 500px;
padding-inline: 2rem;
}
.header > img {
display: block;
margin: auto;
max-width: 100%;
}
.header > h2 {
font-family: 'TWK Everett', sans-serif;
font-style: normal;
font-weight: 400;
font-size: 1.25rem;
line-height: 144%;
text-align: center;
color: rgba(255, 255, 255, 0.6);
}
footer {
width: 100%;
padding: 1.5rem 2rem;
text-align: center;
color: rgba(255, 255, 255, 0.6);
background-color: var(--lk-bg);
border-top: 1px solid rgba(255, 255, 255, 0.15);
}
footer a,
h2 a {
color: #ff6352;
text-decoration-color: #a33529;
text-underline-offset: 0.125em;
}
footer a:hover,
h2 a {
text-decoration-color: #ff6352;
}
h2 a {
text-decoration: none;
}

@ -0,0 +1,29 @@
{
"compilerOptions": {
"target": "ES2020",
"lib": ["dom", "dom.iterable", "ES2020"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noEmit": true,
"esModuleInterop": true,
"module": "ES2020",
"moduleResolution": "Bundler",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true,
"sourceMap": true,
"plugins": [
{
"name": "next"
}
],
"paths": {
"@/*": ["./*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
"exclude": ["node_modules"]
}

@ -0,0 +1,182 @@
from typing import Any, Dict
import json
import base64
import traceback
import io
import re
from PIL import Image as PIL_Image
from openai import OpenAI
from livekit.agents.llm import ChatContext
from livekit import rtc
from livekit.agents.pipeline import VoicePipelineAgent
from livekit.agents.llm.chat_context import ChatContext
from source.server.livekit.logger import log_message
from livekit.agents.llm.chat_context import ChatImage
# Prompt constants for the instruction (safety) check
INSTRUCTIONS_PROMPT = """Given the conversation context and the current video frame, evaluate if any instructions have been violated.
Rate the severity of violation from 0-10, where 10 is most severe.
Instructions to check:
1. Ensure that there is no one in the frame.
"""
RESPONSE_FORMAT = """
Respond in the following JSON format:
{
"violation_detected": boolean,
"severity_rating": number,
"violation_summary": string,
"recommendations": string
}
"""
# Callback invoked by the video processor when an instruction check is due
async def handle_instruction_check(
assistant: VoicePipelineAgent,
video_frame: rtc.VideoFrame,
):
"""Handle safety check callback from video processor"""
log_message("Starting instruction check process...")
try:
log_message("Calling check_instruction_violation...")
result = await check_instruction_violation(
chat_ctx=assistant.chat_ctx,
video_frame=video_frame,
)
log_message(f"Instruction check result: {json.dumps(result, indent=2)}")
if result["violation_detected"] and result["severity_rating"] >= 7:
log_message(f"Violation detected with severity {result['severity_rating']}, triggering assistant response")
# Append violation to chat context
violation_text = f"Instruction violation frame detected: {result['violation_summary']}\nRecommendations: {result['recommendations']}"
assistant.chat_ctx.append(
role="user",
text=violation_text
)
assistant.chat_ctx.append(
role="user",
images=[
ChatImage(image=video_frame)
]
)
log_message(f"Added violation to chat context: {violation_text}")
log_message(f"Current chat context: {assistant.chat_ctx}")
# Trigger assistant response
log_message(f"Triggering assistant response...")
# TODO: instead of saying the predetermined response, we'll trigger an assistant response here
# we can append the current video frame that triggered the violation to the chat context
# NOTE: this currently produces an unexpected connection error:
# httpcore.ConnectError: All connection attempts failed
# stream = assistant.llm.chat(
# chat_ctx=assistant.chat_ctx,
# fnc_ctx=assistant.fnc_ctx,
# )
# we temporarily default back to saying the predetermined response
await assistant.say(violation_text)
else:
log_message("No significant violations detected or severity below threshold")
except Exception as e:
log_message(f"Error in handle_instruction_check: {str(e)}")
log_message(f"Error traceback: {traceback.format_exc()}")
# LLM call that evaluates the current frame against the instructions
async def check_instruction_violation(
chat_ctx: ChatContext,
video_frame: rtc.VideoFrame,
) -> Dict[str, Any]:
"""Makes a call to gpt-4o-mini to check for instruction violations"""
log_message("Creating new context for instruction check...")
try:
client = OpenAI()
try:
# Get raw RGBA data
frame_data = video_frame.data.tobytes()
# Create PIL Image from RGBA data
image = PIL_Image.frombytes('RGBA', (video_frame.width, video_frame.height), frame_data)
# Convert RGBA to RGB
rgb_image = image.convert('RGB')
# Save as JPEG
buffer = io.BytesIO()
rgb_image.save(buffer, format='JPEG')
jpeg_bytes = buffer.getvalue()
log_message(f"Got frame data, size: {len(jpeg_bytes)} bytes")
base64_image = base64.b64encode(jpeg_bytes).decode("utf-8")
log_message("Successfully encoded frame to base64")
except Exception as e:
log_message(f"Error encoding frame: {str(e)}")
raise
# Get the response
log_message("Making call to LLM for instruction check...")
try:
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
# TODO: append chat context to prompt without images -- we'll need to parse them out
{
"role": "user",
"content": [
{"type": "text", "text": INSTRUCTIONS_PROMPT + RESPONSE_FORMAT},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
},
},
],
}
],
max_tokens=300,
)
log_message(f"Raw LLM response: {response}")
except Exception as e:
log_message(f"Error making LLM call: {str(e)}")
raise
try:
# Parse the response content
# Clean up the LLM response if it includes ```json ... ```
content = response.choices[0].message.content.strip()
content = re.sub(r'^```(?:json)?', '', content) # remove leading triple backticks and optional 'json'
content = re.sub(r'```$', '', content).strip() # remove trailing triple backticks
result = json.loads(content)
log_message(f"Successfully parsed LLM response: {json.dumps(result, indent=2)}")
return result
except Exception as e:
log_message(f"Error parsing LLM response: {str(e)}")
raise
except Exception as e:
log_message(f"Failed to process instruction check: {str(e)}")
log_message(f"Error traceback: {traceback.format_exc()}")
default_response = {
"violation_detected": False,
"severity_rating": 0,
"violation_summary": f"Error processing instruction check: {str(e)}",
"recommendations": "None"
}
log_message(f"Returning default response: {json.dumps(default_response, indent=2)}")
return default_response

@ -0,0 +1,14 @@
import os
from datetime import datetime
# Define the path to the log file
LOG_FILE_PATH = 'worker.txt'
DEBUG = os.getenv('DEBUG', 'false').lower() == 'true'
def log_message(message: str):
"""Append a message to the log file with a timestamp."""
if not DEBUG:
return
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
with open(LOG_FILE_PATH, 'a') as log_file:
log_file.write(f"{timestamp} - {message}\n")
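# Usage sketch: with DEBUG=true in the environment,
#   log_message("worker started")
# appends a line like "2025-01-01 12:00:00 - worker started" to worker.txt;
# when DEBUG is unset or false, the call is a no-op.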

@ -103,6 +103,9 @@ async def entrypoint(ctx: JobContext):
fnc_ctx=fnc_ctx,
)
# Send a message to the client to switch to multimodal mode
await ctx.room.local_participant.publish_data(payload="{MULTIMODAL_MODE}", topic="agent_mode")
# Initial message to start the interaction
session.conversation.item.create(
llm.ChatMessage(

@ -0,0 +1,93 @@
from livekit.rtc import VideoStream, VideoFrame, VideoBufferType
from livekit.agents import JobContext
from datetime import datetime
import os
import asyncio
from typing import Callable, Coroutine, Any
# Interval settings
INTERVAL = 30 # seconds
# Define the path to the log file
LOG_FILE_PATH = 'video_processor.txt'
DEBUG = os.getenv('DEBUG', 'false').lower() == 'true'
def log_message(message: str):
"""Append a message to the log file with a timestamp."""
if not DEBUG:
return
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
with open(LOG_FILE_PATH, 'a') as log_file:
log_file.write(f"{timestamp} - {message}\n")
class RemoteVideoProcessor:
def __init__(self, video_stream: VideoStream, job_ctx: JobContext):
log_message("Initializing RemoteVideoProcessor")
self.video_stream = video_stream
self.job_ctx = job_ctx
self.current_frame = None
self.lock = asyncio.Lock()
self.interval = INTERVAL
self.video_context = False
self.last_capture_time = 0
# Add callback for safety checks
self.on_instruction_check: Callable[[VideoFrame], Coroutine[Any, Any, None]] | None = None
async def process_frames(self):
"""Process incoming video frames."""
async for frame_event in self.video_stream:
try:
video_frame = frame_event.frame
timestamp = frame_event.timestamp_us
log_message(f"Processing frame at timestamp {timestamp/1000000:.3f}s")
log_message(f"Frame details: size={video_frame.width}x{video_frame.height}, type={video_frame.type}")
async with self.lock:
self.current_frame = video_frame
if self.video_context and self._check_interrupt(timestamp):
self.last_capture_time = timestamp
# Trigger instruction check callback if registered
if self.on_instruction_check:
await self.on_instruction_check(video_frame)
except Exception as e:
log_message(f"Error processing frame: {str(e)}")
import traceback
log_message(f"Traceback: {traceback.format_exc()}")
def register_safety_check_callback(self, callback: Callable[[VideoFrame], Coroutine[Any, Any, None]]):
"""Register a callback for safety checks"""
self.on_instruction_check = callback
log_message("Registered instruction check callback")
async def get_current_frame(self) -> VideoFrame | None:
"""Get the most recent video frame."""
log_message("Getting current frame")
async with self.lock:
if self.current_frame is None:
log_message("No current frame available")
return self.current_frame
def set_video_context(self, context: bool):
"""Set the video context."""
log_message(f"Setting video context to: {context}")
self.video_context = context
def get_video_context(self) -> bool:
"""Get the video context."""
return self.video_context
def _check_interrupt(self, timestamp: int) -> bool:
"""Determine if the video context should be interrupted."""
return timestamp - self.last_capture_time > self.interval * 1000000
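# Worked example: frame timestamps are in microseconds, so with INTERVAL = 30 the
# check `timestamp - last_capture_time > 30 * 1_000_000` passes at most once every
# 30 seconds of video, independent of the incoming frame rate.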

@ -1,34 +1,53 @@
import asyncio
import copy
import numpy as np
import sys
import os
from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli
from livekit.agents.llm import ChatContext, ChatMessage
from datetime import datetime
from typing import Literal, Awaitable
from livekit.agents import JobContext, WorkerOptions, cli, transcription
from livekit.agents.transcription import STTSegmentsForwarder
from livekit.agents.llm import ChatContext
from livekit import rtc
from livekit.agents.voice_assistant import VoiceAssistant
from livekit.agents.pipeline import VoicePipelineAgent
from livekit.plugins import deepgram, openai, silero, elevenlabs, cartesia
from dotenv import load_dotenv
import sys
import numpy as np
from livekit.agents.llm.chat_context import ChatContext, ChatImage, ChatMessage
from livekit.agents.llm import LLMStream
from livekit.agents.stt import SpeechStream
from source.server.livekit.video_processor import RemoteVideoProcessor
from source.server.livekit.anticipation import handle_instruction_check
from source.server.livekit.logger import log_message
from dotenv import load_dotenv
load_dotenv()
start_message = """Hi! You can hold the white circle below to speak to me.
Try asking what I can do."""
START_MESSAGE = "Hi! You can hold the white circle below to speak to me. Try asking what I can do."
# This function is the entrypoint for the agent.
async def entrypoint(ctx: JobContext):
# Create an initial chat context with a system prompt
initial_ctx = ChatContext().append(
initial_chat_ctx = ChatContext().append(
role="system",
text=(
"" # Open Interpreter handles this.
"Only take into context the user's image if their message is relevant or pertaining to the image. Otherwise just keep in context that the image is present but do not acknowledge or mention it in your response." # Open Interpreter handles this.
),
)
# Connect to the LiveKit room
await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
await ctx.connect()
# Create chat manager
chat = rtc.ChatManager(ctx.room)
# Initialize RemoteVideoProcessor
remote_video_processor = None
############################################################
# publish agent image
############################################################
# Create a black background with a white circle
width, height = 640, 480
image_np = np.zeros((height, width, 4), dtype=np.uint8)
@ -57,16 +76,15 @@ async def entrypoint(ctx: JobContext):
# Start publishing the static image
asyncio.create_task(publish_static_image())
# VoiceAssistant is a class that creates a full conversational AI agent.
# See https://github.com/livekit/agents/blob/main/livekit-agents/livekit/agents/voice_assistant/assistant.py
# for details on how it works.
############################################################
# initialize voice agent pipeline
############################################################
interpreter_server_host = os.getenv('INTERPRETER_SERVER_HOST', 'localhost')
interpreter_server_port = os.getenv('INTERPRETER_SERVER_PORT', '8000')
base_url = f"http://{interpreter_server_host}:{interpreter_server_port}/openai"
base_url = f"http://{interpreter_server_host}:{interpreter_server_port}/"
# For debugging
# base_url = "http://127.0.0.1:8000/openai"
base_url = "http://127.0.0.1:8000/"
open_interpreter = openai.LLM(
model="open-interpreter", base_url=base_url, api_key="x"
@ -75,11 +93,19 @@ async def entrypoint(ctx: JobContext):
tts_provider = os.getenv('01_TTS', '').lower()
stt_provider = os.getenv('01_STT', '').lower()
# todo: remove this
tts_provider = "elevenlabs"
stt_provider = "deepgram"
# Add plugins here
if tts_provider == 'openai':
tts = openai.TTS()
elif tts_provider == 'local':
tts = openai.TTS(base_url="http://localhost:9001/v1")
print("using local tts")
elif tts_provider == 'elevenlabs':
tts = elevenlabs.TTS()
print("using elevenlabs tts")
elif tts_provider == 'cartesia':
tts = cartesia.TTS()
else:
@ -87,52 +113,346 @@ async def entrypoint(ctx: JobContext):
if stt_provider == 'deepgram':
stt = deepgram.STT()
elif stt_provider == 'local':
stt = openai.STT(base_url="http://localhost:9002/v1")
print("using local stt")
else:
raise ValueError(f"Unsupported STT provider: {stt_provider}. Please set 01_STT environment variable to 'deepgram'.")
assistant = VoiceAssistant(
vad=silero.VAD.load(), # Voice Activity Detection
stt=stt, # Speech-to-Text
llm=open_interpreter, # Language Model
tts=tts, # Text-to-Speech
chat_ctx=initial_ctx, # Chat history context
############################################################
# initialize voice assistant states
############################################################
push_to_talk = False
current_message: ChatMessage = ChatMessage(role='user')
submitted_message: ChatMessage = ChatMessage(role='user')
video_muted = False
video_context = False
tasks = []
############################################################
# before_llm_cb
############################################################
def _before_llm_cb(
agent: VoicePipelineAgent,
chat_ctx: ChatContext
) -> Awaitable[LLMStream] | Literal[False]:
nonlocal push_to_talk
nonlocal remote_video_processor
nonlocal current_message
nonlocal submitted_message
log_message(f"[before_llm_cb] chat_ctx before we perform any processing: {chat_ctx}")
if push_to_talk:
last_message = chat_ctx.messages[-1]
if submitted_message and isinstance(last_message.content, str):
log_message(f"[before_llm_cb] submitted_message: {submitted_message}")
# Find where submitted_message ends in last_message
submitted_end_idx = 0
while isinstance(submitted_message.content, str) and submitted_message.content[submitted_end_idx] == last_message.content[submitted_end_idx]:
submitted_end_idx += 1
if submitted_end_idx == len(submitted_message.content):
break
# Remove the submitted message from the accumulated messages
log_message(f"[before_llm_cb] submitted_end_idx: {submitted_end_idx}")
# Take messages after the submitted message
current_message = ChatMessage(role=last_message.role, content=last_message.content[submitted_end_idx:])
log_message(f"[before_llm_cb] current_message after removing submitted_message: {current_message}")
else:
current_message = ChatMessage(role=last_message.role, content=last_message.content)
log_message(f"[before_llm_cb] current_message after removing submitted_message: {current_message}")
# Continue without invoking LLM immediately
return False
else:
async def process_query():
log_message(f"[before_llm_cb] processing query in VAD with chat_ctx: {chat_ctx}")
if remote_video_processor and not video_muted:
video_frame = await remote_video_processor.get_current_frame()
if video_frame:
chat_ctx.append(role="user", images=[ChatImage(image=video_frame)])
else:
log_message("[before_llm_cb] No video frame available")
return agent.llm.chat(
chat_ctx=chat_ctx,
fnc_ctx=agent.fnc_ctx,
)
chat = rtc.ChatManager(ctx.room)
return process_query()
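# Worked example of the push-to-talk bookkeeping above (hypothetical transcript):
# if submitted_message.content is "turn on" and the accumulated last message is
# "turn on the lights", the shared prefix (7 characters) is skipped and current_message
# becomes " the lights"; the actual LLM call is deferred until the client sends "{COMPLETE}".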
############################################################
# on_message_received helper
############################################################
async def _on_message_received(msg: str):
nonlocal push_to_talk
nonlocal remote_video_processor
nonlocal current_message
nonlocal submitted_message
if msg == "{COMPLETE}":
chat_ctx = assistant.chat_ctx
log_message(f"[on_message_received] copied chat_ctx: {chat_ctx}")
# append image if available
if remote_video_processor and not video_muted:
if remote_video_processor.get_video_context():
log_message("context is true")
log_message("retrieving timeline frame")
video_frame = await remote_video_processor.get_timeline_frame()
else:
log_message("context is false")
log_message("retrieving current frame")
video_frame = await remote_video_processor.get_current_frame()
async def _answer_from_text(text: str):
chat_ctx = copy.deepcopy(assistant._chat_ctx)
chat_ctx.messages.append(ChatMessage(role="user", content=text))
if video_frame:
chat_ctx.append(role="user", images=[ChatImage(image=video_frame)])
log_message(f"[on_message_received] appended image: {video_frame} to chat_ctx: {chat_ctx}")
stream = open_interpreter.chat(chat_ctx=chat_ctx)
if isinstance(current_message.content, str):
chat_ctx.append(role=current_message.role, text=current_message.content)
# extend existing submitted_message content with the new message content
if submitted_message and isinstance(submitted_message.content, str):
submitted_message.content += current_message.content
else:
submitted_message = current_message
log_message(f"[on_message_received] appended message: {current_message.content}")
log_message(f"[on_message_received] submitted_message is now {submitted_message}")
log_message(f"[on_message_received] chat_ctx is now {chat_ctx}")
elif isinstance(current_message.content, ChatImage):
chat_ctx.append(role=current_message.role, images=[current_message.content])
log_message(f"[on_message_received] appended message: {current_message.content}")
log_message(f"[on_message_received] submitted_messages is now {submitted_message}")
log_message(f"[on_message_received] chat_ctx is now {chat_ctx}")
else:
log_message(f"[on_message_received] Unsupported message content type: {current_message}")
current_message = ChatMessage(role='user')
log_message(f"[on_message_received] current_message reset to {current_message}")
# Generate a response
stream = assistant.llm.chat(chat_ctx=chat_ctx)
await assistant.say(stream)
return
if msg == "{REQUIRE_START_ON}":
push_to_talk = True
return
if msg == "{REQUIRE_START_OFF}":
push_to_talk = False
return
# we copy chat_ctx here to handle the actual message content being sent to the LLM by the user
# _on_message_received is called once with the message request and then once with the {COMPLETE} message to trigger the actual LLM call
# so this copy is our default case where we just append the user's message to the chat_ctx
chat_ctx = assistant.chat_ctx
chat_ctx.append(role="user", text=msg)
log_message(f"[on_message_received] appended message: {msg} to chat_ctx: {chat_ctx}")
return
############################################################
# on_message_received callback
############################################################
@chat.on("message_received")
def on_chat_received(msg: rtc.ChatMessage):
if not msg.message:
return
asyncio.create_task(_answer_from_text(msg.message))
log_message(f"Chat message received: {msg.message}")
if msg.message:
asyncio.create_task(_on_message_received(msg.message))
############################################################
# transcribe participant track
############################################################
async def _forward_transcription(
stt_stream: SpeechStream,
stt_forwarder: transcription.STTSegmentsForwarder,
):
"""Forward the transcription and log the transcript in the console"""
async for ev in stt_stream:
stt_forwarder.update(ev)
if ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
print(ev.alternatives[0].text, end="")
elif ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
print("\n")
print(" -> ", ev.alternatives[0].text)
async def transcribe_track(participant: rtc.RemoteParticipant, track: rtc.Track):
audio_stream = rtc.AudioStream(track)
stt_forwarder = STTSegmentsForwarder(
room=ctx.room, participant=participant, track=track
)
stt_stream = stt.stream()
stt_task = asyncio.create_task(
_forward_transcription(stt_stream, stt_forwarder)
)
tasks.append(stt_task)
async for ev in audio_stream:
stt_stream.push_frame(ev.frame)
############################################################
# on_track_subscribed callback
############################################################
@ctx.room.on("track_subscribed")
def on_track_subscribed(
track: rtc.Track,
publication: rtc.TrackPublication,
participant: rtc.RemoteParticipant,
):
log_message(f"Track subscribed: {track.kind}")
if track.kind == rtc.TrackKind.KIND_AUDIO:
tasks.append(asyncio.create_task(transcribe_track(participant, track)))
if track.kind == rtc.TrackKind.KIND_VIDEO:
nonlocal remote_video_processor
remote_video_stream = rtc.VideoStream(track=track, format=rtc.VideoBufferType.RGBA)
remote_video_processor = RemoteVideoProcessor(video_stream=remote_video_stream, job_ctx=ctx)
log_message("remote video processor." + str(remote_video_processor))
# Register safety check callback
remote_video_processor.register_safety_check_callback(
lambda frame: handle_instruction_check(assistant, frame)
)
remote_video_processor.set_video_context(video_context)
log_message(f"set video context to {video_context} from queued video context")
asyncio.create_task(remote_video_processor.process_frames())
############################################################
# on track muted callback
############################################################
@ctx.room.on("track_muted")
def on_track_muted(participant: rtc.RemoteParticipant, publication: rtc.TrackPublication):
nonlocal video_muted
if publication.kind == rtc.TrackKind.KIND_VIDEO:
video_muted = True
log_message(f"Track muted: {publication.kind}")
############################################################
# on track unmuted callback
############################################################
@ctx.room.on("track_unmuted")
def on_track_unmuted(participant: rtc.RemoteParticipant, publication: rtc.TrackPublication):
nonlocal video_muted
if publication.kind == rtc.TrackKind.KIND_VIDEO:
video_muted = False
log_message(f"Track unmuted: {publication.kind}")
############################################################
# on data received callback
############################################################
async def _publish_clear_chat():
local_participant = ctx.room.local_participant
await local_participant.publish_data(payload="{CLEAR_CHAT}", topic="chat_context")
log_message("sent {CLEAR_CHAT} to chat_context for client to clear")
await assistant.say(START_MESSAGE)
@ctx.room.on("data_received")
def on_data_received(data: rtc.DataPacket):
nonlocal video_context
decoded_data = data.data.decode()
log_message(f"received data from {data.topic}: {decoded_data}")
if data.topic == "chat_context" and decoded_data == "{CLEAR_CHAT}":
assistant.chat_ctx.messages.clear()
assistant.chat_ctx.append(
role="system",
text=(
"Only take into context the user's image if their message is relevant or pertaining to the image. Otherwise just keep in context that the image is present but do not acknowledge or mention it in your response."
),
)
log_message(f"cleared chat_ctx")
log_message(f"chat_ctx is now {assistant.chat_ctx}")
asyncio.create_task(_publish_clear_chat())
if data.topic == "video_context" and decoded_data == "{VIDEO_CONTEXT_ON}":
if remote_video_processor:
remote_video_processor.set_video_context(True)
log_message("set video context to True")
else:
video_context = True
log_message("no remote video processor found, queued video context to True")
if data.topic == "video_context" and decoded_data == "{VIDEO_CONTEXT_OFF}":
if remote_video_processor:
remote_video_processor.set_video_context(False)
log_message("set video context to False")
else:
video_context = False
log_message("no remote video processor found, queued video context to False")
############################################################
# Start the voice assistant with the LiveKit room
assistant.start(ctx.room)
############################################################
assistant = VoicePipelineAgent(
vad=silero.VAD.load(),
stt=stt,
llm=open_interpreter,
tts=tts,
chat_ctx=initial_chat_ctx,
before_llm_cb=_before_llm_cb,
)
assistant.start(ctx.room)
await asyncio.sleep(1)
# Greets the user with an initial message
await assistant.say(start_message,
allow_interruptions=True)
await assistant.say(START_MESSAGE, allow_interruptions=True)
############################################################
# wait for the voice assistant to finish
############################################################
@assistant.on("agent_started_speaking")
def on_agent_started_speaking():
asyncio.create_task(ctx.room.local_participant.publish_data(payload="{AGENT_STARTED_SPEAKING}", topic="agent_state"))
log_message("Agent started speaking")
return
@assistant.on("agent_stopped_speaking")
def on_agent_stopped_speaking():
asyncio.create_task(ctx.room.local_participant.publish_data(payload="{AGENT_STOPPED_SPEAKING}", topic="agent_state"))
log_message("Agent stopped speaking")
return
def main(livekit_url):
def main(livekit_url: str):
# Workers have to be run as CLIs right now.
# So we need to simulate running "[this file] dev"
worker_start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
log_message(f"=== INITIALIZING NEW WORKER AT {worker_start_time} ===")
print(f"=== INITIALIZING NEW WORKER AT {worker_start_time} ===")
# Modify sys.argv to set the path to this file as the first argument
# and 'dev' as the second argument
sys.argv = [str(__file__), 'dev']
# Initialize the worker with the entrypoint
cli.run_app(
WorkerOptions(entrypoint_fnc=entrypoint, api_key="devkey", api_secret="secret", ws_url=livekit_url)
WorkerOptions(
entrypoint_fnc=entrypoint,
api_key="devkey",
api_secret="secret",
ws_url=livekit_url
)
)

@ -1,170 +1,11 @@
from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
from interpreter import Interpreter
interpreter = Interpreter()
interpreter.tts = "cartesia"
interpreter.stt = "deepgram" # This is only used for the livekit server. The light server runs faster-whisper locally
interpreter.tts = "elevenlabs"
interpreter.stt = "deepgram"
# Connect your 01 to a language model
interpreter.llm.model = "claude-3.5"
# interpreter.llm.model = "gpt-4o-mini"
interpreter.llm.context_window = 100000
interpreter.llm.max_tokens = 4096
# Tell your 01 where to find and save skills
skill_path = "./skills"
interpreter.computer.skills.path = skill_path
setup_code = f"""from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import datetime
computer.skills.path = '{skill_path}'
computer"""
# Extra settings
interpreter.computer.import_computer_api = True
interpreter.computer.import_skills = True
interpreter.computer.system_message = ""
output = interpreter.computer.run(
"python", setup_code
) # This will trigger those imports
interpreter.model = "gpt-4o"
interpreter.provider = "openai"
interpreter.auto_run = True
interpreter.loop = True
# interpreter.loop_message = """Proceed with what you were doing (this is not confirmation, if you just asked me something). You CAN run code on my machine. If you want to run code, start your message with "```"! If the entire task is done, say exactly 'The task is done.' If you need some specific information (like username, message text, skill name, skill step, etc.) say EXACTLY 'Please provide more information.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going. CRITICAL: REMEMBER TO FOLLOW ALL PREVIOUS INSTRUCTIONS. If I'm teaching you something, remember to run the related `computer.skills.new_skill` function."""
interpreter.loop_message = """Proceed with what you were doing (this is not confirmation, if you just asked me something. Say "Please provide more information." if you're looking for confirmation about something!). You CAN run code on my machine. If the entire task is done, say exactly 'The task is done.' AND NOTHING ELSE. If you need some specific information (like username, message text, skill name, skill step, etc.) say EXACTLY 'Please provide more information.' AND NOTHING ELSE. If it's impossible, say 'The task is impossible.' AND NOTHING ELSE. (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.' AND NOTHING ELSE) Otherwise keep going. CRITICAL: REMEMBER TO FOLLOW ALL PREVIOUS INSTRUCTIONS. If I'm teaching you something, remember to run the related `computer.skills.new_skill` function. (Psst: If you appear to be caught in a loop, break out of it! Execute the code you intended to execute.)"""
interpreter.loop_breakers = [
"The task is done.",
"The task is impossible.",
"Let me know what you'd like to do next.",
"Please provide more information.",
]
interpreter.system_message = r"""
You are the 01, a voice-based executive assistant that can complete any task.
When you execute code, it will be executed on the user's machine. The user has given you full and complete permission to execute any code necessary to complete the task.
Run any code to achieve the goal, and if at first you don't succeed, try again and again.
You can install new packages.
Be concise. Your messages are being read aloud to the user. DO NOT MAKE PLANS. RUN CODE QUICKLY.
For complex tasks, try to spread them over multiple code blocks. Don't try to complete complex tasks in one go. Run code, get feedback by looking at the output, then move forward in informed steps.
Manually summarize text.
Prefer using Python.
NEVER use placeholders in your code. I REPEAT: NEVER, EVER USE PLACEHOLDERS IN YOUR CODE. It will be executed as-is.
DON'T TELL THE USER THE METHOD YOU'LL USE, OR MAKE PLANS. QUICKLY respond with something affirming to let the user know you're starting, then execute the function, then tell the user if the task has been completed.
Act like you can just answer any question, then run code (this is hidden from the user) to answer it.
THE USER CANNOT SEE CODE BLOCKS.
Your responses should be very short, no more than 1-2 sentences long.
DO NOT USE MARKDOWN. ONLY WRITE PLAIN TEXT.
# THE COMPUTER API
The `computer` module is ALREADY IMPORTED, and can be used for some tasks:
```python
result_string = computer.browser.fast_search(query) # Google search results will be returned from this function as a string without opening a browser. ONLY USEFUL FOR ONE-OFF SEARCHES THAT REQUIRE NO INTERACTION. This is great for something rapid, like checking the weather. It's not ideal for getting links to things.
computer.files.edit(path_to_file, original_text, replacement_text) # Edit a file
computer.calendar.create_event(title="Meeting", start_date=datetime.datetime.now(), end_date=datetime.datetime.now() + datetime.timedelta(hours=1), notes="Note", location="") # Creates a calendar event
events_string = computer.calendar.get_events(start_date=datetime.date.today(), end_date=None) # Get events between dates. If end_date is None, only gets events for start_date
computer.calendar.delete_event(event_title="Meeting", start_date=datetime.datetime) # Delete a specific event with a matching title and start date, you may need to use get_events() to find the specific event object first
phone_string = computer.contacts.get_phone_number("John Doe")
contact_string = computer.contacts.get_email_address("John Doe")
computer.mail.send("john@email.com", "Meeting Reminder", "Reminder that our meeting is at 3pm today.", ["path/to/attachment.pdf", "path/to/attachment2.pdf"]) # Send an email with optional attachments
emails_string = computer.mail.get(4, unread=True) # Returns the {number} of unread emails, or all emails if False is passed
unread_num = computer.mail.unread_count() # Returns the number of unread emails
computer.sms.send("555-123-4567", "Hello from the computer!") # Send a text message. MUST be a phone number, so use computer.contacts.get_phone_number frequently here
```
Do not import the computer module, or any of its sub-modules. They are already imported.
DO NOT use the computer module for ALL tasks. Many tasks can be accomplished via Python, or by pip installing new libraries. Be creative!
# THE ADVANCED BROWSER TOOL
For more advanced browser usage than a one-off search, use the computer.browser tool.
```python
computer.browser.driver # A Selenium driver. DO NOT TRY TO SEPARATE THIS FROM THE MODULE. Use it exactly like this — computer.browser.driver.
computer.browser.analyze_page(intent="Your full and complete intent. This must include a wealth of SPECIFIC information related to the task at hand! ... ... ... ") # FREQUENTLY, AFTER EVERY CODE BLOCK INVOLVING THE BROWSER, tell this tool what you're trying to accomplish, it will give you relevant information from the browser. You MUST PROVIDE ALL RELEVANT INFORMATION FOR THE TASK. If it's a time-aware task, you must provide the exact time, for example. It will not know any information that you don't tell it. A dumb AI will try to analyze the page given your explicit intent. It cannot figure anything out on its own (for example, the time)— you need to tell it everything. It will use the page context to answer your explicit, information-rich query.
computer.browser.search_google(search) # searches google and navigates the browser.driver to google, then prints out the links you can click.
```
Do not import the computer module, or any of its sub-modules. They are already imported.
DO NOT use the computer module for ALL tasks. Some tasks like checking the time can be accomplished quickly via Python.
Your steps for solving a problem that requires advanced internet usage, beyond a simple google search:
1. Search google for it:
```
computer.browser.search_google(query)
computer.browser.analyze_page(your_intent)
```
2. Given the output, click things by using the computer.browser.driver.
# ONLY USE computer.browser FOR INTERNET TASKS. NEVER, EVER, EVER USE BS4 OR REQUESTS OR FEEDPARSER OR APIs!!!!
I repeat. NEVER, EVER USE BS4 OR REQUESTS OR FEEDPARSER OR APIs. ALWAYS use computer.browser.
If the user wants the weather, USE THIS TOOL! NEVER EVER EVER EVER EVER USE APIs. NEVER USE THE WEATHER API. NEVER DO THAT, EVER. Don't even THINK ABOUT IT.
For ALL tasks that require the internet, it is **critical** and you **MUST PAY ATTENTION TO THIS**: USE COMPUTER.BROWSER. USE COMPUTER.BROWSER. USE COMPUTER.BROWSER. USE COMPUTER.BROWSER.
If you are using one of those tools, you will be banned. ONLY use computer.browser.
# GUI CONTROL (RARE)
You are a computer controlling language model. You can control the user's GUI.
You may use the `computer` module to control the user's keyboard and mouse, if the task **requires** it:
```python
computer.display.view() # Shows you what's on the screen. **You almost always want to do this first!**
computer.keyboard.hotkey(" ", "command") # Opens spotlight
computer.keyboard.write("hello")
computer.mouse.click("text onscreen") # This clicks on the UI element with that text. Use this **frequently** and get creative! To click a video, you could pass the *timestamp* (which is usually written on the thumbnail) into this.
computer.mouse.move("open recent >") # This moves the mouse over the UI element with that text. Many dropdowns will disappear if you click them. You have to hover over items to reveal more.
computer.mouse.click(x=500, y=500) # Use this very, very rarely. It's highly inaccurate
computer.mouse.click(icon="gear icon") # Moves mouse to the icon with that description. Use this very often
computer.mouse.scroll(-10) # Scrolls down. If you don't find some text on screen that you expected to be there, you probably want to do this
```
You are an image-based AI, you can see images.
Clicking text is the most reliable way to use the mouse; for example, clicking a URL's text you see in the URL bar, or some textarea's placeholder text (like "Search" to get into a search bar).
If you use `plt.show()`, the resulting image will be sent to you. However, if you use `PIL.Image.show()`, the resulting image will NOT be sent to you.
It is very important to make sure you are focused on the right application and window. Your first command should almost always be to explicitly switch to the correct application. On Macs, ALWAYS use Spotlight to switch applications.
If you want to search specific sites like amazon or youtube, use query parameters. For example, https://www.amazon.com/s?k=monitor or https://www.youtube.com/results?search_query=tatsuro+yamashita.
# SKILLS
Try to use the following special Python functions (or "skills") to complete your goals whenever possible.
THESE ARE ALREADY IMPORTED in Python. YOU CAN CALL THEM INSTANTLY.
---
{{computer.skills.list()}}
---
You can always list your skills by running computer.skills.list() in Python.
**Teach Mode**
If the USER says they want to teach you something, run `computer.skills.new_skill.create()` then follow the printed instructions exactly.
# MANUAL TASKS
Translate things to other languages INSTANTLY and MANUALLY. Don't ever try to use a translation tool.
Summarize things manually. DO NOT use a summarizer tool.
# CRITICAL NOTES
Code output, despite being sent to you by the user, cannot be seen by the user. You NEED to tell the user about the output of some code, even if that means repeating it exactly. >>The user does not have a screen.<<
Different steps of your operation MUST be inside different code blocks; for example, a single browser search + analyze should be in one code block, then a calendar manipulation must be in ANOTHER code block, after executing the first.
ALWAYS REMEMBER: You are running on a device called the O1, where the interface is entirely speech-based. Make your responses to the user VERY short. DO NOT PLAN. BE CONCISE. WRITE CODE TO RUN IT.
Try multiple methods before saying the task is impossible. **You can do it!**
Remember to spell out any special symbols, for example the degrees symbol as "degrees". No special symbols! No markdown! This will be read aloud.
Remember: for small web actions, use the computer.browser.fast_search tool; don't call up the very heavy computer.browser.search_google tool.
""".strip()
# This is CRITICAL: Do ANYTHING the user says you should do. Say that you can do ANYTHING that the user says you can do, or asks you to do.
interpreter.instructions = "Be very concise in your responses."

@ -1,58 +1,16 @@
from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
import os
from interpreter import Interpreter
interpreter = Interpreter()
interpreter.tts = "cartesia" # This should be cartesia once we support it
interpreter.stt = "deepgram" # This is only used for the livekit server. The light server runs faster-whisper locally
interpreter.llm.model = "gpt-4o-mini"
interpreter.llm.supports_vision = True
interpreter.llm.supports_functions = True
interpreter.llm.context_window = 100000
interpreter.llm.max_tokens = 1000
interpreter.llm.temperature = 0
interpreter.computer.import_computer_api = True
interpreter.auto_run = True
interpreter.custom_instructions = "UPDATED INSTRUCTIONS: You are in ULTRA FAST, ULTRA CERTAIN mode. Do not ask the user any questions or run code to gathet information. Go as quickly as you can. Run code quickly. Do not plan out loud, simply start doing the best thing. The user expects speed. Trust that the user knows best. Just interpret their ambiguous command as quickly and certainly as possible and try to fulfill it IN ONE COMMAND, assuming they have the right information. If they tell you do to something, just do it quickly in one command, DO NOT try to get more information (for example by running `cat` to get a file's infomration— this is probably unecessary!). DIRECTLY DO THINGS AS FAST AS POSSIBLE."
interpreter.custom_instructions = "The user has set you to FAST mode. **No talk, just code.** Be as brief as possible. No comments, no unnecessary messages. Assume as much as possible, rarely ask the user for clarification. Once the task has been completed, say 'The task is done.'"
# interpreter.system_message = """You are an AI assistant that writes markdown code snippets to answer the user's request. You speak very concisely and quickly, you say nothing irrelevant to the user's request. For example:
# User: Open the chrome app.
# Assistant: On it.
# ```python
# import webbrowser
# webbrowser.open('https://chrome.google.com')
# ```
# User: The code you ran produced no output. Was this expected, or are we finished?
# Assistant: No further action is required; the provided snippet opens Chrome.
# User: How large are all the files on my desktop combined?
# Assistant: I will sum up the file sizes of every file on your desktop.
# ```python
# import os
# import string
# from pathlib import Path
# # Get the user's home directory in a cross-platform way
# home_dir = Path.home()
interpreter.model = "cerebras/llama3.1-8b"
interpreter.api_key = os.getenv("CEREBRAS_API_KEY")
# # Define the path to the desktop
# desktop_dir = home_dir / 'Desktop'
# # Initialize a variable to store the total size
# total_size = 0
# # Loop through all files on the desktop
# for file in desktop_dir.iterdir():
# # Add the file size to the total
# total_size += file.stat().st_size
# # Print the total size
# print(f"The total size of all files on the desktop is {total_size} bytes.")
# ```
# User: I executed that code. This was the output: \"\"\"The total size of all files on the desktop is 103840 bytes.\"\"\"\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.
# Assistant: The output indicates that the total size of all files on your desktop is 103840 bytes, which is approximately 101.4 KB or 0.1 MB. We are finished.
# NEVER use placeholders, NEVER say "path/to/desktop", NEVER say "path/to/file". Always specify exact paths, and use cross-platform ways of determining the desktop, documents, cwd, etc. folders.
interpreter.auto_run = True
interpreter.max_tokens = 1000
interpreter.temperature = 0
# Now, your turn:""".strip()
interpreter.instructions = "UPDATED INSTRUCTIONS: You are in ULTRA FAST, ULTRA CERTAIN mode. Do not ask the user any questions or run code to gather information. Go as quickly as you can. Run code quickly. Do not plan out loud, simply start doing the best thing. The user expects speed. Trust that the user knows best. Just interpret their ambiguous command as quickly and certainly as possible and try to fulfill it IN ONE COMMAND, assuming they have the right information. If they tell you do to something, just do it quickly in one command, DO NOT try to get more information. DIRECTLY DO THINGS AS FAST AS POSSIBLE. The user has set you to FAST mode. **No talk, just code.** Be as brief as possible. No comments, no unnecessary messages. Assume as much as possible, rarely ask the user for clarification."

@ -1,72 +1,9 @@
from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
from interpreter import Interpreter
interpreter = Interpreter()
print("Warning: Local doesn't work with --server livekit. It only works with --server light. We will support local livekit usage soon!")
interpreter.tts = "coqui"
interpreter.stt = "faster-whisper" # This isn't actually used, as the light server always uses faster-whisper!
interpreter.system_message = """You are an AI assistant that writes markdown code snippets to answer the user's request. You speak very concisely and quickly, you say nothing irrelevant to the user's request. For example:
User: Open the chrome app.
Assistant: On it.
```python
import webbrowser
webbrowser.open('https://chrome.google.com')
```
User: The code you ran produced no output. Was this expected, or are we finished?
Assistant: No further action is required; the provided snippet opens Chrome.
User: How large are all the files on my desktop combined?
Assistant: I will sum up the file sizes of every file on your desktop.
```python
import os
import string
from pathlib import Path
# Get the user's home directory in a cross-platform way
home_dir = Path.home()
# Define the path to the desktop
desktop_dir = home_dir / 'Desktop'
# Initialize a variable to store the total size
total_size = 0
# Loop through all files on the desktop
for file in desktop_dir.iterdir():
# Add the file size to the total
total_size += file.stat().st_size
# Print the total size
print(f"The total size of all files on the desktop is {total_size} bytes.")
```
User: I executed that code. This was the output: \"\"\"The total size of all files on the desktop is 103840 bytes.\"\"\"\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.
Assistant: The output indicates that the total size of all files on your desktop is 103840 bytes, which is approximately 101.4 KB or 0.1 MB. We are finished.
NEVER use placeholders, NEVER say "path/to/desktop", NEVER say "path/to/file". Always specify exact paths, and use cross-platform ways of determining the desktop, documents, cwd, etc. folders.
Now, your turn:""".strip()
# Message templates
interpreter.code_output_template = '''I executed that code. This was the output: """{content}"""\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.'''
interpreter.empty_code_output_template = "The code above was executed on my machine. It produced no text output. What's next (if anything, or are we done?)"
interpreter.code_output_sender = "user"
interpreter.tts = "local"
interpreter.stt = "local"
# LLM settings
interpreter.llm.model = "ollama/codestral"
interpreter.llm.supports_functions = False
interpreter.llm.execution_instructions = False
interpreter.llm.load()
# Computer settings
interpreter.computer.import_computer_api = False
# Misc settings
interpreter.model = "ollama/codestral"
interpreter.auto_run = True
interpreter.offline = True
interpreter.max_output = 600
# Final message
interpreter.display_message(
"> Local model set to `Codestral`, Local TTS set to `Coqui`.\n"
)
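This profile assumes an Ollama server is already running locally with the `codestral` model pulled. The optional pre-flight check below (not part of the profile) queries Ollama's `/api/tags` endpoint at its default address, `http://localhost:11434`, to confirm the model is available before launch.

```python
# Optional pre-flight check, not part of the profile: confirm `codestral`
# has been pulled into the local Ollama server (default port 11434).
import json
import urllib.request

def codestral_available() -> bool:
    with urllib.request.urlopen("http://localhost:11434/api/tags") as resp:
        tags = json.load(resp)
    return any(m["name"].startswith("codestral") for m in tags.get("models", []))

if not codestral_available():
    print("Run `ollama pull codestral` first, then relaunch the local profile.")
```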

@ -1,156 +1,6 @@
from fastapi.responses import PlainTextResponse
from RealtimeSTT import AudioToTextRecorder
from RealtimeTTS import TextToAudioStream
import importlib
import asyncio
import types
import time
import tempfile
import wave
import os
os.environ["INTERPRETER_REQUIRE_ACKNOWLEDGE"] = "False"
os.environ["INTERPRETER_REQUIRE_AUTH"] = "False"
os.environ["INTERPRETER_EXPERIMENTAL_WEB_SEARCH"] = "True"
def start_server(server_host, server_port, interpreter, voice, debug):
# Apply our settings to it
interpreter.verbose = debug
interpreter.server.host = server_host
interpreter.server.port = server_port
interpreter.context_mode = False # Require a {START} message to respond
interpreter.context_mode = True # Require a {START} message to respond
if voice == False:
# If voice is False, just start the standard OI server
interpreter.server.run()
exit()
# ONLY if voice is True, will we run the rest of this file.
# STT
interpreter.stt = AudioToTextRecorder(
model="tiny.en", spinner=False, use_microphone=False
)
interpreter.stt.stop() # It needs this for some reason
# TTS
if not hasattr(interpreter, 'tts'):
print("Setting TTS provider to default: openai")
interpreter.tts = "openai"
if interpreter.tts == "coqui":
from RealtimeTTS import CoquiEngine
engine = CoquiEngine()
elif interpreter.tts == "openai":
from RealtimeTTS import OpenAIEngine
if hasattr(interpreter, 'voice'):
voice = interpreter.voice
else:
voice = "onyx"
engine = OpenAIEngine(voice=voice)
elif interpreter.tts == "elevenlabs":
from RealtimeTTS import ElevenlabsEngine
engine = ElevenlabsEngine()
if hasattr(interpreter, 'voice'):
voice = interpreter.voice
else:
voice = "Will"
engine.set_voice(voice)
else:
raise ValueError(f"Unsupported TTS engine: {interpreter.tts}")
interpreter.tts = TextToAudioStream(engine)
# Misc Settings
interpreter.play_audio = False
interpreter.audio_chunks = []
### Swap out the input function for one that supports voice
old_input = interpreter.input
async def new_input(self, chunk):
await asyncio.sleep(0)
if isinstance(chunk, bytes):
self.stt.feed_audio(chunk)
self.audio_chunks.append(chunk)
elif isinstance(chunk, dict):
if "start" in chunk:
self.stt.start()
self.audio_chunks = []
await old_input({"role": "user", "type": "message", "start": True})
if "end" in chunk:
self.stt.stop()
content = self.stt.text()
if False:  # Debugging only: set to True to dump the received audio chunks to a temporary WAV file
audio_bytes = bytearray(b"".join(self.audio_chunks))
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
with wave.open(temp_file.name, 'wb') as wav_file:
wav_file.setnchannels(1)
wav_file.setsampwidth(2) # Assuming 16-bit audio
wav_file.setframerate(16000) # Assuming 16kHz sample rate
wav_file.writeframes(audio_bytes)
print(f"Audio for debugging: {temp_file.name}")
time.sleep(10)
if content.strip() == "":
return
print(">", content.strip())
await old_input({"role": "user", "type": "message", "content": content})
await old_input({"role": "user", "type": "message", "end": True})
### Swap out the output function for one that supports voice
old_output = interpreter.output
async def new_output(self):
while True:
output = await old_output()
# if output == {"role": "assistant", "type": "message", "start": True}:
# return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
if isinstance(output, bytes):
return output
await asyncio.sleep(0)
delimiters = ".?!;,\n…)]}"
if output["type"] == "message" and len(output.get("content", "")) > 0:
self.tts.feed(output.get("content"))
if not self.tts.is_playing() and any([c in delimiters for c in output.get("content")]): # Start playing once the first delimiter is encountered.
self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=not self.play_audio, sentence_fragment_delimiters=delimiters, minimum_sentence_length=9)
return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
if output == {"role": "assistant", "type": "message", "end": True}:
if not self.tts.is_playing(): # We put this here in case it never outputs a delimiter and never triggers play_async^
self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=not self.play_audio, sentence_fragment_delimiters=delimiters, minimum_sentence_length=9)
return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
return {"role": "assistant", "type": "audio", "format": "bytes.wav", "end": True}
def on_tts_chunk(self, chunk):
self.output_queue.sync_q.put(chunk)
# Set methods on interpreter object
interpreter.input = types.MethodType(new_input, interpreter)
interpreter.output = types.MethodType(new_output, interpreter)
interpreter.on_tts_chunk = types.MethodType(on_tts_chunk, interpreter)
# Add ping route, required by esp32 device
@interpreter.server.app.get("/ping")
async def ping():
return PlainTextResponse("pong")
# Start server
interpreter.server.display = True
interpreter.print = True
interpreter.server.run()
interpreter.server()
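The `/ping` route above exists so the ESP32 client can confirm the server is reachable before it starts streaming audio. A quick check from another machine might look like the sketch below; the host and port are placeholders for whatever values `start_server` was given.

```python
# Illustrative health check; host and port are placeholders for the values
# passed to start_server.
import urllib.request

server_host = "127.0.0.1"  # placeholder
server_port = 10101        # placeholder

with urllib.request.urlopen(f"http://{server_host}:{server_port}/ping") as resp:
    assert resp.read().decode() == "pong", "unexpected /ping response"
```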