diff --git a/software/main.py b/software/main.py index 50c7dae..0cf1bf6 100644 --- a/software/main.py +++ b/software/main.py @@ -19,6 +19,7 @@ import time from dotenv import load_dotenv import signal from source.server.livekit.worker import main as worker_main +from source.server.livekit.multimodal import main as multimodal_main import warnings import requests @@ -71,6 +72,11 @@ def run( "--debug", help="Print latency measurements and save microphone recordings locally for manual playback", ), + multimodal: bool = typer.Option( + False, + "--multimodal", + help="Run the multimodal agent", + ), ): threads = [] @@ -274,7 +280,10 @@ def run( for attempt in range(30): try: - worker_main(local_livekit_url) + if multimodal: + multimodal_main(local_livekit_url) + else: + worker_main(local_livekit_url) except KeyboardInterrupt: print("Exiting.") raise diff --git a/software/pyproject.toml b/software/pyproject.toml index 6b75c87..391381d 100644 --- a/software/pyproject.toml +++ b/software/pyproject.toml @@ -12,12 +12,12 @@ readme = "../README.md" [tool.poetry.dependencies] python = ">=3.10,<3.12" -livekit = "^0.12.1" -livekit-agents = "^0.8.6" -livekit-plugins-deepgram = "^0.6.5" -livekit-plugins-openai = "^0.8.1" -livekit-plugins-silero = "^0.6.4" -livekit-plugins-elevenlabs = "^0.7.3" +livekit = "^0.17.2" +livekit-agents = "^0.10.0" +livekit-plugins-deepgram = "^0.6.7" +livekit-plugins-openai = "^0.10.1" +livekit-plugins-silero = "^0.7.1" +livekit-plugins-elevenlabs = "^0.7.5" segno = "^1.6.1" open-interpreter = {git = "https://github.com/openinterpreter/open-interpreter.git", rev = "development", extras = ["os", "server"]} ngrok = "^1.4.0" diff --git a/software/source/server/livekit/multimodal.py b/software/source/server/livekit/multimodal.py new file mode 100644 index 0000000..a8ae4f2 --- /dev/null +++ b/software/source/server/livekit/multimodal.py @@ -0,0 +1,54 @@ +from __future__ import annotations +import sys +from livekit.agents import ( + AutoSubscribe, + 
JobContext,
+    WorkerOptions,
+    cli,
+    llm,
+)
+from livekit.agents.multimodal import MultimodalAgent
+from livekit.plugins import openai
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+async def entrypoint(ctx: JobContext):
+    """Connect to the room (audio only), wait for a participant, and start a realtime agent."""
+    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
+    # Only the wait matters here; the returned participant is not used.
+    await ctx.wait_for_participant()
+
+    model = openai.realtime.RealtimeModel(
+        instructions="You are a helpful assistant and you love kittens",
+        voice="shimmer",
+        temperature=0.8,
+        modalities=["audio", "text"],
+        api_key=os.getenv("OPENAI_API_KEY"),
+        base_url="wss://api.openai.com/v1",
+    )
+    assistant = MultimodalAgent(model=model)
+    assistant.start(ctx.room)
+
+    # Seed the first session with a user turn, then request a response to it.
+    session = model.sessions[0]
+    session.conversation.item.create(
+        llm.ChatMessage(
+            role="user",
+            content="Please begin the interaction with the user in a manner consistent with your instructions.",
+        )
+    )
+    session.response.create()
+
+def main(livekit_url):
+    """Run the multimodal worker against the LiveKit server at ``livekit_url``."""
+    # Workers have to be run as CLIs right now, so simulate running
+    # "[this file] dev": argv[0] is this file's path and argv[1] is 'dev'.
+    # NOTE: this overwrites sys.argv for the whole process.
+    sys.argv = [str(__file__), 'dev']
+
+    # Initialize the worker with the entrypoint
+    cli.run_app(
+        WorkerOptions(entrypoint_fnc=entrypoint, api_key="devkey", api_secret="secret", ws_url=livekit_url, port=8082)
+    )