From 5810db46923f6625f4ac24e6591b026208f84d48 Mon Sep 17 00:00:00 2001
From: Ben Xu <benx.xu@mail.utoronto.ca>
Date: Wed, 2 Oct 2024 17:32:33 -0400
Subject: [PATCH] init livekit quickstart multimodal worker

---
 software/main.py                             | 11 +++-
 software/pyproject.toml                      | 12 ++---
 software/source/server/livekit/multimodal.py | 54 ++++++++++++++++++++
 3 files changed, 70 insertions(+), 7 deletions(-)
 create mode 100644 software/source/server/livekit/multimodal.py

diff --git a/software/main.py b/software/main.py
index 50c7dae..0cf1bf6 100644
--- a/software/main.py
+++ b/software/main.py
@@ -19,6 +19,7 @@ import time
 from dotenv import load_dotenv
 import signal
 from source.server.livekit.worker import main as worker_main
+from source.server.livekit.multimodal import main as multimodal_main
 import warnings
 import requests
 
@@ -71,6 +72,11 @@ def run(
         "--debug",
         help="Print latency measurements and save microphone recordings locally for manual playback",
     ),
+    multimodal: bool = typer.Option(
+        False,
+        "--multimodal",
+        help="Run the multimodal agent",
+    ),
 ):
 
     threads = []
@@ -274,7 +280,10 @@ def run(
 
             for attempt in range(30):
                 try:
-                    worker_main(local_livekit_url)
+                    if multimodal:
+                        multimodal_main(local_livekit_url)
+                    else:
+                        worker_main(local_livekit_url)
                 except KeyboardInterrupt:
                     print("Exiting.")
                     raise
diff --git a/software/pyproject.toml b/software/pyproject.toml
index 6b75c87..391381d 100644
--- a/software/pyproject.toml
+++ b/software/pyproject.toml
@@ -12,12 +12,12 @@ readme = "../README.md"
 
 [tool.poetry.dependencies]
 python = ">=3.10,<3.12"
-livekit = "^0.12.1"
-livekit-agents = "^0.8.6"
-livekit-plugins-deepgram = "^0.6.5"
-livekit-plugins-openai = "^0.8.1"
-livekit-plugins-silero = "^0.6.4"
-livekit-plugins-elevenlabs = "^0.7.3"
+livekit = "^0.17.2"
+livekit-agents = "^0.10.0"
+livekit-plugins-deepgram = "^0.6.7"
+livekit-plugins-openai = "^0.10.1"
+livekit-plugins-silero = "^0.7.1"
+livekit-plugins-elevenlabs = "^0.7.5"
 segno = "^1.6.1"
 open-interpreter = {git = "https://github.com/openinterpreter/open-interpreter.git", rev = "development", extras = ["os", "server"]}
 ngrok = "^1.4.0"
diff --git a/software/source/server/livekit/multimodal.py b/software/source/server/livekit/multimodal.py
new file mode 100644
index 0000000..a8ae4f2
--- /dev/null
+++ b/software/source/server/livekit/multimodal.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+import sys
+from livekit.agents import (
+    AutoSubscribe,
+    JobContext,
+    WorkerOptions,
+    cli,
+    llm,
+)
+from livekit.agents.multimodal import MultimodalAgent
+from livekit.plugins import openai
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+async def entrypoint(ctx: JobContext):  # Job entrypoint: passed to WorkerOptions(entrypoint_fnc=...) in main().
+    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)  # connect using the AUDIO_ONLY auto-subscribe mode
+
+    participant = await ctx.wait_for_participant()  # NOTE(review): awaited, but the returned value is never used below
+
+    openai_api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is unset; load_dotenv() ran at import time
+    model = openai.realtime.RealtimeModel(
+        instructions="You are a helpful assistant and you love kittens",
+        voice="shimmer",
+        temperature=0.8,
+        modalities=["audio", "text"],
+        api_key=openai_api_key,
+        base_url="wss://api.openai.com/v1",
+    )
+    assistant = MultimodalAgent(model=model)
+    assistant.start(ctx.room)
+
+    session = model.sessions[0]  # NOTE(review): assumes start() has already registered a session on the model - confirm
+    session.conversation.item.create(  # seed the conversation with a single user-role message
+      llm.ChatMessage(
+        role="user",
+        content="Please begin the interaction with the user in a manner consistent with your instructions.",
+      )
+    )
+    session.response.create()  # presumably asks the model to generate its opening reply - confirm against the plugin docs
+
+def main(livekit_url):  # Run the multimodal worker; livekit_url is forwarded as the worker's ws_url.
+    # Workers have to be run as CLIs right now,
+    # so we need to simulate running "[this file] dev".
+
+    # Overwrite sys.argv so that the path to this file is the first argument
+    # and 'dev' is the second argument (this replaces the process-wide argv).
+    sys.argv = [str(__file__), 'dev']
+
+    # Initialize the worker with the entrypoint (credentials and port are hard-coded here)
+    cli.run_app(
+        WorkerOptions(entrypoint_fnc=entrypoint, api_key="devkey", api_secret="secret", ws_url=livekit_url, port=8082)
+    )
\ No newline at end of file