Merge pull request #309 from benxu3/livekit-realtime

add realtime livekit multimodal worker
pull/311/head
killian 3 months ago committed by GitHub
commit 207ec088b4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -19,6 +19,7 @@ import time
from dotenv import load_dotenv from dotenv import load_dotenv
import signal import signal
from source.server.livekit.worker import main as worker_main from source.server.livekit.worker import main as worker_main
from source.server.livekit.multimodal import main as multimodal_main
import warnings import warnings
import requests import requests
@ -71,6 +72,11 @@ def run(
"--debug", "--debug",
help="Print latency measurements and save microphone recordings locally for manual playback", help="Print latency measurements and save microphone recordings locally for manual playback",
), ),
multimodal: bool = typer.Option(
False,
"--multimodal",
help="Run the multimodal agent",
),
): ):
threads = [] threads = []
@ -274,7 +280,10 @@ def run(
for attempt in range(30): for attempt in range(30):
try: try:
worker_main(local_livekit_url) if multimodal:
multimodal_main(local_livekit_url)
else:
worker_main(local_livekit_url)
except KeyboardInterrupt: except KeyboardInterrupt:
print("Exiting.") print("Exiting.")
raise raise

@ -12,12 +12,12 @@ readme = "../README.md"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.10,<3.12" python = ">=3.10,<3.12"
livekit = "^0.12.1" livekit = "^0.17.2"
livekit-agents = "^0.8.6" livekit-agents = "^0.10.0"
livekit-plugins-deepgram = "^0.6.5" livekit-plugins-deepgram = "^0.6.7"
livekit-plugins-openai = "^0.8.1" livekit-plugins-openai = "^0.10.1"
livekit-plugins-silero = "^0.6.4" livekit-plugins-silero = "^0.7.1"
livekit-plugins-elevenlabs = "^0.7.3" livekit-plugins-elevenlabs = "^0.7.5"
segno = "^1.6.1" segno = "^1.6.1"
open-interpreter = {extras = ["os", "server"], version = "^0.3.12"} # You should add a "browser" extra, so selenium isn't in the main package open-interpreter = {extras = ["os", "server"], version = "^0.3.12"} # You should add a "browser" extra, so selenium isn't in the main package
ngrok = "^1.4.0" ngrok = "^1.4.0"

@ -0,0 +1,54 @@
from __future__ import annotations
import sys
from livekit.agents import (
AutoSubscribe,
JobContext,
WorkerOptions,
cli,
llm,
)
from livekit.agents.multimodal import MultimodalAgent
from livekit.plugins import openai
from dotenv import load_dotenv
import os
load_dotenv()
async def entrypoint(ctx: JobContext):
    """Run one realtime multimodal agent session for a LiveKit job.

    Connects to the job's room with audio-only auto-subscription, waits for a
    participant, starts a ``MultimodalAgent`` backed by the OpenAI realtime
    model, and then asks the model to open the conversation.

    Args:
        ctx: Job context handed to the entrypoint by the LiveKit worker.
    """
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
    participant = await ctx.wait_for_participant()

    # The key is read from the environment (load_dotenv() runs at import time).
    openai_api_key = os.getenv("OPENAI_API_KEY")
    model = openai.realtime.RealtimeModel(
        instructions="You are a helpful assistant and you love open-source software",
        voice="shimmer",
        temperature=0.8,
        modalities=["audio", "text"],
        api_key=openai_api_key,
        base_url="wss://api.openai.com/v1",
    )

    assistant = MultimodalAgent(model=model)
    # Bind the agent to the participant we just waited for instead of letting
    # it pick one on its own; previously `participant` was fetched but unused.
    assistant.start(ctx.room, participant)

    # NOTE(review): presumably start() has registered exactly one session on
    # the model by now -- confirm against the livekit-agents version in use.
    session = model.sessions[0]

    # Seed the conversation with a user turn so the model speaks first.
    session.conversation.item.create(
        llm.ChatMessage(
            role="user",
            content="Please begin the interaction with the user in a manner consistent with your instructions.",
        )
    )
    session.response.create()
def main(livekit_url):
    """Start the multimodal LiveKit worker against ``livekit_url``.

    Workers currently have to be launched through the LiveKit CLI, so this
    simulates running "[this file] dev" by overwriting ``sys.argv`` before
    handing control to ``cli.run_app``.

    Args:
        livekit_url: WebSocket URL of the LiveKit server, passed as ``ws_url``.
    """
    # First argument is the path to this file, second is the 'dev' subcommand.
    sys.argv = [str(__file__), "dev"]

    # Describe the worker: which entrypoint to run and how to reach the server.
    worker_options = WorkerOptions(
        entrypoint_fnc=entrypoint,
        api_key="devkey",
        api_secret="secret",
        ws_url=livekit_url,
        port=8082,
    )
    cli.run_app(worker_options)
Loading…
Cancel
Save