@@ -7,9 +7,15 @@ from livekit import rtc
 from livekit.agents.voice_assistant import VoiceAssistant
 from livekit.plugins import deepgram, openai, silero, elevenlabs
 from dotenv import load_dotenv
+import sys
+import numpy as np
 
 load_dotenv()
 
+start_message = """Hi! You can hold the white circle below to speak to me.
+
+Try asking what I can do."""
+
 # This function is the entrypoint for the agent.
 async def entrypoint(ctx: JobContext):
     # Create an initial chat context with a system prompt
@@ -23,17 +29,47 @@ async def entrypoint(ctx: JobContext):
     # Connect to the LiveKit room
     await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
 
+    # Create a black background with a white circle
+    width, height = 640, 480
+    image_np = np.zeros((height, width, 4), dtype=np.uint8)
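+    # Note: np.zeros also zeroes the alpha channel, so this is transparent
+    # black; alpha is typically dropped when the frame is encoded for
+    # transmission, which is what makes the background render as solid black.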
+
+    # Create a white circle
+    center = (width // 2, height // 2)
+    radius = 50
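+    # np.ogrid yields 1-D row/column grids that broadcast against each other,
+    # so the circle test below covers every pixel without a Python loop.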
+    y, x = np.ogrid[:height, :width]
+    mask = ((x - center[0])**2 + (y - center[1])**2) <= radius**2
+    image_np[mask] = [255, 255, 255, 255]  # White color with full opacity
+
+    source = rtc.VideoSource(width, height)
+    track = rtc.LocalVideoTrack.create_video_track("static_image", source)
+
+    options = rtc.TrackPublishOptions()
+    options.source = rtc.TrackSource.SOURCE_CAMERA
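+    # Marking the track as SOURCE_CAMERA means clients treat it as the
+    # participant's camera feed, so the circle renders where a webcam would.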
+    publication = await ctx.room.local_participant.publish_track(track, options)
+
+    # Function to continuously publish the static image
+    async def publish_static_image():
+        while True:
+            frame = rtc.VideoFrame(width, height, rtc.VideoBufferType.RGBA, image_np.tobytes())
+            source.capture_frame(frame)
+            await asyncio.sleep(1/30)  # Publish at 30 fps
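+            # (30 fps is generous for a static image; a lower rate would also work)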
+
+    # Start publishing the static image
+    asyncio.create_task(publish_static_image())
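+    # Note: no reference to this task is kept; asyncio holds only weak
+    # references to tasks, so storing the result in a variable would guard
+    # against the task being garbage-collected mid-session.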
+
     # VoiceAssistant is a class that creates a full conversational AI agent.
     # See https://github.com/livekit/agents/blob/main/livekit-agents/livekit/agents/voice_assistant/assistant.py
     # for details on how it works.
 
-    interpreter_server_host = os.getenv('INTERPRETER_SERVER_HOST', '0.0.0.0')
-    interpreter_server_port = os.getenv('INTERPRETER_LIGHT_SERVER_PORT', '8000')
+    interpreter_server_host = os.getenv('INTERPRETER_SERVER_HOST', 'localhost')
+    interpreter_server_port = os.getenv('INTERPRETER_SERVER_PORT', '8000')
     base_url = f"http://{interpreter_server_host}:{interpreter_server_port}/openai"
 
+    # For debugging
+    # base_url = "http://127.0.0.1:8000/openai"
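+
+    # Note: api_key only needs to be non-empty here; the OpenAI client insists
+    # on a key, and the local interpreter server presumably ignores it.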
     open_interpreter = openai.LLM(
-        model="open-interpreter", base_url=base_url
+        model="open-interpreter", base_url=base_url, api_key="x"
     )
 
     assistant = VoiceAssistant(
@@ -65,13 +101,20 @@ async def entrypoint(ctx: JobContext):
     await asyncio.sleep(1)
 
     # Greets the user with an initial message
-    await assistant.say("""Hi! You can hold the white circle below to speak to me.
-
-Try asking what I can do.""", allow_interruptions=True)
+    await assistant.say(start_message,
+    allow_interruptions=True)
 
-if __name__ == "__main__":
+def main(livekit_url):
+    # Workers have to be run as CLIs right now.
+    # So we need to simulate running "[this file] dev"
+
+    # Modify sys.argv to set the path to this file as the first argument
+    # and 'dev' as the second argument
+    sys.argv = [str(__file__), 'dev']
+
     # Initialize the worker with the entrypoint
     cli.run_app(
-        WorkerOptions(entrypoint_fnc=entrypoint, api_key="devkey", api_secret="secret", ws_url=os.getenv("LIVEKIT_URL"))
+        WorkerOptions(entrypoint_fnc=entrypoint, api_key="devkey", api_secret="secret", ws_url=livekit_url)
     )
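+
+# Example of invoking the new main() from another module (module name is
+# hypothetical):
+#   from worker import main
+#   main(os.getenv("LIVEKIT_URL"))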