import asyncio import copy import os import re from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli from livekit.agents.transcription import STTSegmentsForwarder from livekit.agents.llm import ChatContext, ChatMessage, LLMStream, ChatChunk, ChoiceDelta, Choice from livekit import rtc from livekit.agents.voice_assistant import VoiceAssistant from livekit.plugins import deepgram, openai, silero, elevenlabs from dotenv import load_dotenv import sys import numpy as np from typing import AsyncIterator load_dotenv() start_message = """Hi! You can hold the white circle below to speak to me. Try asking what I can do.""" class ProcessedLLMStream(LLMStream): def __init__( self, original_stream: LLMStream, regex_pattern: str = r'', ) -> None: super().__init__(chat_ctx=original_stream.chat_ctx, fnc_ctx=original_stream.fnc_ctx) self.original_stream = original_stream self.regex_pattern = regex_pattern self.init_match = '<.*?' # match for the '<' and any characters to the left of it self.accumulating = False self._aiter = self._process_stream() self._buffer = "" async def _process_stream(self) -> AsyncIterator[ChatChunk]: async for chunk in self.original_stream: new_choices = [] for choice in chunk.choices: content = choice.delta.content if content: init_match = re.search(self.init_match, content) if init_match: print("INITIAL MATCH FOUND!!!!!!") print("INITIAL MATCH FOUND!!!!!!") print("INITIAL MATCH FOUND!!!!!!") print("INITIAL MATCH FOUND!!!!!!") print("INITIAL MATCH FOUND!!!!!!") print("INITIAL MATCH FOUND!!!!!!") print("INITIAL MATCH FOUND!!!!!!") self.accumulating = True if self.accumulating: self._buffer += content print("ACCUMULATING BUFFER!!!") print("ACCUMULATING BUFFER!!!") print("ACCUMULATING BUFFER!!!") print("ACCUMULATING BUFFER!!!") print("ACCUMULATING BUFFER!!!") print("ACCUMULATING BUFFER!!!") match = re.search(self.regex_pattern, self._buffer) if match: code = match.group(1) print(f"Extracted Code: {code}") # Create a confirmation message confirmation_msg = ChatMessage( role="assistant", content=f"Code extracted: {code}", ) # Wrap the confirmation message in ChoiceDelta and Choice choice_delta = ChoiceDelta( role=confirmation_msg.role, content=str(confirmation_msg.content) # we know confirmation_msg.content is a string ) new_choice = Choice( delta=choice_delta, index=choice.index ) # Create a new ChatChunk with the confirmation Choice confirmation_chunk = ChatChunk(choices=[new_choice]) # Yield the confirmation chunk yield confirmation_chunk self.accumulating = False self._buffer = "" continue # Skip yielding the original content new_choices.append(choice) if new_choices: yield ChatChunk(choices=new_choices) async def __anext__(self) -> ChatChunk: try: return await self._aiter.__anext__() except StopAsyncIteration: await self.aclose() raise def _01_synthesize_assistant_reply( assistant: VoiceAssistant, chat_ctx: ChatContext ) -> LLMStream: """ Custom function to process the OpenAI compatible endpoint's output. Extracts code from responses matching the pattern. Args: assistant (VoiceAssistant): The VoiceAssistant instance. chat_ctx (ChatContext): The current chat context. Returns: LLMStream: The processed LLMStream. """ llm_stream = assistant.llm.chat(chat_ctx=chat_ctx, fnc_ctx=assistant.fnc_ctx) print("HELLO FROM INSIDE OUR CUSTOM LLM STREAM") print("HELLO FROM INSIDE OUR CUSTOM LLM STREAM") print("HELLO FROM INSIDE OUR CUSTOM LLM STREAM") print("HELLO FROM INSIDE OUR CUSTOM LLM STREAM") print("HELLO FROM INSIDE OUR CUSTOM LLM STREAM") print("HELLO FROM INSIDE OUR CUSTOM LLM STREAM") print("HELLO FROM INSIDE OUR CUSTOM LLM STREAM") print("HELLO FROM INSIDE OUR CUSTOM LLM STREAM") print("HELLO FROM INSIDE OUR CUSTOM LLM STREAM") return ProcessedLLMStream(original_stream=llm_stream) # This function is the entrypoint for the agent. async def entrypoint(ctx: JobContext): # Create an initial chat context with a system prompt initial_ctx = ChatContext().append( role="system", text=( "" # Open Interpreter handles this. ), ) # Connect to the LiveKit room await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY) # Create a black background with a white circle width, height = 640, 480 image_np = np.zeros((height, width, 4), dtype=np.uint8) # Create a white circle center = (width // 2, height // 2) radius = 50 y, x = np.ogrid[:height, :width] mask = ((x - center[0])**2 + (y - center[1])**2) <= radius**2 image_np[mask] = [255, 255, 255, 255] # White color with full opacity source = rtc.VideoSource(width, height) track = rtc.LocalVideoTrack.create_video_track("static_image", source) options = rtc.TrackPublishOptions() options.source = rtc.TrackSource.SOURCE_CAMERA publication = await ctx.room.local_participant.publish_track(track, options) # Function to continuously publish the static image async def publish_static_image(): while True: frame = rtc.VideoFrame(width, height, rtc.VideoBufferType.RGBA, image_np.tobytes()) source.capture_frame(frame) await asyncio.sleep(1/30) # Publish at 30 fps # Start publishing the static image asyncio.create_task(publish_static_image()) # VoiceAssistant is a class that creates a full conversational AI agent. # See https://github.com/livekit/agents/blob/main/livekit-agents/livekit/agents/voice_assistant/assistant.py # for details on how it works. interpreter_server_host = os.getenv('INTERPRETER_SERVER_HOST', 'localhost') interpreter_server_port = os.getenv('INTERPRETER_SERVER_PORT', '8000') base_url = f"http://{interpreter_server_host}:{interpreter_server_port}/openai" # For debugging # base_url = "http://127.0.0.1:8000/openai" open_interpreter = openai.LLM( model="open-interpreter", base_url=base_url, api_key="x" ) tts_provider = os.getenv('01_TTS', '').lower() stt_provider = os.getenv('01_STT', '').lower() # Add plugins here if tts_provider == 'openai': tts = openai.TTS() elif tts_provider == 'elevenlabs': tts = elevenlabs.TTS() elif tts_provider == 'cartesia': pass # import plugin, TODO support this else: raise ValueError(f"Unsupported TTS provider: {tts_provider}. Please set 01_TTS environment variable to 'openai' or 'elevenlabs'.") if stt_provider == 'deepgram': stt = deepgram.STT() else: raise ValueError(f"Unsupported STT provider: {stt_provider}. Please set 01_STT environment variable to 'deepgram'.") assistant = VoiceAssistant( vad=silero.VAD.load(), # Voice Activity Detection stt=stt, # Speech-to-Text llm=open_interpreter, # Language Model tts=tts, # Text-to-Speech chat_ctx=initial_ctx, # Chat history context # will_synthesize_assistant_reply=_01_synthesize_assistant_reply, ) chat = rtc.ChatManager(ctx.room) async def _answer_from_text(text: str): chat_ctx = copy.deepcopy(assistant._chat_ctx) chat_ctx.messages.append(ChatMessage(role="user", content=text)) stream = open_interpreter.chat(chat_ctx=chat_ctx) await assistant.say(stream) @chat.on("message_received") def on_chat_received(msg: rtc.ChatMessage): if not msg.message: return asyncio.create_task(_answer_from_text(msg.message)) # Start the voice assistant with the LiveKit room assistant.start(ctx.room) await asyncio.sleep(1) print("HELLO FROM INSIDE THE WORKER") print("HELLO FROM INSIDE THE WORKER") print("HELLO FROM INSIDE THE WORKER") print("HELLO FROM INSIDE THE WORKER") print("HELLO FROM INSIDE THE WORKER") # Greets the user with an initial message await assistant.say(start_message, allow_interruptions=True) stt_forwarder = STTSegmentsForwarder(room=ctx.room, participant=ctx.room.local_participant) await stt_forwarder._run() def main(livekit_url): print("Starting worker!!!!!!! 🦅🦅🦅🦅🦅🦅") print("Starting worker!!!!!!! 🦅🦅🦅🦅🦅🦅") print("Starting worker!!!!!!! 🦅🦅🦅🦅🦅🦅") print("Starting worker!!!!!!! 🦅🦅🦅🦅🦅🦅") print("Starting worker!!!!!!! 🦅🦅🦅🦅🦅🦅") # Workers have to be run as CLIs right now. # So we need to simualte running "[this file] dev" # Modify sys.argv to set the path to this file as the first argument # and 'dev' as the second argument sys.argv = [str(__file__), 'dev'] # Initialize the worker with the entrypoint cli.run_app( WorkerOptions(entrypoint_fnc=entrypoint, api_key="devkey", api_secret="secret", ws_url=livekit_url, port=8082) )