LiveKit Pipeline Agent (#4)
* init processors * fix image append for chat messages * add pre tts cb * stash non-functional worker changes * add complete flag parsing worker * stash worker changes * update worker.py to handle message context correctly * draft worker -- currently mismanages chat ctx * stash draft delta changes * working worker on push to talk happy path * final working worker on push to talk * refactor append image * updated video frame processing * rm text processor * working draft main * draft working poetry (pull/314/head)
parent
befddaf205
commit
1720b783ce
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,48 @@
|
||||
import aiohttp
|
||||
from typing import Annotated
|
||||
from livekit.agents import llm
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
# Path of the file where tool-call activity is appended.
LOG_FILE_PATH = 'assistant_functions.txt'


def log_message(message: str) -> None:
    """Append a message to the log file with a timestamp.

    Args:
        message: Free-form text to record; one line is written per call.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Explicit encoding so log output is identical across platforms
    # (the original relied on the locale-dependent default encoding).
    with open(LOG_FILE_PATH, 'a', encoding='utf-8') as log_file:
        log_file.write(f"{timestamp} - {message}\n")
|
||||
# Tool container exposed to the LLM; llm.FunctionContext collects the
# @llm.ai_callable-decorated methods into the model's tool list.
class AssistantFnc(llm.FunctionContext):
    # NOTE: the docstring below doubles as the tool description sent to the
    # LLM, so it is part of the runtime contract — do not reword it casually.
    @llm.ai_callable()
    async def get_weather(
        self,
        # Annotated carries the human-readable arg description to the LLM.
        location: Annotated[
            str, llm.TypeInfo(description="The location to get the weather for")
        ],
    ) -> str:
        """Called when the user asks about the weather. This function will return the weather for the given location."""
        log_message(f"getting weather for {location}")
        url = f"https://wttr.in/{location}?format=%C+%t"

        async with aiohttp.ClientSession() as http, http.get(url) as resp:
            log_message(f"response: {resp}")
            # Guard clause: bail out early on any non-OK status.
            if resp.status != 200:
                log_message(f"Failed to get weather data, status code: {resp.status}")
                return f"Failed to get weather data, status code: {resp.status}"

            report = await resp.text()
            reply: str = f"The weather in {location} is {report}."
            log_message(f"content: {reply}")
            # The returned string becomes the tool response the LLM reads.
            return reply
@ -0,0 +1,14 @@
|
||||
from livekit.agents import stt, transcription
|
||||
|
||||
async def _forward_transcription(
    stt_stream: stt.SpeechStream,
    stt_forwarder: transcription.STTSegmentsForwarder,
):
    """Relay STT events to the client forwarder and echo transcripts to stdout."""
    async for event in stt_stream:
        # Always push the event to the client-side forwarder first.
        stt_forwarder.update(event)
        kind = event.type
        if kind == stt.SpeechEventType.FINAL_TRANSCRIPT:
            # Finish the interim line, then print the final transcript.
            print("\n")
            print(" -> ", event.alternatives[0].text)
        elif kind == stt.SpeechEventType.INTERIM_TRANSCRIPT:
            # Stream partial text without a newline so it updates in place.
            print(event.alternatives[0].text, end="")
@ -0,0 +1,50 @@
|
||||
from livekit.rtc import VideoStream
|
||||
from livekit.agents import JobContext
|
||||
from datetime import datetime
|
||||
from livekit.agents.pipeline import VoicePipelineAgent
|
||||
|
||||
|
||||
from livekit.rtc import VideoFrame
|
||||
import asyncio
|
||||
|
||||
# Path of the file where video-processing activity is appended.
LOG_FILE_PATH = 'video_processor.txt'


def log_message(message: str) -> None:
    """Append a message to the log file with a timestamp.

    Args:
        message: Free-form text to record; one line is written per call.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Explicit encoding so log output is identical across platforms
    # (the original relied on the locale-dependent default encoding).
    with open(LOG_FILE_PATH, 'a', encoding='utf-8') as log_file:
        log_file.write(f"{timestamp} - {message}\n")
||||
|
||||
class RemoteVideoProcessor:
    """Processes video frames from a remote participant's video stream.

    Continuously drains ``video_stream`` and caches the most recent frame so
    other coroutines can sample it on demand via :meth:`get_current_frame`.
    """

    def __init__(self, video_stream: VideoStream, job_ctx: JobContext):
        # Source of frame events for one remote participant's track.
        self.video_stream = video_stream
        # Job context held for callers; not read by this class itself.
        self.job_ctx = job_ctx
        # Most recently received frame; None until the first frame arrives.
        self.current_frame: VideoFrame | None = None
        # Serializes access to current_frame between the consumer loop
        # and get_current_frame() callers.
        self.lock = asyncio.Lock()

    async def process_frames(self) -> None:
        """Consume the video stream, caching each frame as it arrives.

        Runs until the stream is exhausted. Per-frame errors are logged and
        skipped so a single bad frame does not stop the whole stream.
        """
        log_message("Starting to process remote video frames.")
        async for frame_event in self.video_stream:
            try:
                video_frame = frame_event.frame
                # frame_event also exposes timestamp_us and rotation; the
                # originals extracted them but never used them, so they are
                # intentionally not read here.
                log_message(f"Received frame: width={video_frame.width}, height={video_frame.height}, type={video_frame.type}")
                async with self.lock:
                    self.current_frame = video_frame
            except Exception as e:
                # Best-effort: record the failure and keep consuming frames.
                log_message(f"Error processing frame: {e}")

    async def get_current_frame(self) -> VideoFrame | None:
        """Return the latest cached VideoFrame, or None if none seen yet."""
        log_message("called get current frame")
        async with self.lock:
            log_message("retrieving current frame: " + str(self.current_frame))
            return self.current_frame
Loading…
Reference in new issue