diff --git a/software/main.py b/software/main.py
index 0cf1bf6..1d5dd51 100644
--- a/software/main.py
+++ b/software/main.py
@@ -258,7 +258,7 @@ def run(
 
     ### START LIVEKIT WORKER
     if server == "livekit":
-        time.sleep(7)
+        time.sleep(5)
         # These are needed to communicate with the worker's entrypoint
         os.environ['INTERPRETER_SERVER_HOST'] = light_server_host
         os.environ['INTERPRETER_SERVER_PORT'] = str(light_server_port)
@@ -273,7 +273,7 @@ def run(
             room="my-room",
         )).to_jwt())
 
-        meet_url = f'https://meet.livekit.io/custom?liveKitUrl={url.replace("http", "ws")}&token={token}\n\n'
+        meet_url = f'http://localhost:3000/custom?liveKitUrl={url.replace("http", "ws")}&token={token}\n\n'
         print("\n")
         print("For debugging, you can join a video call with your assistant. Click the link below, then send a chat message that says {CONTEXT_MODE_OFF}, then begin speaking:")
         print(meet_url)
diff --git a/software/source/server/livekit/multimodal.py b/software/source/server/livekit/multimodal.py
index 08b1a0d..25e3942 100644
--- a/software/source/server/livekit/multimodal.py
+++ b/software/source/server/livekit/multimodal.py
@@ -11,38 +11,110 @@ from livekit.agents.multimodal import MultimodalAgent
 from livekit.plugins import openai
 from dotenv import load_dotenv
 import os
+import time
+from typing import Annotated
+from livekit.agents import llm
+
+# Set the environment variable
+os.environ['INTERPRETER_TERMINAL_INPUT_PATIENCE'] = '200000'
+
+instructions = """
+You are Open Interpreter, a world-class programmer that can complete any goal by executing code.
+For advanced requests, start by writing a plan.
+When you execute code, it will be executed **on the user's machine** in a stateful Jupyter notebook. The user has given you **full permission** to execute any code necessary to complete the task. Execute the code. You CAN run code on the users machine, using the tool you have access to.
+You can access the internet.
+Run **any code** to achieve the goal, and if at first you don't succeed, try again and again.
+You can install new packages.
+If you modify or create a file, YOU MUST THEN OPEN IT to display it to the user.
+Be concise. Do NOT send the user a markdown version of your code — just execute the code instantly. Execute the code!
+
+You are capable of **any** task.
+
+You MUST remember to pass into the execute_code function a correct JSON input like {"code": "print('hello world')"} and NOT a raw string or something else.
+"""
 
 load_dotenv()
 
 async def entrypoint(ctx: JobContext):
+
+    from interpreter import interpreter
+
+    def execute_code(code):
+        print("--- code ---")
+        print(code)
+        print("---")
+        #time.sleep(2)
+        # Check if the code contains any file deletion commands
+        if any(keyword in code.lower() for keyword in ['os.remove', 'os.unlink', 'shutil.rmtree', 'delete file', 'rm -']):
+            print("Warning: File deletion commands detected. Execution aborted for safety.")
+            return "Execution aborted: File deletion commands are not allowed."
+        print("--- output ---")
+        output = ""
+        for chunk in interpreter.computer.run("python", code):
+            if "content" in chunk and type(chunk["content"]) == str:
+                output += "\n" + chunk["content"]
+                print(chunk["content"])
+        print("---")
+
+        output = output.strip()
+
+        if output == "":
+            output = "No output was produced by running this code."
+
+        return output
+
+
+    # first define a class that inherits from llm.FunctionContext
+    class AssistantFnc(llm.FunctionContext):
+        # the llm.ai_callable decorator marks this function as a tool available to the LLM
+        # by default, it'll use the docstring as the function's description
+        @llm.ai_callable()
+        async def execute(
+            self,
+            # by using the Annotated type, arg description and type are available to the LLM
+            code: Annotated[
+                str, llm.TypeInfo(description="The Python code to execute")
+            ],
+        ):
+            """Executes Python and returns the output"""
+            return execute_code(code)
+
+    fnc_ctx = AssistantFnc()
+
     await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
     participant = await ctx.wait_for_participant()
 
     openai_api_key = os.getenv("OPENAI_API_KEY")
     model = openai.realtime.RealtimeModel(
-        instructions="You are a helpful assistant and you love open-source software",
+        instructions=instructions,
         voice="shimmer",
-        temperature=0.8,
+        temperature=0.6,
         modalities=["audio", "text"],
         api_key=openai_api_key,
         base_url="wss://api.openai.com/v1",
     )
-    assistant = MultimodalAgent(model=model)
+    model._fnc_ctx = fnc_ctx
+    assistant = MultimodalAgent(model=model, fnc_ctx=fnc_ctx)
+
     assistant.start(ctx.room)
 
-    session = model.sessions[0]
+    # Create a session with the function context
+    session = model.session(
+        chat_ctx=llm.ChatContext(),
+        fnc_ctx=fnc_ctx,
+    )
+
+    # Initial message to start the interaction
     session.conversation.item.create(
         llm.ChatMessage(
             role="user",
-            content="Please begin the interaction with the user in a manner consistent with your instructions.",
+            content="Hello!",
        )
    )
    session.response.create()


def main(livekit_url):
    # Workers have to be run as CLIs right now.
-    # So we need to simualte running "[this file] dev"
+    # So we need to simulate running "[this file] dev"
    # Modify sys.argv to set the path to this file as the first argument
    # and 'dev' as the second argument