diff --git a/software/main.py b/software/main.py
index f393dd3..50c7dae 100644
--- a/software/main.py
+++ b/software/main.py
@@ -104,6 +104,13 @@ def run(
         print(f"Invalid profile path: {profile}")
         exit(1)
 
+    # Load the profile module from the provided path
+    spec = importlib.util.spec_from_file_location("profile", profile)
+    profile_module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(profile_module)
+
+    # Get the interpreter from the profile
+    interpreter = profile_module.interpreter
 
     ### SERVER
 
@@ -135,7 +142,7 @@ def run(
             args=(
                 light_server_host,
                 light_server_port,
-                profile,
+                interpreter,
                 voice,
                 debug
             ),
@@ -249,6 +256,8 @@ def run(
         # These are needed to communicate with the worker's entrypoint
         os.environ['INTERPRETER_SERVER_HOST'] = light_server_host
         os.environ['INTERPRETER_SERVER_PORT'] = str(light_server_port)
+        os.environ['01_TTS'] = interpreter.tts
+        os.environ['01_STT'] = interpreter.stt
 
         token = str(api.AccessToken('devkey', 'secret') \
             .with_identity("identity") \
@@ -259,6 +268,8 @@ def run(
             )).to_jwt())
 
         meet_url = f'https://meet.livekit.io/custom?liveKitUrl={url.replace("http", "ws")}&token={token}\n\n'
+        print("\n")
+        print("For debugging, you can join a video call with your assistant. Click the link below, then send a chat message that says {CONTEXT_MODE_OFF}, then begin speaking:")
         print(meet_url)
 
         for attempt in range(30):
diff --git a/software/source/server/livekit/worker.py b/software/source/server/livekit/worker.py
index 088c570..5b76399 100644
--- a/software/source/server/livekit/worker.py
+++ b/software/source/server/livekit/worker.py
@@ -72,12 +72,29 @@ async def entrypoint(ctx: JobContext):
         model="open-interpreter", base_url=base_url, api_key="x"
     )
 
+    tts_provider = os.getenv('01_TTS', '').lower()
+    stt_provider = os.getenv('01_STT', '').lower()
+
+    # Add plugins here
+    if tts_provider == 'openai':
+        tts = openai.TTS()
+    elif tts_provider == 'elevenlabs':
+        tts = elevenlabs.TTS()
+    elif tts_provider == 'cartesia':
+        raise NotImplementedError("Cartesia TTS is not supported yet") # TODO: import the plugin and support this
+    else:
+        raise ValueError(f"Unsupported TTS provider: {tts_provider}. Please set the 01_TTS environment variable to 'openai' or 'elevenlabs'.")
+
+    if stt_provider == 'deepgram':
+        stt = deepgram.STT()
+    else:
+        raise ValueError(f"Unsupported STT provider: {stt_provider}. Please set the 01_STT environment variable to 'deepgram'.")
+
     assistant = VoiceAssistant(
         vad=silero.VAD.load(), # Voice Activity Detection
-        stt=deepgram.STT(), # Speech-to-Text
+        stt=stt, # Speech-to-Text
         llm=open_interpreter, # Language Model
-        tts=elevenlabs.TTS(), # Text-to-Speech
-        #tts=openai.TTS(), # Text-to-Speech
+        tts=tts, # Text-to-Speech
         chat_ctx=initial_ctx, # Chat history context
     )
 
diff --git a/software/source/server/profiles/archive_default.py b/software/source/server/profiles/archive_default.py
deleted file mode 100644
index 58d9bb4..0000000
--- a/software/source/server/profiles/archive_default.py
+++ /dev/null
@@ -1,175 +0,0 @@
-from interpreter import AsyncInterpreter
-interpreter = AsyncInterpreter()
-
-# This is an Open Interpreter compatible profile.
-# Visit https://01.openinterpreter.com/profile for all options.
- -# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers -# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"} -interpreter.tts = "openai" - -# Connect your 01 to a language model -interpreter.llm.model = "gpt-4o" -interpreter.llm.context_window = 100000 -interpreter.llm.max_tokens = 4096 -# interpreter.llm.api_key = "" - -# Tell your 01 where to find and save skills -interpreter.computer.skills.path = "./skills" - -# Extra settings -interpreter.computer.import_computer_api = True -interpreter.computer.import_skills = True -interpreter.computer.run("python", "computer") # This will trigger those imports -interpreter.auto_run = True -# interpreter.loop = True -# interpreter.loop_message = """Proceed with what you were doing (this is not confirmation, if you just asked me something). You CAN run code on my machine. If you want to run code, start your message with "```"! If the entire task is done, say exactly 'The task is done.' If you need some specific information (like username, message text, skill name, skill step, etc.) say EXACTLY 'Please provide more information.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going. CRITICAL: REMEMBER TO FOLLOW ALL PREVIOUS INSTRUCTIONS. If I'm teaching you something, remember to run the related `computer.skills.new_skill` function.""" -# interpreter.loop_breakers = [ -# "The task is done.", -# "The task is impossible.", -# "Let me know what you'd like to do next.", -# "Please provide more information.", -# ] - -# Set the identity and personality of your 01 -interpreter.system_message = """ - -You are the 01, a screenless executive assistant that can complete any task. -When you execute code, it will be executed on the user's machine. The user has given you full and complete permission to execute any code necessary to complete the task. -Run any code to achieve the goal, and if at first you don't succeed, try again and again. -You can install new packages. -Be concise. Your messages are being read aloud to the user. DO NOT MAKE PLANS. RUN CODE QUICKLY. -Try to spread complex tasks over multiple code blocks. Don't try to complex tasks in one go. -Manually summarize text. -Prefer using Python. - -DON'T TELL THE USER THE METHOD YOU'LL USE, OR MAKE PLANS. QUICKLY respond with something like "On it." then execute the function, then tell the user if the task has been completed. - -Act like you can just answer any question, then run code (this is hidden from the user) to answer it. -THE USER CANNOT SEE CODE BLOCKS. -Your responses should be very short, no more than 1-2 sentences long. -DO NOT USE MARKDOWN. ONLY WRITE PLAIN TEXT. - -# THE COMPUTER API - -The `computer` module is ALREADY IMPORTED, and can be used for some tasks: - -```python -result_string = computer.browser.search(query) # Google search results will be returned from this function as a string -computer.files.edit(path_to_file, original_text, replacement_text) # Edit a file -computer.calendar.create_event(title="Meeting", start_date=datetime.datetime.now(), end_date=datetime.datetime.now() + datetime.timedelta(hours=1), notes="Note", location="") # Creates a calendar event -events_string = computer.calendar.get_events(start_date=datetime.date.today(), end_date=None) # Get events between dates. 
If end_date is None, only gets events for start_date -computer.calendar.delete_event(event_title="Meeting", start_date=datetime.datetime) # Delete a specific event with a matching title and start date, you may need to get use get_events() to find the specific event object first -phone_string = computer.contacts.get_phone_number("John Doe") -contact_string = computer.contacts.get_email_address("John Doe") -computer.mail.send("john@email.com", "Meeting Reminder", "Reminder that our meeting is at 3pm today.", ["path/to/attachment.pdf", "path/to/attachment2.pdf"]) # Send an email with a optional attachments -emails_string = computer.mail.get(4, unread=True) # Returns the {number} of unread emails, or all emails if False is passed -unread_num = computer.mail.unread_count() # Returns the number of unread emails -computer.sms.send("555-123-4567", "Hello from the computer!") # Send a text message. MUST be a phone number, so use computer.contacts.get_phone_number frequently here -``` - -Do not import the computer module, or any of its sub-modules. They are already imported. - -DO NOT use the computer module for ALL tasks. Many tasks can be accomplished via Python, or by pip installing new libraries. Be creative! - -# GUI CONTROL (RARE) - -You are a computer controlling language model. You can control the user's GUI. -You may use the `computer` module to control the user's keyboard and mouse, if the task **requires** it: - -```python -computer.display.view() # Shows you what's on the screen. **You almost always want to do this first!** -computer.keyboard.hotkey(" ", "command") # Opens spotlight -computer.keyboard.write("hello") -computer.mouse.click("text onscreen") # This clicks on the UI element with that text. Use this **frequently** and get creative! To click a video, you could pass the *timestamp* (which is usually written on the thumbnail) into this. -computer.mouse.move("open recent >") # This moves the mouse over the UI element with that text. Many dropdowns will disappear if you click them. You have to hover over items to reveal more. -computer.mouse.click(x=500, y=500) # Use this very, very rarely. It's highly inaccurate -computer.mouse.click(icon="gear icon") # Moves mouse to the icon with that description. Use this very often -computer.mouse.scroll(-10) # Scrolls down. If you don't find some text on screen that you expected to be there, you probably want to do this -``` - -You are an image-based AI, you can see images. -Clicking text is the most reliable way to use the mouse— for example, clicking a URL's text you see in the URL bar, or some textarea's placeholder text (like "Search" to get into a search bar). -If you use `plt.show()`, the resulting image will be sent to you. However, if you use `PIL.Image.show()`, the resulting image will NOT be sent to you. -It is very important to make sure you are focused on the right application and window. Often, your first command should always be to explicitly switch to the correct application. On Macs, ALWAYS use Spotlight to switch applications. -If you want to search specific sites like amazon or youtube, use query parameters. For example, https://www.amazon.com/s?k=monitor or https://www.youtube.com/results?search_query=tatsuro+yamashita. - -# SKILLS - -Try to use the following special functions (or "skills") to complete your goals whenever possible. -THESE ARE ALREADY IMPORTED. YOU CAN CALL THEM INSTANTLY. 
- ---- -{{ -import sys -import os -import json -import ast - -directory = "./skills" - -def get_function_info(file_path): - with open(file_path, "r") as file: - tree = ast.parse(file.read()) - functions = [node for node in tree.body if isinstance(node, ast.FunctionDef)] - for function in functions: - docstring = ast.get_docstring(function) - args = [arg.arg for arg in function.args.args] - print(f"Function Name: {function.name}") - print(f"Arguments: {args}") - print(f"Docstring: {docstring}") - print("---") - -files = os.listdir(directory) -for file in files: - if file.endswith(".py"): - file_path = os.path.join(directory, file) - get_function_info(file_path) -}} - -YOU can add to the above list of skills by defining a python function. The function will be saved as a skill. -Search all existing skills by running `computer.skills.search(query)`. - -**Teach Mode** - -If the USER says they want to teach you something, exactly write the following, including the markdown code block: - ---- -One moment. -```python -computer.skills.new_skill.create() -``` ---- - -If you decide to make a skill yourself to help the user, simply define a python function. `computer.skills.new_skill.create()` is for user-described skills. - -# USE COMMENTS TO PLAN - -IF YOU NEED TO THINK ABOUT A PROBLEM: (such as "Here's the plan:"), WRITE IT IN THE COMMENTS of the code block! - ---- -User: What is 432/7? -Assistant: Let me think about that. -```python -# Here's the plan: -# 1. Divide the numbers -# 2. Round to 3 digits -print(round(432/7, 3)) -``` -```output -61.714 -``` -The answer is 61.714. ---- - -# MANUAL TASKS - -Translate things to other languages INSTANTLY and MANUALLY. Don't ever try to use a translation tool. -Summarize things manually. DO NOT use a summarizer tool. - -# CRITICAL NOTES - -Code output, despite being sent to you by the user, cannot be seen by the user. You NEED to tell the user about the output of some code, even if it's exact. >>The user does not have a screen.<< -ALWAYS REMEMBER: You are running on a device called the O1, where the interface is entirely speech-based. Make your responses to the user VERY short. DO NOT PLAN. BE CONCISE. WRITE CODE TO RUN IT. -Try multiple methods before saying the task is impossible. **You can do it!** -""".strip() \ No newline at end of file diff --git a/software/source/server/profiles/default.py b/software/source/server/profiles/default.py index b6dd688..66451b1 100644 --- a/software/source/server/profiles/default.py +++ b/software/source/server/profiles/default.py @@ -1,19 +1,14 @@ from interpreter import AsyncInterpreter interpreter = AsyncInterpreter() -# This is an Open Interpreter compatible profile. -# Visit https://01.openinterpreter.com/profile for all options. - -# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers -# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"} -interpreter.tts = "openai" +interpreter.tts = "openai" # This should be cartesia once we support it +interpreter.stt = "deepgram" # This is only used for the livekit server. 
The light server runs faster-whisper locally
 
 # Connect your 01 to a language model
 interpreter.llm.model = "claude-3.5"
 # interpreter.llm.model = "gpt-4o-mini"
 interpreter.llm.context_window = 100000
 interpreter.llm.max_tokens = 4096
-# interpreter.llm.api_key = ""
 
 # Tell your 01 where to find and save skills
 skill_path = "./skills"
diff --git a/software/source/server/profiles/fast.py b/software/source/server/profiles/fast.py
index f68cb5b..d0b6bf7 100644
--- a/software/source/server/profiles/fast.py
+++ b/software/source/server/profiles/fast.py
@@ -1,12 +1,8 @@
 from interpreter import AsyncInterpreter
 interpreter = AsyncInterpreter()
 
-# This is an Open Interpreter compatible profile.
-# Visit https://01.openinterpreter.com/profile for all options.
-
-# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
-# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
-interpreter.tts = "elevenlabs"
+interpreter.tts = "elevenlabs" # This should be cartesia once we support it
+interpreter.stt = "deepgram" # This is only used for the livekit server. The light server runs faster-whisper locally
 
 interpreter.llm.model = "gpt-4o-mini"
 interpreter.llm.supports_vision = True
diff --git a/software/source/server/profiles/local.py b/software/source/server/profiles/local.py
index 3a93e8d..afee5eb 100644
--- a/software/source/server/profiles/local.py
+++ b/software/source/server/profiles/local.py
@@ -1,9 +1,10 @@
 from interpreter import AsyncInterpreter
 interpreter = AsyncInterpreter()
 
-# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
-# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
+print("Warning: Local doesn't work with --server livekit. It only works with --server light. We will support local livekit usage soon!")
+
 interpreter.tts = "coqui"
+interpreter.stt = "faster-whisper" # This isn't actually used, as the light server always uses faster-whisper!
 
 interpreter.system_message = """You are an AI assistant that writes markdown code snippets to answer the user's request. You speak very concisely and quickly, you say nothing irrelevant to the user's request. For example:
diff --git a/software/source/server/server.py b/software/source/server/server.py
index 326952e..ee2d91c 100644
--- a/software/source/server/server.py
+++ b/software/source/server/server.py
@@ -12,20 +12,14 @@ import os
 os.environ["INTERPRETER_REQUIRE_ACKNOWLEDGE"] = "False"
 os.environ["INTERPRETER_REQUIRE_AUTH"] = "False"
 
-def start_server(server_host, server_port, profile, voice, debug):
-
-    # Load the profile module from the provided path
-    spec = importlib.util.spec_from_file_location("profile", profile)
-    profile_module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(profile_module)
-
-    # Get the interpreter from the profile
-    interpreter = profile_module.interpreter
+def start_server(server_host, server_port, interpreter, voice, debug):
 
     # Apply our settings to it
     interpreter.verbose = debug
     interpreter.server.host = server_host
     interpreter.server.port = server_port
+    # interpreter.context_mode = False # Don't require a {START} message; respond to every message
+    interpreter.context_mode = True # Require a {START} message to respond
 
     if voice == False:
         # If voice is False, just start the standard OI server
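
Note for reviewers: with this change the profile module is imported exactly once in `main.py`, and the resulting `interpreter` object carries the TTS/STT choice to the LiveKit worker through the `01_TTS` and `01_STT` environment variables. Below is a minimal, standalone sketch of that wiring; the helper name `load_interpreter` and the example profile path are illustrative only and not part of the diff.

```python
import importlib.util
import os

def load_interpreter(profile_path: str):
    """Import a profile file and return the `interpreter` object it defines."""
    spec = importlib.util.spec_from_file_location("profile", profile_path)
    profile_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(profile_module)  # runs the profile script top to bottom
    return profile_module.interpreter        # every profile is expected to define `interpreter`

# Example wiring, mirroring main.py (paths and values depend on the repo checkout):
# interpreter = load_interpreter("software/source/server/profiles/default.py")
# os.environ["01_TTS"] = interpreter.tts   # e.g. "openai" or "elevenlabs"
# os.environ["01_STT"] = interpreter.stt   # e.g. "deepgram"
```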