@@ -1,6 +1,7 @@
 from dotenv import load_dotenv
 load_dotenv() # take environment variables from .env.
 
+import traceback
 from platformdirs import user_data_dir
 import ast
 import json
@@ -183,136 +184,138 @@ async def send_messages(websocket: WebSocket):
 async def listener():
 
     while True:
-        while True:
-            if not from_user.empty():
-                chunk = await from_user.get()
-                break
-            elif not from_computer.empty():
-                chunk = from_computer.get()
-                break
-            await asyncio.sleep(1)
-
-        message = accumulator.accumulate(chunk)
-        if message == None:
-            # Will be None until we have a full message ready
-            continue
-
-        # print(str(message)[:1000])
-
-        # At this point, we have our message
-
-        if message["type"] == "audio" and message["format"].startswith("bytes"):
-            if not message["content"]: # If it was nothing / silence
-                continue
-
-            # Convert bytes to audio file
-            # Format will be bytes.wav or bytes.opus
-            mime_type = "audio/" + message["format"].split(".")[1]
-            audio_file_path = bytes_to_wav(message["content"], mime_type)
-
-            # For microphone debugging:
-            if False:
-                os.system(f"open {audio_file_path}")
-                import time
-                time.sleep(15)
-
-            text = stt(audio_file_path)
-            print("> ", text)
-            message = {"role": "user", "type": "message", "content": text}
-
-        # At this point, we have only text messages
-
-        if type(message["content"]) != str:
-            print("This should be a string, but it's not:", message["content"])
-            message["content"] = message["content"].decode()
-
-        # Custom stop message will halt us
-        if message["content"].lower().strip(".,! ") == "stop":
-            continue
-
-        # Load, append, and save conversation history
-        with open(conversation_history_path, 'r') as file:
-            messages = json.load(file)
-        messages.append(message)
-        with open(conversation_history_path, 'w') as file:
-            json.dump(messages, file, indent=4)
-
-        accumulated_text = ""
-
-        if any([m["type"] == "image" for m in messages]) and interpreter.llm.model.startswith("gpt-"):
-            interpreter.llm.model = "gpt-4-vision-preview"
-            interpreter.llm.supports_vision = True
-
-        for chunk in interpreter.chat(messages, stream=True, display=True):
-
-            if any([m["type"] == "image" for m in interpreter.messages]):
-                interpreter.llm.model = "gpt-4-vision-preview"
-
-            logger.debug("Got chunk:", chunk)
-
-            # Send it to the user
-            await to_device.put(chunk)
-            # Yield to the event loop, so you actually send it out
-            await asyncio.sleep(0.01)
-
-            if os.getenv('TTS_RUNNER') == "server":
-                # Speak full sentences out loud
-                if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
-                    accumulated_text += chunk["content"]
-                    sentences = split_into_sentences(accumulated_text)
-
-                    # If we're going to speak, say we're going to stop sending text.
-                    # This should be fixed probably, we should be able to do both in parallel, or only one.
-                    if any(is_full_sentence(sentence) for sentence in sentences):
-                        await to_device.put({"role": "assistant", "type": "message", "end": True})
-
-                    if is_full_sentence(sentences[-1]):
-                        for sentence in sentences:
-                            await stream_tts_to_device(sentence)
-                        accumulated_text = ""
-                    else:
-                        for sentence in sentences[:-1]:
-                            await stream_tts_to_device(sentence)
-                        accumulated_text = sentences[-1]
-
-                    # If we're going to speak, say we're going to stop sending text.
-                    # This should be fixed probably, we should be able to do both in parallel, or only one.
-                    if any(is_full_sentence(sentence) for sentence in sentences):
-                        await to_device.put({"role": "assistant", "type": "message", "start": True})
-
-            # If we have a new message, save our progress and go back to the top
-            if not from_user.empty():
-
-                # Check if it's just an end flag. We ignore those.
-                temp_message = await from_user.get()
-
-                if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
-                    # Yup. False alarm.
-                    continue
-                else:
-                    # Whoops! Put that back
-                    await from_user.put(temp_message)
-
-                with open(conversation_history_path, 'w') as file:
-                    json.dump(interpreter.messages, file, indent=4)
-
-                # TODO: is triggering seemingly randomly
-                #logger.info("New user message recieved. Breaking.")
-                #break
-
-            # Also check if there's any new computer messages
-            if not from_computer.empty():
-
-                with open(conversation_history_path, 'w') as file:
-                    json.dump(interpreter.messages, file, indent=4)
-
-                logger.info("New computer message recieved. Breaking.")
-                break
+        try:
+            while True:
+                if not from_user.empty():
+                    chunk = await from_user.get()
+                    break
+                elif not from_computer.empty():
+                    chunk = from_computer.get()
+                    break
+                await asyncio.sleep(1)
+
+            message = accumulator.accumulate(chunk)
+            if message == None:
+                # Will be None until we have a full message ready
+                continue
+
+            # print(str(message)[:1000])
+
+            # At this point, we have our message
+
+            if message["type"] == "audio" and message["format"].startswith("bytes"):
+                if "content" not in message or message["content"] == None: # If it was nothing / silence
+                    continue
+
+                # Convert bytes to audio file
+                # Format will be bytes.wav or bytes.opus
+                mime_type = "audio/" + message["format"].split(".")[1]
+                audio_file_path = bytes_to_wav(message["content"], mime_type)
+
+                # For microphone debugging:
+                if False:
+                    os.system(f"open {audio_file_path}")
+                    import time
+                    time.sleep(15)
+
+                text = stt(audio_file_path)
+                print("> ", text)
+                message = {"role": "user", "type": "message", "content": text}
+
+            # At this point, we have only text messages
+
+            if type(message["content"]) != str:
+                print("This should be a string, but it's not:", message["content"])
+                message["content"] = message["content"].decode()
+
+            # Custom stop message will halt us
+            if message["content"].lower().strip(".,! ") == "stop":
+                continue
+
+            # Load, append, and save conversation history
+            with open(conversation_history_path, 'r') as file:
+                messages = json.load(file)
+            messages.append(message)
+            with open(conversation_history_path, 'w') as file:
+                json.dump(messages, file, indent=4)
+
+            accumulated_text = ""
+
+            if any([m["type"] == "image" for m in messages]) and interpreter.llm.model.startswith("gpt-"):
+                interpreter.llm.model = "gpt-4-vision-preview"
+                interpreter.llm.supports_vision = True
+
+            for chunk in interpreter.chat(messages, stream=True, display=True):
+
+                if any([m["type"] == "image" for m in interpreter.messages]):
+                    interpreter.llm.model = "gpt-4-vision-preview"
+
+                logger.debug("Got chunk:", chunk)
+
+                # Send it to the user
+                await to_device.put(chunk)
+                # Yield to the event loop, so you actually send it out
+                await asyncio.sleep(0.01)
+
+                if os.getenv('TTS_RUNNER') == "server":
+                    # Speak full sentences out loud
+                    if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
+                        accumulated_text += chunk["content"]
+                        sentences = split_into_sentences(accumulated_text)
+
+                        # If we're going to speak, say we're going to stop sending text.
+                        # This should be fixed probably, we should be able to do both in parallel, or only one.
+                        if any(is_full_sentence(sentence) for sentence in sentences):
+                            await to_device.put({"role": "assistant", "type": "message", "end": True})
+
+                        if is_full_sentence(sentences[-1]):
+                            for sentence in sentences:
+                                await stream_tts_to_device(sentence)
+                            accumulated_text = ""
+                        else:
+                            for sentence in sentences[:-1]:
+                                await stream_tts_to_device(sentence)
+                            accumulated_text = sentences[-1]
+
+                        # If we're going to speak, say we're going to stop sending text.
+                        # This should be fixed probably, we should be able to do both in parallel, or only one.
+                        if any(is_full_sentence(sentence) for sentence in sentences):
+                            await to_device.put({"role": "assistant", "type": "message", "start": True})
+
+                # If we have a new message, save our progress and go back to the top
+                if not from_user.empty():
+
+                    # Check if it's just an end flag. We ignore those.
+                    temp_message = await from_user.get()
+
+                    if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
+                        # Yup. False alarm.
+                        continue
+                    else:
+                        # Whoops! Put that back
+                        await from_user.put(temp_message)
+
+                    with open(conversation_history_path, 'w') as file:
+                        json.dump(interpreter.messages, file, indent=4)
+
+                    # TODO: is triggering seemingly randomly
+                    #logger.info("New user message recieved. Breaking.")
+                    #break
+
+                # Also check if there's any new computer messages
+                if not from_computer.empty():
+
+                    with open(conversation_history_path, 'w') as file:
+                        json.dump(interpreter.messages, file, indent=4)
+
+                    logger.info("New computer message recieved. Breaking.")
+                    break
+        except:
+            traceback.print_exc()
 
 
 async def stream_tts_to_device(sentence):
     force_task_completion_responses = [
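The substantive changes in this hunk are small: the entire body of `listener()`'s outer loop now runs inside a `try`/`except` that calls `traceback.print_exc()`, so one malformed or unexpected chunk gets logged instead of killing the consumer loop, and the silence check is broadened from `if not message["content"]` to also treat a missing `"content"` key as silence. Below is a minimal sketch of that resilience pattern in isolation; `handle_one` and the queue contents are hypothetical stand-ins for the server's real accumulate/STT/`interpreter.chat()`/TTS pipeline.

```python
import asyncio
import traceback


async def handle_one(chunk):
    # Hypothetical stand-in for the real pipeline: accumulate the chunk,
    # transcribe audio, run the interpreter, stream TTS back out.
    if not isinstance(chunk, dict):
        raise TypeError(f"malformed chunk: {chunk!r}")
    print("handled:", chunk)


async def listener(queue):
    while True:
        chunk = await queue.get()
        try:
            await handle_one(chunk)
        except Exception:
            # Mirror the patch's `except:` + `traceback.print_exc()`:
            # log the failure and keep consuming instead of crashing.
            traceback.print_exc()


async def demo():
    q = asyncio.Queue()
    for chunk in ({"type": "message"}, "garbage", {"type": "audio"}):
        q.put_nowait(chunk)
    task = asyncio.create_task(listener(q))
    await asyncio.sleep(0.1)  # listener() never returns; let it drain, then stop
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        pass


asyncio.run(demo())
```

The TTS branch of the loop also relies on a pattern worth seeing on its own: streamed assistant text is accumulated, split into sentences, and only complete sentences are spoken, with any trailing fragment carried into the next chunk. A sketch under the same caveat, with naive stand-ins for the server's real `split_into_sentences` and `is_full_sentence` helpers:

```python
import re


def split_into_sentences(text):
    # Naive stand-in: split after terminal punctuation.
    return [s for s in re.split(r"(?<=[.!?])\s+", text) if s]


def is_full_sentence(sentence):
    # Naive stand-in: "full" means it ends with terminal punctuation.
    return sentence.rstrip().endswith((".", "!", "?"))


def speak_full_sentences(chunks):
    """Yield only complete sentences as streamed text chunks arrive."""
    accumulated_text = ""
    for chunk in chunks:
        accumulated_text += chunk
        sentences = split_into_sentences(accumulated_text)
        if not sentences:
            continue
        if is_full_sentence(sentences[-1]):
            yield from sentences              # everything is speakable
            accumulated_text = ""
        else:
            yield from sentences[:-1]         # speak all but the fragment
            accumulated_text = sentences[-1]  # carry the fragment forward


print(list(speak_full_sentences(["Hello the", "re. How are", " you? I am fi"])))
# -> ['Hello there.', 'How are you?']  ("I am fi" stays buffered)
```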