add async-interpreter

pull/279/head
Ben Xu 7 months ago
parent c35d4c08f4
commit 10681b552f

software/poetry.lock generated

File diff suppressed because one or more lines are too long

@ -28,12 +28,23 @@ psutil = "^5.9.8"
typer = "^0.9.0"
platformdirs = "^4.2.0"
rich = "^13.7.1"
open-interpreter = {extras = ["os"], version = "^0.2.4"}
dateparser = "^1.2.0"
pytimeparse = "^1.1.8"
python-crontab = "^3.0.0"
inquirer = "^3.2.4"
pyqrcode = "^1.2.1"
realtimestt = "^0.1.12"
realtimetts = "^0.3.44"
keyboard = "^0.13.5"
pyautogui = "^0.9.54"
ctranslate2 = "4.1.0"
py3-tts = "^3.5"
elevenlabs = "0.2.27"
groq = "^0.5.0"
open-interpreter = "^0.2.5"
litellm = "1.35.35"
openai = "1.13.3"
pywebview = "*"
pyobjc = "*"
[build-system]
requires = ["poetry-core"]

@ -60,12 +60,18 @@ CAMERA_WARMUP_SECONDS = float(os.getenv("CAMERA_WARMUP_SECONDS", 0))
# Specify OS
current_platform = get_system_info()
def is_win11():
return sys.getwindowsversion().build >= 22000
def is_win10():
try:
return platform.system() == "Windows" and "10" in platform.version() and not is_win11()
return (
platform.system() == "Windows"
and "10" in platform.version()
and not is_win11()
)
except:
return False
@ -267,19 +273,18 @@ class Device:
def on_press(self, key):
"""Detect spacebar press and Ctrl+C combination."""
self.pressed_keys.add(key) # Add the pressed key to the set
if keyboard.Key.space in self.pressed_keys:
self.toggle_recording(True)
elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char('c')} <= self.pressed_keys:
elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
logger.info("Ctrl+C pressed. Exiting...")
kill_process_tree()
os._exit(0)
# Windows alternative to the above
if key == keyboard.Key.ctrl_l:
self.ctrl_pressed = True
try:
if key.vk == 67 and self.ctrl_pressed:
logger.info("Ctrl+C pressed. Exiting...")
@ -289,17 +294,17 @@ class Device:
except:
pass
def on_release(self, key):
"""Detect spacebar release and 'c' key press for camera, and handle key release."""
self.pressed_keys.discard(key) # Remove the released key from the key press tracking set
self.pressed_keys.discard(
key
) # Remove the released key from the key press tracking set
if key == keyboard.Key.ctrl_l:
self.ctrl_pressed = False
if key == keyboard.Key.space:
self.toggle_recording(False)
elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char('c'):
elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"):
self.fetch_image_from_camera()
async def message_sender(self, websocket):
@ -307,14 +312,18 @@ class Device:
message = await asyncio.get_event_loop().run_in_executor(
None, send_queue.get
)
if isinstance(message, bytes):
await websocket.send(message)
else:
await websocket.send(json.dumps(message))
send_queue.task_done()
await asyncio.sleep(0.01)
async def websocket_communication(self, WS_URL):
print("websocket communication was called!!!!")
show_connection_log = True
async def exec_ws_communication(websocket):
@ -331,8 +340,8 @@ class Device:
await asyncio.sleep(0.01)
chunk = await websocket.recv()
logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
# logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
print((f"Got this message from the server: {type(chunk)} {chunk}"))
if type(chunk) == str:
chunk = json.loads(chunk)
@ -369,7 +378,7 @@ class Device:
code = message["content"]
result = interpreter.computer.run(language, code)
send_queue.put(result)
if is_win10():
logger.info("Windows 10 detected")
# Workaround for Windows 10 not latching to the websocket server.
@ -380,20 +389,22 @@ class Device:
except Exception as e:
logger.error(f"Error while attempting to connect: {e}")
else:
print("websocket url is", WS_URL)
while True:
try:
async with websockets.connect(WS_URL) as websocket:
await exec_ws_communication(websocket)
except:
logger.debug(traceback.format_exc())
logger.info(traceback.format_exc())
if show_connection_log:
logger.info(f"Connecting to `{WS_URL}`...")
show_connection_log = False
await asyncio.sleep(2)
async def start_async(self):
print("start async was called!!!!!")
# Configuration for WebSocket
WS_URL = f"ws://{self.server_url}"
WS_URL = f"ws://{self.server_url}/ws"
# Start the WebSocket communication
asyncio.create_task(self.websocket_communication(WS_URL))
@ -430,8 +441,10 @@ class Device:
on_press=self.on_press, on_release=self.on_release
)
listener.start()
print("listener for keyboard started!!!!!")
def start(self):
print("device was started!!!!!!")
if os.getenv("TEACH_MODE") != "True":
asyncio.run(self.start_async())
p.terminate()

@ -0,0 +1,119 @@
import asyncio
import traceback
import json
from fastapi import FastAPI, WebSocket, Header
from uvicorn import Config, Server
from interpreter import interpreter as base_interpreter
from .async_interpreter import AsyncInterpreter
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict, Any
from openai import OpenAI
from pydantic import BaseModel
import argparse
import os
os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"
# Parse command line arguments for port number
parser = argparse.ArgumentParser(description="FastAPI server.")
parser.add_argument("--port", type=int, default=8000, help="Port to run on.")
args = parser.parse_args()
base_interpreter.tts = "openai"
base_interpreter.llm.model = "gpt-4-turbo"
async def main():
interpreter = AsyncInterpreter(base_interpreter)
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"], # Allow all methods (GET, POST, etc.)
allow_headers=["*"], # Allow all headers
)
@app.post("/load_chat")
async def load_chat(messages: List[Dict[str, Any]]):
interpreter.interpreter.messages = messages
interpreter.active_chat_messages = messages
print("🪼🪼🪼🪼🪼🪼 Messages loaded: ", interpreter.active_chat_messages)
return {"status": "success"}
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
await websocket.accept()
try:
async def receive_input():
while True:
data = await websocket.receive()
if isinstance(data, bytes):
await interpreter.input(data)
elif "bytes" in data:
await interpreter.input(data["bytes"])
print("SERVER FEEDING AUDIO")
elif "text" in data:
print("RECEIVED INPUT", data)
await interpreter.input(data["text"])
async def send_output():
while True:
output = await interpreter.output()
if isinstance(output, bytes):
await websocket.send_bytes(output)
# we don't send out bytes right now, no TTS
pass
elif isinstance(output, dict):
await websocket.send_text(json.dumps(output))
await asyncio.gather(receive_input(), send_output())
except Exception as e:
print(f"WebSocket connection closed with exception: {e}")
traceback.print_exc()
finally:
await websocket.close()
config = Config(app, host="0.0.0.0", port=args.port, lifespan="on")  # honor the --port argument parsed above
server = Server(config)
await server.serve()
class Rename(BaseModel):
input: str
@app.post("/rename-chat")
async def rename_chat(body_content: Rename, x_api_key: str = Header(None)):
print("RENAME CHAT REQUEST in PY 🌙🌙🌙🌙")
input_value = body_content.input
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=x_api_key,
)
try:
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"Given the following chat snippet, create a unique and descriptive title in less than 8 words. Your answer must not be related to customer service.\n\n{input_value}",
}
],
temperature=0.3,
stream=False,
)
print(response)
completion = response.choices[0].message.content  # openai>=1.x returns response objects, not dicts
return {"data": {"content": completion}}
except Exception as e:
print(f"Error: {e}")
traceback.print_exc()
return {"error": str(e)}
if __name__ == "__main__":
asyncio.run(main())
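
For reference, a minimal sketch of calling the two HTTP endpoints above from Python. It assumes the server is running locally on its default port 8000, that the `requests` package is available, and that chat messages follow the streaming LMC shape documented in the async interpreter module; the API key value is a placeholder.

import requests

# Seed the interpreter with an existing chat history (POST /load_chat).
requests.post(
    "http://localhost:8000/load_chat",
    json=[{"role": "user", "type": "message", "content": "hi"}],
)

# Ask for a short chat title (POST /rename-chat); the OpenAI key travels in a header.
resp = requests.post(
    "http://localhost:8000/rename-chat",
    json={"input": "user: hi\nassistant: hello!"},
    headers={"x-api-key": "sk-..."},  # placeholder key
)
print(resp.json())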

@ -0,0 +1,211 @@
# This is a websocket interpreter, TTS and STT disabled.
# It makes a websocket on port 8000 that sends/receives LMC messages in *streaming* format.
### You MUST send a start and end flag with each message! For example: ###
"""
{"role": "user", "type": "message", "start": True})
{"role": "user", "type": "message", "content": "hi"})
{"role": "user", "type": "message", "end": True})
"""
###
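# A minimal client sketch (not a definitive implementation): it assumes the FastAPI /ws
# endpoint added in this commit is listening on localhost:8000 and uses the same
# `websockets` package the device client uses. Framing follows the start/content/end flags above.
"""
import asyncio, json, websockets

async def say_hi():
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        # Each user turn is wrapped in a start flag, content chunks, and an end flag.
        await ws.send(json.dumps({"role": "user", "type": "message", "start": True}))
        await ws.send(json.dumps({"role": "user", "type": "message", "content": "hi"}))
        await ws.send(json.dumps({"role": "user", "type": "message", "end": True}))
        print(await ws.recv())  # first streamed LMC chunk from the assistant

asyncio.run(say_hi())
"""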
from pynput import keyboard
from RealtimeTTS import TextToAudioStream, OpenAIEngine, CoquiEngine
from RealtimeSTT import AudioToTextRecorder
import time
import asyncio
import json
class AsyncInterpreter:
def __init__(self, interpreter):
self.interpreter = interpreter
# STT
self.stt = AudioToTextRecorder(use_microphone=False)
self.stt.stop() # It needs this for some reason
# TTS
if self.interpreter.tts == "coqui":
engine = CoquiEngine()
elif self.interpreter.tts == "openai":
engine = OpenAIEngine()
self.tts = TextToAudioStream(engine)
self.active_chat_messages = []
self._input_queue = asyncio.Queue() # Queue that .input will shove things into
self._output_queue = asyncio.Queue() # Queue to put output chunks into
self._last_lmc_start_flag = None # Unix time of last LMC start flag received
self._in_keyboard_write_block = (
False # Tracks whether interpreter is trying to use the keyboard
)
self.loop = asyncio.get_event_loop()
async def _add_to_queue(self, queue, item):
await queue.put(item)
async def clear_queue(self, queue):
while not queue.empty():
await queue.get()
async def clear_input_queue(self):
await self.clear_queue(self._input_queue)
async def clear_output_queue(self):
await self.clear_queue(self._output_queue)
async def input(self, chunk):
"""
Expects a chunk in streaming LMC format.
"""
if isinstance(chunk, bytes):
# It's probably a chunk of audio
self.stt.feed_audio(chunk)
print("INTERPRETER FEEDING AUDIO")
else:
try:
chunk = json.loads(chunk)
except:
pass
if "start" in chunk:
print("input received")
self.stt.start()
self._last_lmc_start_flag = time.time()
# self.interpreter.computer.terminal.stop() # Stop any code execution... maybe we should make interpreter.stop()?
elif "end" in chunk:
print("running oi on input now")
asyncio.create_task(self.run())
else:
await self._add_to_queue(self._input_queue, chunk)
def add_to_output_queue_sync(self, chunk):
"""
Synchronous function to add a chunk to the output queue.
"""
print("ADDING TO QUEUE:", chunk)
asyncio.create_task(self._add_to_queue(self._output_queue, chunk))
async def run(self):
"""
Runs OI on the audio bytes submitted to the input. Will add streaming LMC chunks to the _output_queue.
"""
self.interpreter.messages = self.active_chat_messages
# self.beeper.start()
self.stt.stop()
# message = self.stt.text()
# print("THE MESSAGE:", message)
# accumulates the input queue message
input_queue = []
while not self._input_queue.empty():
input_queue.append(self._input_queue.get_nowait())  # get_nowait: .get() is a coroutine and would never be awaited here
print("INPUT QUEUE:", input_queue)
# message = [i for i in input_queue if i["type"] == "message"][0]["content"]
# message = self.stt.text()
message = "hello"
print(message)
# print(message)
def generate(message):
last_lmc_start_flag = self._last_lmc_start_flag
self.interpreter.messages = self.active_chat_messages
print(
"🍀🍀🍀🍀GENERATING, using these messages: ", self.interpreter.messages
)
print("🍀 🍀 🍀 🍀 active_chat_messages: ", self.active_chat_messages)
print("message is", message)
for chunk in self.interpreter.chat(message, display=True, stream=True):
if self._last_lmc_start_flag != last_lmc_start_flag:
# self.beeper.stop()
break
# self.add_to_output_queue_sync(chunk) # To send text, not just audio
content = chunk.get("content")
# Handle message blocks
if chunk.get("type") == "message":
if content:
# self.beeper.stop()
# Experimental: The AI voice sounds better with replacements like these, but it should happen at the TTS layer
# content = content.replace(". ", ". ... ").replace(", ", ", ... ").replace("!", "! ... ").replace("?", "? ... ")
yield content
# Handle code blocks
elif chunk.get("type") == "code":
if "start" in chunk:
# self.beeper.start()
pass
# Experimental: If the AI wants to type, we should type immediately
if (
self.interpreter.messages[-1]
.get("content", "")
.startswith("computer.keyboard.write(")
):
keyboard.controller.type(content)
self._in_keyboard_write_block = True
if "end" in chunk and self._in_keyboard_write_block:
self._in_keyboard_write_block = False
# (This will make it so it doesn't type twice when the block executes)
if self.interpreter.messages[-1]["content"].startswith(
"computer.keyboard.write("
):
self.interpreter.messages[-1]["content"] = (
"dummy_variable = ("
+ self.interpreter.messages[-1]["content"][
len("computer.keyboard.write(") :
]
)
# Send a completion signal
# self.add_to_output_queue_sync({"role": "server","type": "completion", "content": "DONE"})
# Feed generate to RealtimeTTS
self.add_to_output_queue_sync(
{"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
)
self.tts.feed(generate(message))
self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=True)
while True:
if self.tts.is_playing():
break
await asyncio.sleep(0.1)
while True:
await asyncio.sleep(0.1)
print("is_playing", self.tts.is_playing())
if not self.tts.is_playing():
self.add_to_output_queue_sync(
{
"role": "assistant",
"type": "audio",
"format": "bytes.wav",
"end": True,
}
)
break
async def _on_tts_chunk_async(self, chunk):
print("SENDING TTS CHUNK")
await self._add_to_queue(self._output_queue, chunk)
def on_tts_chunk(self, chunk):
asyncio.run(self._on_tts_chunk_async(chunk))
async def output(self):
return await self._output_queue.get()
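
A sketch of how a client might consume the stream that run() produces, assuming the chunks are relayed unchanged by the /ws endpoint: an LMC dict with "start": True opens an audio reply, raw TTS bytes follow, and the matching "end": True dict closes it. Names below are illustrative.

import json

async def receive_audio(ws):
    # ws: an open websocket connection to /ws (e.g. from the `websockets` package).
    audio_buffer = bytearray()
    while True:
        chunk = await ws.recv()
        if isinstance(chunk, bytes):
            audio_buffer.extend(chunk)  # raw TTS audio between the start/end flags
            continue
        msg = json.loads(chunk)
        if msg.get("type") == "audio" and msg.get("start"):
            audio_buffer.clear()        # a new assistant utterance begins
        elif msg.get("type") == "audio" and msg.get("end"):
            return bytes(audio_buffer)  # one complete bytes.wav payload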

@ -5,7 +5,7 @@ import threading
import os
import importlib
from source.server.tunnel import create_tunnel
from source.server.server import main
from source.server.ai_server import main
from source.server.utils.local_mode import select_local_model
import signal
@ -22,7 +22,7 @@ def run(
help="Specify the server host where the server will deploy",
),
server_port: int = typer.Option(
10001,
8000,
"--server-port",
help="Specify the server port where the server will deploy",
),
@ -103,7 +103,7 @@ def run(
def _run(
server: bool = False,
server_host: str = "0.0.0.0",
server_port: int = 10001,
server_port: int = 8000,
tunnel_service: str = "bore",
expose: bool = False,
client: bool = False,
@ -127,7 +127,7 @@ def _run(
# llm_service = "llamafile"
stt_service = "local-whisper"
select_local_model()
system_type = platform.system()
if system_type == "Windows":
server_host = "localhost"
@ -138,8 +138,6 @@ def _run(
if not server and not client:
server = True
client = True
def handle_exit(signum, frame):
os._exit(0)
@ -154,18 +152,18 @@ def _run(
target=loop.run_until_complete,
args=(
main(
server_host,
server_port,
llm_service,
model,
llm_supports_vision,
llm_supports_functions,
context_window,
max_tokens,
temperature,
tts_service,
stt_service,
mobile,
# server_host,
# server_port,
# llm_service,
# model,
# llm_supports_vision,
# llm_supports_functions,
# context_window,
# max_tokens,
# temperature,
# tts_service,
# stt_service,
# mobile,
),
),
)
@ -182,6 +180,7 @@ def _run(
system_type = platform.system()
if system_type == "Darwin": # Mac OS
client_type = "mac"
print("initiating mac device with base device!!!")
elif system_type == "Windows": # Windows System
client_type = "windows"
elif system_type == "Linux": # Linux System
@ -197,7 +196,9 @@ def _run(
module = importlib.import_module(
f".clients.{client_type}.device", package="source"
)
server_url = "0.0.0.0:8000"
client_thread = threading.Thread(target=module.main, args=[server_url])
print("client thread started")
client_thread.start()
try:
