Use official OI server. 3 second latency.

11 months ago · 4640b4f1a0
parent fef311e5b3
commit 4640b4f1a0
12 changed files with 4676 additions and 2179 deletions
--- a/.gitignore
+++ b/.gitignore
@ -7,6 +7,7 @@ __pycache__/
 software/source/server/conversations/user.json
 software/source/server/tts/local_service/*
 software/source/server/stt/local_service/*
+software/models/*

 # C extensions
 *.so
--- a/software/poetry.lock
+++ b/software/poetry.lock
--- a/software/pyproject.toml
+++ b/software/pyproject.toml
@ -14,8 +14,6 @@ readme = "../README.md"
 python = ">=3.9,<3.12"
 pyaudio = "^0.2.14"
 pynput = "^1.7.6"
-fastapi = "^0.110.0"
-uvicorn = "^0.27.1"
 websockets = "^12.0"
 python-dotenv = "^1.0.1"
 ffmpeg-python = "^0.2.0"
@ -25,7 +23,6 @@ ngrok = "^1.0.0"
 simpleaudio = "^1.0.4"
 opencv-python = "^4.9.0.80"
 psutil = "^5.9.8"
-typer = "^0.9.0"
 platformdirs = "^4.2.0"
 rich = "^13.7.1"
 pytimeparse = "^1.1.8"
@ -33,23 +30,25 @@ python-crontab = "^3.0.0"
 inquirer = "^3.2.4"
 pyqrcode = "^1.2.1"
 realtimestt = "^0.1.16"
-realtimetts = "^0.4.1"
+realtimetts = { version = "^0.4.2", extras = ["all"] }
 keyboard = "^0.13.5"
 pyautogui = "^0.9.54"
 ctranslate2 = "4.1.0"
-py3-tts = "^3.5"
-elevenlabs = "1.2.2"
+#py3-tts = "^3.5"
+#elevenlabs = "1.2.2"
 groq = "^0.5.0"
-open-interpreter = {git = "https://github.com/OpenInterpreter/open-interpreter.git", extras = ["os"]}
+open-interpreter = {git = "https://github.com/OpenInterpreter/open-interpreter.git", branch = "development", extras = ["os", "server"]}
 litellm = "*"
 openai = "*"
 pywebview = "*"
 pyobjc = "*"
-
 sentry-sdk = "^2.4.0"
 plyer = "^2.1.0"
 pywinctl = "^0.3"
 certifi = "^2024.7.4"
+pygame = "^2.6.0"
+mpv = "^1.0.7"
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
--- a/software/source/clients/archive_base_device.py
+++ b/software/source/clients/archive_base_device.py
@ -0,0 +1,482 @@
+from dotenv import load_dotenv
+
+load_dotenv()  # take environment variables from .env.
+
+import subprocess
+import os
+import sys
+import asyncio
+import threading
+import pyaudio
+from pynput import keyboard
+import json
+import traceback
+import websockets
+import queue
+from pydub import AudioSegment
+from pydub.playback import play
+import time
+import wave
+import tempfile
+from datetime import datetime
+import cv2
+import base64
+import platform
+from interpreter import (
+    interpreter,
+)  # Just for code execution. Maybe we should let people do from interpreter.computer import run?
+
+# In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
+from ..server.utils.kernel import put_kernel_messages_into_queue
+from ..server.utils.get_system_info import get_system_info
+from ..server.utils.process_utils import kill_process_tree
+
+from ..server.utils.logs import setup_logging
+from ..server.utils.logs import logger
+
+setup_logging()
+
+os.environ["STT_RUNNER"] = "server"
+os.environ["TTS_RUNNER"] = "server"
+
+from ..utils.accumulator import Accumulator
+
+accumulator = Accumulator()
+
+# Configuration for Audio Recording
+CHUNK = 1024  # Record in chunks of 1024 samples
+FORMAT = pyaudio.paInt16  # 16 bits per sample
+CHANNELS = 1  # Mono
+RATE = 16000  # Sample rate
+RECORDING = False  # Flag to control recording state
+SPACEBAR_PRESSED = False  # Flag to track spacebar press state
+
+# Camera configuration
+CAMERA_ENABLED = os.getenv("CAMERA_ENABLED", False)
+if type(CAMERA_ENABLED) == str:
+    CAMERA_ENABLED = CAMERA_ENABLED.lower() == "true"
+CAMERA_DEVICE_INDEX = int(os.getenv("CAMERA_DEVICE_INDEX", 0))
+CAMERA_WARMUP_SECONDS = float(os.getenv("CAMERA_WARMUP_SECONDS", 0))
+
+# Specify OS
+current_platform = get_system_info()
+
+
+def is_win11():
+    return sys.getwindowsversion().build >= 22000
+
+
+def is_win10():
+    try:
+        return (
+            platform.system() == "Windows"
+            and "10" in platform.version()
+            and not is_win11()
+        )
+    except:
+        return False
+
+
+# Initialize PyAudio
+p = pyaudio.PyAudio()
+
+send_queue = queue.Queue()
+
+
+class Device:
+    def __init__(self):
+        self.pressed_keys = set()
+        self.captured_images = []
+        self.audiosegments = asyncio.Queue()
+        self.server_url = ""
+        self.ctrl_pressed = False
+        self.tts_service = ""
+        self.debug = False
+        self.playback_latency = None
+
+    def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
+        """Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
+        image_path = None
+
+        cap = cv2.VideoCapture(camera_index)
+        ret, frame = cap.read()  # Capture a single frame to initialize the camera
+
+        if CAMERA_WARMUP_SECONDS > 0:
+            # Allow camera to warm up, then snap a picture again
+            # This is a workaround for some cameras that don't return a properly exposed
+            # picture immediately when they are first turned on
+            time.sleep(CAMERA_WARMUP_SECONDS)
+            ret, frame = cap.read()
+
+        if ret:
+            temp_dir = tempfile.gettempdir()
+            image_path = os.path.join(
+                temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png"
+            )
+            self.captured_images.append(image_path)
+            cv2.imwrite(image_path, frame)
+            logger.info(f"Camera image captured to {image_path}")
+            logger.info(
+                f"You now have {len(self.captured_images)} images which will be sent along with your next audio message."
+            )
+        else:
+            logger.error(
+                f"Error: Couldn't capture an image from camera ({camera_index})"
+            )
+
+        cap.release()
+
+        return image_path
+
+    def encode_image_to_base64(self, image_path):
+        """Encodes an image file to a base64 string."""
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+    def add_image_to_send_queue(self, image_path):
+        """Encodes an image and adds an LMC message to the send queue with the image data."""
+        base64_image = self.encode_image_to_base64(image_path)
+        image_message = {
+            "role": "user",
+            "type": "image",
+            "format": "base64.png",
+            "content": base64_image,
+        }
+        send_queue.put(image_message)
+        # Delete the image file from the file system after sending it
+        os.remove(image_path)
+
+    def queue_all_captured_images(self):
+        """Queues all captured images to be sent."""
+        for image_path in self.captured_images:
+            self.add_image_to_send_queue(image_path)
+        self.captured_images.clear()  # Clear the list after sending
+
+    async def play_audiosegments(self):
+        """Plays them sequentially."""
+
+        if self.tts_service == "elevenlabs":
+            print("Ensure `mpv` in installed to use `elevenlabs`.\n\n(On macOSX, you can run `brew install mpv`.)")
+            mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"]
+            mpv_process = subprocess.Popen(
+                mpv_command,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+
+        while True:
+            try:
+                audio = await self.audiosegments.get()
+                if self.debug and self.playback_latency and isinstance(audio, bytes):
+                    elapsed_time = time.time() - self.playback_latency
+                    print(f"Time from request to playback: {elapsed_time} seconds")
+                    self.playback_latency = None
+
+                if self.tts_service == "elevenlabs":
+                    mpv_process.stdin.write(audio)  # type: ignore
+                    mpv_process.stdin.flush()  # type: ignore
+                else:
+                    play(audio)
+
+                await asyncio.sleep(0.1)
+            except asyncio.exceptions.CancelledError:
+                # This happens once at the start?
+                pass
+            except:
+                logger.info(traceback.format_exc())
+
+    def record_audio(self):
+        if os.getenv("STT_RUNNER") == "server":
+            # STT will happen on the server. we're sending audio.
+            send_queue.put(
+                {"role": "user", "type": "audio", "format": "bytes.wav", "start": True}
+            )
+        elif os.getenv("STT_RUNNER") == "client":
+            # STT will happen here, on the client. we're sending text.
+            send_queue.put({"role": "user", "type": "message", "start": True})
+        else:
+            raise Exception("STT_RUNNER must be set to either 'client' or 'server'.")
+
+        """Record audio from the microphone and add it to the queue."""
+        stream = p.open(
+            format=FORMAT,
+            channels=CHANNELS,
+            rate=RATE,
+            input=True,
+            frames_per_buffer=CHUNK,
+        )
+        print("Recording started...")
+        global RECORDING
+
+        # Create a temporary WAV file to store the audio data
+        temp_dir = tempfile.gettempdir()
+        wav_path = os.path.join(
+            temp_dir, f"audio_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav"
+        )
+        wav_file = wave.open(wav_path, "wb")
+        wav_file.setnchannels(CHANNELS)
+        wav_file.setsampwidth(p.get_sample_size(FORMAT))
+        wav_file.setframerate(RATE)
+
+        while RECORDING:
+            data = stream.read(CHUNK, exception_on_overflow=False)
+            wav_file.writeframes(data)
+
+        wav_file.close()
+        stream.stop_stream()
+        stream.close()
+        print("Recording stopped.")
+        if self.debug:
+            self.playback_latency = time.time()
+
+        duration = wav_file.getnframes() / RATE
+        if duration < 0.3:
+            # Just pressed it. Send stop message
+            if os.getenv("STT_RUNNER") == "client":
+                send_queue.put({"role": "user", "type": "message", "content": "stop"})
+                send_queue.put({"role": "user", "type": "message", "end": True})
+            else:
+                send_queue.put(
+                    {
+                        "role": "user",
+                        "type": "audio",
+                        "format": "bytes.wav",
+                        "content": "",
+                    }
+                )
+                send_queue.put(
+                    {
+                        "role": "user",
+                        "type": "audio",
+                        "format": "bytes.wav",
+                        "end": True,
+                    }
+                )
+        else:
+            self.queue_all_captured_images()
+
+            if os.getenv("STT_RUNNER") == "client":
+                # THIS DOES NOT WORK. We moved to this very cool stt_service, llm_service
+                # way of doing things. stt_wav is not a thing anymore. Needs work to work
+
+                # Run stt then send text
+                text = stt_wav(wav_path)
+                logger.debug(f"STT result: {text}")
+                send_queue.put({"role": "user", "type": "message", "content": text})
+                send_queue.put({"role": "user", "type": "message", "end": True})
+            else:
+                # Stream audio
+                with open(wav_path, "rb") as audio_file:
+                    byte_data = audio_file.read(CHUNK)
+                    while byte_data:
+                        send_queue.put(byte_data)
+                        byte_data = audio_file.read(CHUNK)
+                send_queue.put(
+                    {
+                        "role": "user",
+                        "type": "audio",
+                        "format": "bytes.wav",
+                        "end": True,
+                    }
+                )
+
+        if os.path.exists(wav_path):
+            os.remove(wav_path)
+
+    def toggle_recording(self, state):
+        """Toggle the recording state."""
+        global RECORDING, SPACEBAR_PRESSED
+        if state and not SPACEBAR_PRESSED:
+            SPACEBAR_PRESSED = True
+            if not RECORDING:
+                RECORDING = True
+                threading.Thread(target=self.record_audio).start()
+        elif not state and SPACEBAR_PRESSED:
+            SPACEBAR_PRESSED = False
+            RECORDING = False
+
+    def on_press(self, key):
+        """Detect spacebar press and Ctrl+C combination."""
+        self.pressed_keys.add(key)  # Add the pressed key to the set
+
+        if keyboard.Key.space in self.pressed_keys:
+            self.toggle_recording(True)
+        elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
+            logger.info("Ctrl+C pressed. Exiting...")
+            kill_process_tree()
+            os._exit(0)
+
+        # Windows alternative to the above
+        if key == keyboard.Key.ctrl_l:
+            self.ctrl_pressed = True
+
+        try:
+            if key.vk == 67 and self.ctrl_pressed:
+                logger.info("Ctrl+C pressed. Exiting...")
+                kill_process_tree()
+                os._exit(0)
+        # For non-character keys
+        except:
+            pass
+
+    def on_release(self, key):
+        """Detect spacebar release and 'c' key press for camera, and handle key release."""
+        self.pressed_keys.discard(
+            key
+        )  # Remove the released key from the key press tracking set
+
+        if key == keyboard.Key.ctrl_l:
+            self.ctrl_pressed = False
+        if key == keyboard.Key.space:
+            self.toggle_recording(False)
+        elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"):
+            self.fetch_image_from_camera()
+
+    async def message_sender(self, websocket):
+        while True:
+            message = await asyncio.get_event_loop().run_in_executor(
+                None, send_queue.get
+            )
+            if isinstance(message, bytes):
+                await websocket.send(message)
+            else:
+                await websocket.send(json.dumps(message))
+            send_queue.task_done()
+            await asyncio.sleep(0.01)
+
+    async def websocket_communication(self, WS_URL):
+        show_connection_log = True
+
+        async def exec_ws_communication(websocket):
+            if CAMERA_ENABLED:
+                print(
+                    "\nHold the spacebar to start recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit."
+                )
+            else:
+                print("\nHold the spacebar to start recording. Press CTRL-C to exit.")
+
+            asyncio.create_task(self.message_sender(websocket))
+
+            while True:
+                await asyncio.sleep(0.01)
+                chunk = await websocket.recv()
+
+                logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
+                # print("received chunk from server")
+
+                if type(chunk) == str:
+                    chunk = json.loads(chunk)
+
+                    if chunk.get("type") == "config":
+                        self.tts_service = chunk.get("tts_service")
+                        continue
+
+                if self.tts_service == "elevenlabs":
+                    message = chunk
+                else:
+                    message = accumulator.accumulate(chunk)
+
+                if message == None:
+                    # Will be None until we have a full message ready
+                    continue
+
+                # At this point, we have our message
+                if isinstance(message, bytes) or (
+                    message["type"] == "audio" and message["format"].startswith("bytes")
+                ):
+                    # Convert bytes to audio file
+                    if self.tts_service == "elevenlabs":
+                        audio_bytes = message
+                        audio = audio_bytes
+                    else:
+                        audio_bytes = message["content"]
+
+                        # Create an AudioSegment instance with the raw data
+                        audio = AudioSegment(
+                            # raw audio data (bytes)
+                            data=audio_bytes,
+                            # signed 16-bit little-endian format
+                            sample_width=2,
+                            # 16,000 Hz frame rate
+                            frame_rate=22050,
+                            # mono sound
+                            channels=1,
+                        )
+
+                    await self.audiosegments.put(audio)
+
+                # Run the code if that's the client's job
+                if os.getenv("CODE_RUNNER") == "client":
+                    if message["type"] == "code" and "end" in message:
+                        language = message["format"]
+                        code = message["content"]
+                        result = interpreter.computer.run(language, code)
+                        send_queue.put(result)
+
+        if is_win10():
+            logger.info("Windows 10 detected")
+            # Workaround for Windows 10 not latching to the websocket server.
+            # See https://github.com/OpenInterpreter/01/issues/197
+            try:
+                ws = websockets.connect(WS_URL)
+                await exec_ws_communication(ws)
+            except Exception as e:
+                logger.error(f"Error while attempting to connect: {e}")
+        else:
+            while True:
+                try:
+                    async with websockets.connect(WS_URL) as websocket:
+                        await exec_ws_communication(websocket)
+                except:
+                    logger.debug(traceback.format_exc())
+                    if show_connection_log:
+                        logger.info(f"Connecting to `{WS_URL}`...")
+                        show_connection_log = False
+                        await asyncio.sleep(2)
+
+    async def start_async(self):
+        # Configuration for WebSocket
+        WS_URL = f"ws://{self.server_url}"
+        # Start the WebSocket communication
+        asyncio.create_task(self.websocket_communication(WS_URL))
+
+        # Start watching the kernel if it's your job to do that
+        if os.getenv("CODE_RUNNER") == "client":
+            # client is not running code!
+            asyncio.create_task(put_kernel_messages_into_queue(send_queue))
+
+        asyncio.create_task(self.play_audiosegments())
+
+        # If Raspberry Pi, add the button listener, otherwise use the spacebar
+        if current_platform.startswith("raspberry-pi"):
+            logger.info("Raspberry Pi detected, using button on GPIO pin 15")
+            # Use GPIO pin 15
+            pindef = ["gpiochip4", "15"]  # gpiofind PIN15
+            print("PINDEF", pindef)
+
+            # HACK: needs passwordless sudo
+            process = await asyncio.create_subprocess_exec(
+                "sudo", "gpiomon", "-brf", *pindef, stdout=asyncio.subprocess.PIPE
+            )
+            while True:
+                line = await process.stdout.readline()
+                if line:
+                    line = line.decode().strip()
+                    if "FALLING" in line:
+                        self.toggle_recording(False)
+                    elif "RISING" in line:
+                        self.toggle_recording(True)
+                else:
+                    break
+        else:
+            # Keyboard listener for spacebar press/release
+            listener = keyboard.Listener(
+                on_press=self.on_press, on_release=self.on_release
+            )
+            listener.start()
+
+    def start(self):
+        if os.getenv("TEACH_MODE") != "True":
+            asyncio.run(self.start_async())
+            p.terminate()
--- a/software/source/clients/base_device.py
+++ b/software/source/clients/base_device.py
@ -1,482 +1,88 @@
-from dotenv import load_dotenv
-
-load_dotenv()  # take environment variables from .env.
-
-import subprocess
-import os
-import sys
 import asyncio
-import threading
+import websockets
 import pyaudio
 from pynput import keyboard
 import json
-import traceback
-import websockets
-import queue
-from pydub import AudioSegment
-from pydub.playback import play
-import time
-import wave
-import tempfile
-from datetime import datetime
-import cv2
-import base64
-import platform
-from interpreter import (
-    interpreter,
-)  # Just for code execution. Maybe we should let people do from interpreter.computer import run?
-
-# In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
-from ..server.utils.kernel import put_kernel_messages_into_queue
-from ..server.utils.get_system_info import get_system_info
-from ..server.utils.process_utils import kill_process_tree
-
-from ..server.utils.logs import setup_logging
-from ..server.utils.logs import logger
-
-setup_logging()
-
-os.environ["STT_RUNNER"] = "server"
-os.environ["TTS_RUNNER"] = "server"
-
-from ..utils.accumulator import Accumulator
-
-accumulator = Accumulator()
-
-# Configuration for Audio Recording
-CHUNK = 1024  # Record in chunks of 1024 samples
-FORMAT = pyaudio.paInt16  # 16 bits per sample
-CHANNELS = 1  # Mono
-RATE = 16000  # Sample rate
-RECORDING = False  # Flag to control recording state
-SPACEBAR_PRESSED = False  # Flag to track spacebar press state
-
-# Camera configuration
-CAMERA_ENABLED = os.getenv("CAMERA_ENABLED", False)
-if type(CAMERA_ENABLED) == str:
-    CAMERA_ENABLED = CAMERA_ENABLED.lower() == "true"
-CAMERA_DEVICE_INDEX = int(os.getenv("CAMERA_DEVICE_INDEX", 0))
-CAMERA_WARMUP_SECONDS = float(os.getenv("CAMERA_WARMUP_SECONDS", 0))
-
-# Specify OS
-current_platform = get_system_info()
-
-
-def is_win11():
-    return sys.getwindowsversion().build >= 22000
-
-
-def is_win10():
-    try:
-        return (
-            platform.system() == "Windows"
-            and "10" in platform.version()
-            and not is_win11()
-        )
-    except:
-        return False
-
-
-# Initialize PyAudio
-p = pyaudio.PyAudio()
-
-send_queue = queue.Queue()

+CHUNK = 1024
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+RECORDING_RATE = 16000
+PLAYBACK_RATE = 24000

 class Device:
    def __init__(self):
-        self.pressed_keys = set()
-        self.captured_images = []
-        self.audiosegments = asyncio.Queue()
-        self.server_url = ""
-        self.ctrl_pressed = False
-        self.tts_service = ""
-        self.debug = False
-        self.playback_latency = None
-
-    def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
-        """Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
-        image_path = None
-
-        cap = cv2.VideoCapture(camera_index)
-        ret, frame = cap.read()  # Capture a single frame to initialize the camera
-
-        if CAMERA_WARMUP_SECONDS > 0:
-            # Allow camera to warm up, then snap a picture again
-            # This is a workaround for some cameras that don't return a properly exposed
-            # picture immediately when they are first turned on
-            time.sleep(CAMERA_WARMUP_SECONDS)
-            ret, frame = cap.read()
-
-        if ret:
-            temp_dir = tempfile.gettempdir()
-            image_path = os.path.join(
-                temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png"
-            )
-            self.captured_images.append(image_path)
-            cv2.imwrite(image_path, frame)
-            logger.info(f"Camera image captured to {image_path}")
-            logger.info(
-                f"You now have {len(self.captured_images)} images which will be sent along with your next audio message."
-            )
-        else:
-            logger.error(
-                f"Error: Couldn't capture an image from camera ({camera_index})"
-            )
-
-        cap.release()
-
-        return image_path
-
-    def encode_image_to_base64(self, image_path):
-        """Encodes an image file to a base64 string."""
-        with open(image_path, "rb") as image_file:
-            return base64.b64encode(image_file.read()).decode("utf-8")
-
-    def add_image_to_send_queue(self, image_path):
-        """Encodes an image and adds an LMC message to the send queue with the image data."""
-        base64_image = self.encode_image_to_base64(image_path)
-        image_message = {
-            "role": "user",
-            "type": "image",
-            "format": "base64.png",
-            "content": base64_image,
-        }
-        send_queue.put(image_message)
-        # Delete the image file from the file system after sending it
-        os.remove(image_path)
-
-    def queue_all_captured_images(self):
-        """Queues all captured images to be sent."""
-        for image_path in self.captured_images:
-            self.add_image_to_send_queue(image_path)
-        self.captured_images.clear()  # Clear the list after sending
-
-    async def play_audiosegments(self):
-        """Plays them sequentially."""
-
-        if self.tts_service == "elevenlabs":
-            print("Ensure `mpv` in installed to use `elevenlabs`.\n\n(On macOSX, you can run `brew install mpv`.)")
-            mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"]
-            mpv_process = subprocess.Popen(
-                mpv_command,
-                stdin=subprocess.PIPE,
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
-            )
-
-        while True:
+        self.server_url = "0.0.0.0:10001"
+        self.p = pyaudio.PyAudio()
+        self.websocket = None
+        self.recording = False
+        self.input_stream = None
+        self.output_stream = None
+
+    async def connect_with_retry(self, max_retries=50, retry_delay=2):
+        for attempt in range(max_retries):
            try:
-                audio = await self.audiosegments.get()
-                if self.debug and self.playback_latency and isinstance(audio, bytes):
-                    elapsed_time = time.time() - self.playback_latency
-                    print(f"Time from request to playback: {elapsed_time} seconds")
-                    self.playback_latency = None
-
-                if self.tts_service == "elevenlabs":
-                    mpv_process.stdin.write(audio)  # type: ignore
-                    mpv_process.stdin.flush()  # type: ignore
-                else:
-                    play(audio)
-
-                await asyncio.sleep(0.1)
-            except asyncio.exceptions.CancelledError:
-                # This happens once at the start?
-                pass
-            except:
-                logger.info(traceback.format_exc())
-
-    def record_audio(self):
-        if os.getenv("STT_RUNNER") == "server":
-            # STT will happen on the server. we're sending audio.
-            send_queue.put(
-                {"role": "user", "type": "audio", "format": "bytes.wav", "start": True}
-            )
-        elif os.getenv("STT_RUNNER") == "client":
-            # STT will happen here, on the client. we're sending text.
-            send_queue.put({"role": "user", "type": "message", "start": True})
-        else:
-            raise Exception("STT_RUNNER must be set to either 'client' or 'server'.")
-
-        """Record audio from the microphone and add it to the queue."""
-        stream = p.open(
-            format=FORMAT,
-            channels=CHANNELS,
-            rate=RATE,
-            input=True,
-            frames_per_buffer=CHUNK,
-        )
-        print("Recording started...")
-        global RECORDING
-
-        # Create a temporary WAV file to store the audio data
-        temp_dir = tempfile.gettempdir()
-        wav_path = os.path.join(
-            temp_dir, f"audio_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav"
-        )
-        wav_file = wave.open(wav_path, "wb")
-        wav_file.setnchannels(CHANNELS)
-        wav_file.setsampwidth(p.get_sample_size(FORMAT))
-        wav_file.setframerate(RATE)
-
-        while RECORDING:
-            data = stream.read(CHUNK, exception_on_overflow=False)
-            wav_file.writeframes(data)
-
-        wav_file.close()
-        stream.stop_stream()
-        stream.close()
-        print("Recording stopped.")
-        if self.debug:
-            self.playback_latency = time.time()
-
-        duration = wav_file.getnframes() / RATE
-        if duration < 0.3:
-            # Just pressed it. Send stop message
-            if os.getenv("STT_RUNNER") == "client":
-                send_queue.put({"role": "user", "type": "message", "content": "stop"})
-                send_queue.put({"role": "user", "type": "message", "end": True})
-            else:
-                send_queue.put(
-                    {
-                        "role": "user",
-                        "type": "audio",
-                        "format": "bytes.wav",
-                        "content": "",
-                    }
-                )
-                send_queue.put(
-                    {
-                        "role": "user",
-                        "type": "audio",
-                        "format": "bytes.wav",
-                        "end": True,
-                    }
-                )
-        else:
-            self.queue_all_captured_images()
-
-            if os.getenv("STT_RUNNER") == "client":
-                # THIS DOES NOT WORK. We moved to this very cool stt_service, llm_service
-                # way of doing things. stt_wav is not a thing anymore. Needs work to work
-
-                # Run stt then send text
-                text = stt_wav(wav_path)
-                logger.debug(f"STT result: {text}")
-                send_queue.put({"role": "user", "type": "message", "content": text})
-                send_queue.put({"role": "user", "type": "message", "end": True})
-            else:
-                # Stream audio
-                with open(wav_path, "rb") as audio_file:
-                    byte_data = audio_file.read(CHUNK)
-                    while byte_data:
-                        send_queue.put(byte_data)
-                        byte_data = audio_file.read(CHUNK)
-                send_queue.put(
-                    {
-                        "role": "user",
-                        "type": "audio",
-                        "format": "bytes.wav",
-                        "end": True,
-                    }
-                )
-
-        if os.path.exists(wav_path):
-            os.remove(wav_path)
-
-    def toggle_recording(self, state):
-        """Toggle the recording state."""
-        global RECORDING, SPACEBAR_PRESSED
-        if state and not SPACEBAR_PRESSED:
-            SPACEBAR_PRESSED = True
-            if not RECORDING:
-                RECORDING = True
-                threading.Thread(target=self.record_audio).start()
-        elif not state and SPACEBAR_PRESSED:
-            SPACEBAR_PRESSED = False
-            RECORDING = False
-
-    def on_press(self, key):
-        """Detect spacebar press and Ctrl+C combination."""
-        self.pressed_keys.add(key)  # Add the pressed key to the set
-
-        if keyboard.Key.space in self.pressed_keys:
-            self.toggle_recording(True)
-        elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
-            logger.info("Ctrl+C pressed. Exiting...")
-            kill_process_tree()
-            os._exit(0)
-
-        # Windows alternative to the above
-        if key == keyboard.Key.ctrl_l:
-            self.ctrl_pressed = True
-
-        try:
-            if key.vk == 67 and self.ctrl_pressed:
-                logger.info("Ctrl+C pressed. Exiting...")
-                kill_process_tree()
-                os._exit(0)
-        # For non-character keys
-        except:
-            pass
-
-    def on_release(self, key):
-        """Detect spacebar release and 'c' key press for camera, and handle key release."""
-        self.pressed_keys.discard(
-            key
-        )  # Remove the released key from the key press tracking set
-
-        if key == keyboard.Key.ctrl_l:
-            self.ctrl_pressed = False
-        if key == keyboard.Key.space:
-            self.toggle_recording(False)
-        elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"):
-            self.fetch_image_from_camera()
-
-    async def message_sender(self, websocket):
+                self.websocket = await websockets.connect(f"ws://{self.server_url}")
+                print("Connected to server.")
+                return
+            except ConnectionRefusedError:
+                print(f"Waiting for the server to be ready. Retrying in {retry_delay} seconds...")
+                await asyncio.sleep(retry_delay)
+        raise Exception("Failed to connect to the server after multiple attempts")
+
+    async def send_audio(self):
+        self.input_stream = self.p.open(format=FORMAT, channels=CHANNELS, rate=RECORDING_RATE, input=True, frames_per_buffer=CHUNK)
        while True:
-            message = await asyncio.get_event_loop().run_in_executor(
-                None, send_queue.get
-            )
-            if isinstance(message, bytes):
-                await websocket.send(message)
-            else:
-                await websocket.send(json.dumps(message))
-            send_queue.task_done()
+            if self.recording:
+                try:
+                    # Send start flag
+                    await self.websocket.send(json.dumps({"role": "user", "type": "audio", "format": "bytes.wav", "start": True}))
+                    print("Sending audio start message")
+                    
+                    while self.recording:
+                        data = self.input_stream.read(CHUNK, exception_on_overflow=False)
+                        await self.websocket.send(data)
+                    
+                    # Send stop flag
+                    await self.websocket.send(json.dumps({"role": "user", "type": "audio", "format": "bytes.wav", "end": True}))
+                    print("Sending audio end message")
+                except Exception as e:
+                    print(f"Error in send_audio: {e}")
            await asyncio.sleep(0.01)

-    async def websocket_communication(self, WS_URL):
-        show_connection_log = True
-
-        async def exec_ws_communication(websocket):
-            if CAMERA_ENABLED:
-                print(
-                    "\nHold the spacebar to start recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit."
-                )
-            else:
-                print("\nHold the spacebar to start recording. Press CTRL-C to exit.")
-
-            asyncio.create_task(self.message_sender(websocket))
-
-            while True:
-                await asyncio.sleep(0.01)
-                chunk = await websocket.recv()
-
-                logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
-                # print("received chunk from server")
-
-                if type(chunk) == str:
-                    chunk = json.loads(chunk)
-
-                    if chunk.get("type") == "config":
-                        self.tts_service = chunk.get("tts_service")
-                        continue
-
-                if self.tts_service == "elevenlabs":
-                    message = chunk
-                else:
-                    message = accumulator.accumulate(chunk)
-
-                if message == None:
-                    # Will be None until we have a full message ready
-                    continue
-
-                # At this point, we have our message
-                if isinstance(message, bytes) or (
-                    message["type"] == "audio" and message["format"].startswith("bytes")
-                ):
-                    # Convert bytes to audio file
-                    if self.tts_service == "elevenlabs":
-                        audio_bytes = message
-                        audio = audio_bytes
-                    else:
-                        audio_bytes = message["content"]
-
-                        # Create an AudioSegment instance with the raw data
-                        audio = AudioSegment(
-                            # raw audio data (bytes)
-                            data=audio_bytes,
-                            # signed 16-bit little-endian format
-                            sample_width=2,
-                            # 16,000 Hz frame rate
-                            frame_rate=22050,
-                            # mono sound
-                            channels=1,
-                        )
-
-                    await self.audiosegments.put(audio)
-
-                # Run the code if that's the client's job
-                if os.getenv("CODE_RUNNER") == "client":
-                    if message["type"] == "code" and "end" in message:
-                        language = message["format"]
-                        code = message["content"]
-                        result = interpreter.computer.run(language, code)
-                        send_queue.put(result)
-
-        if is_win10():
-            logger.info("Windows 10 detected")
-            # Workaround for Windows 10 not latching to the websocket server.
-            # See https://github.com/OpenInterpreter/01/issues/197
+    async def receive_audio(self):
+        self.output_stream = self.p.open(format=FORMAT, channels=CHANNELS, rate=PLAYBACK_RATE, output=True, frames_per_buffer=CHUNK)
+        while True:
            try:
-                ws = websockets.connect(WS_URL)
-                await exec_ws_communication(ws)
+                data = await self.websocket.recv()
+                if isinstance(data, bytes) and not self.recording:
+                    self.output_stream.write(data)
            except Exception as e:
-                logger.error(f"Error while attempting to connect: {e}")
-        else:
-            while True:
-                try:
-                    async with websockets.connect(WS_URL) as websocket:
-                        await exec_ws_communication(websocket)
-                except:
-                    logger.debug(traceback.format_exc())
-                    if show_connection_log:
-                        logger.info(f"Connecting to `{WS_URL}`...")
-                        show_connection_log = False
-                        await asyncio.sleep(2)
-
-    async def start_async(self):
-        # Configuration for WebSocket
-        WS_URL = f"ws://{self.server_url}"
-        # Start the WebSocket communication
-        asyncio.create_task(self.websocket_communication(WS_URL))
-
-        # Start watching the kernel if it's your job to do that
-        if os.getenv("CODE_RUNNER") == "client":
-            # client is not running code!
-            asyncio.create_task(put_kernel_messages_into_queue(send_queue))
-
-        asyncio.create_task(self.play_audiosegments())
+                print(f"Error in receive_audio: {e}")

-        # If Raspberry Pi, add the button listener, otherwise use the spacebar
-        if current_platform.startswith("raspberry-pi"):
-            logger.info("Raspberry Pi detected, using button on GPIO pin 15")
-            # Use GPIO pin 15
-            pindef = ["gpiochip4", "15"]  # gpiofind PIN15
-            print("PINDEF", pindef)
+    def on_press(self, key):
+        if key == keyboard.Key.space and not self.recording:
+            print("Space pressed, starting recording")
+            self.recording = True

-            # HACK: needs passwordless sudo
-            process = await asyncio.create_subprocess_exec(
-                "sudo", "gpiomon", "-brf", *pindef, stdout=asyncio.subprocess.PIPE
-            )
-            while True:
-                line = await process.stdout.readline()
-                if line:
-                    line = line.decode().strip()
-                    if "FALLING" in line:
-                        self.toggle_recording(False)
-                    elif "RISING" in line:
-                        self.toggle_recording(True)
-                else:
-                    break
-        else:
-            # Keyboard listener for spacebar press/release
-            listener = keyboard.Listener(
-                on_press=self.on_press, on_release=self.on_release
-            )
-            listener.start()
+    def on_release(self, key):
+        if key == keyboard.Key.space:
+            print("Space released, stopping recording")
+            self.recording = False
+        elif key == keyboard.Key.esc:
+            print("Esc pressed, stopping the program")
+            return False
+
+    async def main(self):
+        await self.connect_with_retry()
+        print("Hold spacebar to record. Press 'Esc' to quit.")
+        listener = keyboard.Listener(on_press=self.on_press, on_release=self.on_release)
+        listener.start()
+        await asyncio.gather(self.send_audio(), self.receive_audio())

    def start(self):
-        if os.getenv("TEACH_MODE") != "True":
-            asyncio.run(self.start_async())
-            p.terminate()
+        asyncio.run(self.main())
+
+if __name__ == "__main__":
+    device = Device()
+    device.start()
--- a/software/source/server/archive_async_interpreter.py
+++ b/software/source/server/archive_async_interpreter.py
@ -178,6 +178,7 @@ class AsyncInterpreter:
        """
        self.interpreter.messages = self.active_chat_messages

+        
        self.stt.stop()

        input_queue = []
--- a/software/source/server/archive_async_server.py
+++ b/software/source/server/archive_async_server.py
@ -0,0 +1,124 @@
+import asyncio
+import traceback
+import json
+from fastapi import FastAPI, WebSocket, Depends
+from fastapi.responses import PlainTextResponse
+from uvicorn import Config, Server
+from .async_interpreter import AsyncInterpreter
+from fastapi.middleware.cors import CORSMiddleware
+from typing import List, Dict, Any
+import os
+import importlib.util
+
+
+
+os.environ["STT_RUNNER"] = "server"
+os.environ["TTS_RUNNER"] = "server"
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],  # Allow all methods (GET, POST, etc.)
+    allow_headers=["*"],  # Allow all headers
+)
+
+
+async def get_debug_flag():
+    return app.state.debug
+
+
+@app.get("/ping")
+async def ping():
+    return PlainTextResponse("pong")
+
+
+@app.websocket("/")
+async def websocket_endpoint(
+    websocket: WebSocket, debug: bool = Depends(get_debug_flag)
+):
+    await websocket.accept()
+
+    global global_interpreter
+    interpreter = global_interpreter
+
+    # Send the tts_service value to the client
+    await websocket.send_text(
+        json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts})
+    )
+
+    try:
+
+        async def receive_input():
+            while True:
+                if websocket.client_state == "DISCONNECTED":
+                    break
+
+                data = await websocket.receive()
+
+                await asyncio.sleep(0)
+
+                if isinstance(data, bytes):
+                    await interpreter.input(data)
+                elif "bytes" in data:
+                    await interpreter.input(data["bytes"])
+                    # print("RECEIVED INPUT", data)
+                elif "text" in data:
+                    # print("RECEIVED INPUT", data)
+                    await interpreter.input(data["text"])
+
+        async def send_output():
+            while True:
+                output = await interpreter.output()
+
+                await asyncio.sleep(0)
+
+                if isinstance(output, bytes):
+                    # print(f"Sending {len(output)} bytes of audio data.")
+                    await websocket.send_bytes(output)
+
+                elif isinstance(output, dict):
+                    # print("sending text")
+                    await websocket.send_text(json.dumps(output))
+
+        await asyncio.gather(send_output(), receive_input())
+    except Exception as e:
+        print(f"WebSocket connection closed with exception: {e}")
+        traceback.print_exc()
+    finally:
+        if not websocket.client_state == "DISCONNECTED":
+            await websocket.close()
+
+
+async def main(server_host, server_port, profile, debug):
+
+    app.state.debug = debug
+
+    # Load the profile module from the provided path
+    spec = importlib.util.spec_from_file_location("profile", profile)
+    profile_module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(profile_module)
+
+    # Get the interpreter from the profile
+    interpreter = profile_module.interpreter
+
+    if not hasattr(interpreter, 'tts'):
+        print("Setting TTS provider to default: openai")
+        interpreter.tts = "openai"
+
+    # Make it async
+    interpreter = AsyncInterpreter(interpreter, debug)
+
+    global global_interpreter
+    global_interpreter = interpreter
+
+    print(f"Starting server on {server_host}:{server_port}")
+    config = Config(app, host=server_host, port=server_port, lifespan="on")
+    server = Server(config)
+    await server.serve()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/software/source/server/async_server.py
+++ b/software/source/server/async_server.py
@ -1,100 +1,16 @@
-import asyncio
+import importlib
 import traceback
 import json
-from fastapi import FastAPI, WebSocket, Depends
-from fastapi.responses import PlainTextResponse
-from uvicorn import Config, Server
-from .async_interpreter import AsyncInterpreter
-from fastapi.middleware.cors import CORSMiddleware
-from typing import List, Dict, Any
 import os
-import importlib.util
-
-
-
-os.environ["STT_RUNNER"] = "server"
-os.environ["TTS_RUNNER"] = "server"
-
-app = FastAPI()
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],  # Allow all methods (GET, POST, etc.)
-    allow_headers=["*"],  # Allow all headers
-)
-
-
-async def get_debug_flag():
-    return app.state.debug
-
-
-@app.get("/ping")
-async def ping():
-    return PlainTextResponse("pong")
-
-
-@app.websocket("/")
-async def websocket_endpoint(
-    websocket: WebSocket, debug: bool = Depends(get_debug_flag)
-):
-    await websocket.accept()
-
-    global global_interpreter
-    interpreter = global_interpreter
-
-    # Send the tts_service value to the client
-    await websocket.send_text(
-        json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts})
-    )
-
-    try:
-
-        async def receive_input():
-            while True:
-                if websocket.client_state == "DISCONNECTED":
-                    break
-
-                data = await websocket.receive()
-
-                await asyncio.sleep(0)
-
-                if isinstance(data, bytes):
-                    await interpreter.input(data)
-                elif "bytes" in data:
-                    await interpreter.input(data["bytes"])
-                    # print("RECEIVED INPUT", data)
-                elif "text" in data:
-                    # print("RECEIVED INPUT", data)
-                    await interpreter.input(data["text"])
-
-        async def send_output():
-            while True:
-                output = await interpreter.output()
-
-                await asyncio.sleep(0)
-
-                if isinstance(output, bytes):
-                    # print(f"Sending {len(output)} bytes of audio data.")
-                    await websocket.send_bytes(output)
-
-                elif isinstance(output, dict):
-                    # print("sending text")
-                    await websocket.send_text(json.dumps(output))
-
-        await asyncio.gather(send_output(), receive_input())
-    except Exception as e:
-        print(f"WebSocket connection closed with exception: {e}")
-        traceback.print_exc()
-    finally:
-        if not websocket.client_state == "DISCONNECTED":
-            await websocket.close()
-
-
-async def main(server_host, server_port, profile, debug):
+from RealtimeTTS import TextToAudioStream, CoquiEngine, OpenAIEngine, ElevenlabsEngine
+from RealtimeSTT import AudioToTextRecorder
+import types
+import time
+import wave
+import asyncio
+from fastapi.responses import PlainTextResponse

-    app.state.debug = debug
+def start_server(server_host, server_port, profile, debug):

    # Load the profile module from the provided path
    spec = importlib.util.spec_from_file_location("profile", profile)
@ -104,21 +20,106 @@ async def main(server_host, server_port, profile, debug):
    # Get the interpreter from the profile
    interpreter = profile_module.interpreter

+    # STT
+    interpreter.stt = AudioToTextRecorder(
+        model="tiny.en", spinner=False, use_microphone=False
+    )
+    interpreter.stt.stop()  # It needs this for some reason
+
+
+    # TTS
    if not hasattr(interpreter, 'tts'):
        print("Setting TTS provider to default: openai")
        interpreter.tts = "openai"
-
-    # Make it async
-    interpreter = AsyncInterpreter(interpreter, debug)
-
-    global global_interpreter
-    global_interpreter = interpreter
-
-    print(f"Starting server on {server_host}:{server_port}")
-    config = Config(app, host=server_host, port=server_port, lifespan="on")
-    server = Server(config)
-    await server.serve()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
+    if interpreter.tts == "coqui":
+        engine = CoquiEngine()
+    elif interpreter.tts == "openai":
+        engine = OpenAIEngine(voice="onyx")
+    elif interpreter.tts == "elevenlabs":
+        engine = ElevenlabsEngine(api_key=os.environ["ELEVEN_LABS_API_KEY"])
+        engine.set_voice("Michael")
+    else:
+        raise ValueError(f"Unsupported TTS engine: {interpreter.interpreter.tts}")
+    interpreter.tts = TextToAudioStream(engine)
+
+    # Misc Settings
+    interpreter.verbose = debug
+    interpreter.server.host = server_host
+    interpreter.server.port = server_port
+
+
+    interpreter.audio_chunks = []
+
+
+    old_input = interpreter.input
+    old_output = interpreter.output
+
+
+    async def new_input(self, chunk):
+        await asyncio.sleep(0)
+        if isinstance(chunk, bytes):
+            self.stt.feed_audio(chunk)
+            self.audio_chunks.append(chunk)
+        elif isinstance(chunk, dict):
+            if "start" in chunk:
+                self.stt.start()
+                self.audio_chunks = []
+                await old_input({"role": "user", "type": "message", "start": True})
+            if "end" in chunk:
+                self.stt.stop()
+                content = self.stt.text()
+
+                print("User: ", content)
+
+                if False:
+                    audio_bytes = bytearray(b"".join(self.audio_chunks))
+                    with wave.open('audio.wav', 'wb') as wav_file:
+                        wav_file.setnchannels(1)
+                        wav_file.setsampwidth(2)  # Assuming 16-bit audio
+                        wav_file.setframerate(16000)  # Assuming 16kHz sample rate
+                        wav_file.writeframes(audio_bytes)
+                    print(os.path.abspath('audio.wav'))
+
+                await old_input({"role": "user", "type": "message", "content": content})
+                await old_input({"role": "user", "type": "message", "end": True})
+
+
+    async def new_output(self):
+        while True:
+            output = await old_output()
+            # if output == {"role": "assistant", "type": "message", "start": True}:
+            #     return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
+
+            if isinstance(output, bytes):
+                return output
+
+            await asyncio.sleep(0)
+
+            delimiters = ".?!;,\n…)]}"
+
+            if output["type"] == "message" and len(output.get("content", "")) > 0:
+                self.tts.feed(output.get("content"))
+                if not self.tts.is_playing() and any([c in delimiters for c in output.get("content")]): # Start playing once the first delimiter is encountered.
+                    self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=True, sentence_fragment_delimiters=delimiters)
+                    return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
+
+            if output == {"role": "assistant", "type": "message", "end": True}:
+                if not self.tts.is_playing(): # We put this here in case it never outputs a delimiter and never triggers play_async^
+                    self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=True, sentence_fragment_delimiters=delimiters)
+                    return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
+                return {"role": "assistant", "type": "audio", "format": "bytes.wav", "end": True}
+
+    def on_tts_chunk(self, chunk):
+        self.output_queue.sync_q.put(chunk)
+
+    # Wrap in voice interface
+    interpreter.input = types.MethodType(new_input, interpreter)
+    interpreter.output = types.MethodType(new_output, interpreter)
+    interpreter.on_tts_chunk = types.MethodType(on_tts_chunk, interpreter)
+
+    @interpreter.server.app.get("/ping")
+    async def ping():
+        return PlainTextResponse("pong")
+
+    # Start server
+    interpreter.server.run()
--- a/software/source/server/profiles/default.py
+++ b/software/source/server/profiles/default.py
@ -1,4 +1,5 @@
-from interpreter import interpreter
+from interpreter import AsyncInterpreter
+interpreter = AsyncInterpreter()

 # This is an Open Interpreter compatible profile.
 # Visit https://01.openinterpreter.com/profile for all options.
--- a/software/source/server/profiles/fast.py
+++ b/software/source/server/profiles/fast.py
@ -1,4 +1,5 @@
-from interpreter import interpreter
+from interpreter import AsyncInterpreter
+interpreter = AsyncInterpreter()

 # This is an Open Interpreter compatible profile.
 # Visit https://01.openinterpreter.com/profile for all options.
@ -7,14 +8,12 @@ from interpreter import interpreter
 # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
 interpreter.tts = "elevenlabs"

-# 01 Language Model Config.
-interpreter.llm_service = "litellm"
-interpreter.llm.model = "groq/llama3-8b-8192"
+interpreter.llm.model = "groq/llama3-70b-8192"
 interpreter.llm.supports_vision = False
 interpreter.llm.supports_functions = False
-interpreter.llm.context_window = 2048
-interpreter.llm.max_tokens = 4096
-interpreter.llm.temperature = 0.8
+interpreter.llm.context_window = 8000
+interpreter.llm.max_tokens = 1000
+interpreter.llm.temperature = 0

 interpreter.computer.import_computer_api = False

--- a/software/source/server/profiles/local.py
+++ b/software/source/server/profiles/local.py
@ -1,38 +1,71 @@
-from interpreter import interpreter
+from interpreter import AsyncInterpreter
+interpreter = AsyncInterpreter()

 # 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
 # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
 interpreter.tts = "coqui"

-# Local setup
-interpreter.local_setup()
-
 interpreter.system_message = """You are an AI assistant that writes markdown code snippets to answer the user's request. You speak very concisely and quickly, you say nothing irrelevant to the user's request. For example:

 User: Open the chrome app.
-Assistant: On it.
+Assistant: On it. 
 ```python
 import webbrowser
 webbrowser.open('https://chrome.google.com')
 ```
 User: The code you ran produced no output. Was this expected, or are we finished?
 Assistant: No further action is required; the provided snippet opens Chrome.
+User: How large are all the files on my desktop combined?
+Assistant: I will sum up the file sizes of every file on your desktop.
+```python
+import os
+import string
+from pathlib import Path
+
+# Get the user's home directory in a cross-platform way
+home_dir = Path.home()
+
+# Define the path to the desktop
+desktop_dir = home_dir / 'Desktop'
+
+# Initialize a variable to store the total size
+total_size = 0

-Now, your turn:"""
+# Loop through all files on the desktop
+for file in desktop_dir.iterdir():
+    # Add the file size to the total
+    total_size += file.stat().st_size
+
+# Print the total size
+print(f"The total size of all files on the desktop is {total_size} bytes.")
+```
+User: I executed that code. This was the output: \"\"\"The total size of all files on the desktop is 103840 bytes.\"\"\"\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.
+Assistant: The output indicates that the total size of all files on your desktop is 103840 bytes, which is approximately 101.4 KB or 0.1 MB. We are finished.
+
+NEVER use placeholders, NEVER say "path/to/desktop", NEVER say "path/to/file". Always specify exact paths, and use cross-platform ways of determining the desktop, documents, cwd, etc. folders.
+
+Now, your turn:""".strip()

 # Message templates
 interpreter.code_output_template = '''I executed that code. This was the output: """{content}"""\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.'''
 interpreter.empty_code_output_template = "The code above was executed on my machine. It produced no text output. What's next (if anything, or are we done?)"
 interpreter.code_output_sender = "user"

+# LLM settings
+interpreter.llm.model = "ollama/codestral"
+interpreter.llm.supports_functions = False
+interpreter.llm.execution_instructions = False
+interpreter.llm.load()
+
 # Computer settings
 interpreter.computer.import_computer_api = False

 # Misc settings
-interpreter.auto_run = False
+interpreter.auto_run = True
 interpreter.offline = True
+interpreter.max_output = 600

 # Final message
 interpreter.display_message(
-    f"> Model set to `{interpreter.llm.model}`\n\n**Open Interpreter** will require approval before running code.\n\nUse `interpreter -y` to bypass this.\n\nPress `CTRL-C` to exit.\n"
+    "> Local model set to `Codestral`, Local TTS set to `Coqui`.\n"
 )
--- a/software/start.py
+++ b/software/start.py
@ -5,7 +5,7 @@ import threading
 import os
 import importlib
 from source.server.tunnel import create_tunnel
-from source.server.async_server import main
+from source.server.async_server import start_server
 import subprocess

 import signal
@ -134,17 +134,13 @@ def _run(
    signal.signal(signal.SIGINT, handle_exit)

    if server:
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
        server_thread = threading.Thread(
-            target=loop.run_until_complete,
+            target=start_server,
            args=(
-                main(
-                    server_host,
-                    server_port,
-                    profile,
-                    debug,
-                ),
+                server_host,
+                server_port,
+                profile,
+                debug,
            ),
        )
        server_thread.start()