From 564255adee18eae316938938ea83b0eacc44e8eb Mon Sep 17 00:00:00 2001
From: Ben Xu
Date: Wed, 19 Jun 2024 15:15:58 -0700
Subject: [PATCH] update docs and remove comments

---
 README.md                                   |  4 ++-
 software/source/clients/base_device.py      |  6 ----
 software/source/server/async_interpreter.py | 35 ++-------------------
 software/source/server/async_server.py      | 10 ++++--
 software/source/server/conftest.py          |  2 ++
 software/source/server/profiles/default.py  |  4 +--
 software/source/server/profiles/fast.py     | 20 +-----------
 software/source/server/profiles/local.py    |  2 +-
 8 files changed, 19 insertions(+), 64 deletions(-)

diff --git a/README.md b/README.md
index c29099b..6ab2af8 100644
--- a/README.md
+++ b/README.md
@@ -127,7 +127,9 @@ If you want to run local speech-to-text using Whisper, you must install Rust. Fo
 
 ## Customizations
 
-To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in `i.py`. This file sets up an interpreter, and is powered by Open Interpreter.
+To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in the `profiles` directory under the `server` directory. Each profile sets up an interpreter, and is powered by Open Interpreter.
+
+To specify the text-to-speech service for the 01's `base_device.py`, set `interpreter.tts` in a profile to "openai" for OpenAI, "elevenlabs" for ElevenLabs, or "coqui" for Coqui (local).
 
 ## Ubuntu Dependencies
 
diff --git a/software/source/clients/base_device.py b/software/source/clients/base_device.py
index 6d2cdfd..88eac6b 100644
--- a/software/source/clients/base_device.py
+++ b/software/source/clients/base_device.py
@@ -91,7 +91,6 @@ class Device:
         self.server_url = ""
         self.ctrl_pressed = False
         self.tts_service = ""
-        self.playback_latency = None
 
     def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
         """Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
@@ -165,10 +164,6 @@
         while True:
             try:
                 audio = await self.audiosegments.get()
-                if self.playback_latency and isinstance(audio, bytes):
-                    elapsed_time = time.time() - self.playback_latency
-                    print(f"Time from request to playback: {elapsed_time} seconds")
-                    self.playback_latency = None
 
                 if self.tts_service == "elevenlabs":
                     mpv_process.stdin.write(audio)  # type: ignore
@@ -224,7 +219,6 @@
                 stream.stop_stream()
                 stream.close()
                 print("Recording stopped.")
-                self.playback_latency = time.time()
 
                 duration = wav_file.getnframes() / RATE
                 if duration < 0.3:
diff --git a/software/source/server/async_interpreter.py b/software/source/server/async_interpreter.py
index 25b0720..1251923 100644
--- a/software/source/server/async_interpreter.py
+++ b/software/source/server/async_interpreter.py
@@ -22,11 +22,6 @@ import os
 
 class AsyncInterpreter:
     def __init__(self, interpreter):
-        self.stt_latency = None
-        self.tts_latency = None
-        self.interpreter_latency = None
-        self.time_from_first_yield_to_first_put = None
-
         self.interpreter = interpreter
 
         # STT
@@ -128,9 +123,7 @@
 
         # Experimental: The AI voice sounds better with replacements like these, but it should happen at the TTS layer
         # content = content.replace(". ", ". ... ").replace(", ", ", ... ").replace("!", "! ... ").replace("?", "? ... ")
") - print("yielding ", content) - if self.time_from_first_yield_to_first_put is None: - self.time_from_first_yield_to_first_put = time.time() + # print("yielding ", content) yield content @@ -162,9 +155,6 @@ class AsyncInterpreter: ) # Send a completion signal - end_interpreter = time.time() - self.interpreter_latency = end_interpreter - start_interpreter - print("INTERPRETER LATENCY", self.interpreter_latency) # self.add_to_output_queue_sync({"role": "server","type": "completion", "content": "DONE"}) async def run(self): @@ -179,11 +169,7 @@ class AsyncInterpreter: while not self._input_queue.empty(): input_queue.append(self._input_queue.get()) - start_stt = time.time() message = self.stt.text() - end_stt = time.time() - self.stt_latency = end_stt - start_stt - print("STT LATENCY", self.stt_latency) print(message) @@ -210,23 +196,11 @@ class AsyncInterpreter: "end": True, } ) - end_tts = time.time() - self.tts_latency = end_tts - self.tts.stream_start_time - print("TTS LATENCY", self.tts_latency) self.tts.stop() break async def _on_tts_chunk_async(self, chunk): - print("adding chunk to queue") - if ( - self.time_from_first_yield_to_first_put is not None - and self.time_from_first_yield_to_first_put != 0 - ): - print( - "time from first yield to first put is ", - time.time() - self.time_from_first_yield_to_first_put, - ) - self.time_from_first_yield_to_first_put = 0 + # print("adding chunk to queue") await self._add_to_queue(self._output_queue, chunk) def on_tts_chunk(self, chunk): @@ -234,8 +208,5 @@ class AsyncInterpreter: asyncio.run(self._on_tts_chunk_async(chunk)) async def output(self): - print("outputting chunks") + # print("outputting chunks") return await self._output_queue.get() - - def shutdown(self): - self.stt.shutdown() diff --git a/software/source/server/async_server.py b/software/source/server/async_server.py index 139bbcc..53f38ed 100644 --- a/software/source/server/async_server.py +++ b/software/source/server/async_server.py @@ -1,9 +1,13 @@ -# TODO: import from the profiles directory the interpreter that should be served!! +# import from the profiles directory the interpreter to be served -from .profiles.fast import interpreter as base_interpreter +# add other profiles to the directory to define other interpreter instances and import them here +# {.profiles.fast: optimizes for STT/TTS latency with the fastest models } +# {.profiles.local: uses local models and local STT/TTS } +# {.profiles.default: uses default interpreter settings with optimized TTS latency } +# from .profiles.fast import interpreter as base_interpreter # from .profiles.local import interpreter as base_interpreter -# from .profiles.default import interpreter as base_interpreter +from .profiles.default import interpreter as base_interpreter import asyncio import traceback diff --git a/software/source/server/conftest.py b/software/source/server/conftest.py index 82dacde..badf160 100644 --- a/software/source/server/conftest.py +++ b/software/source/server/conftest.py @@ -1,3 +1,5 @@ +# tests currently hang after completion + """ import pytest import signal diff --git a/software/source/server/profiles/default.py b/software/source/server/profiles/default.py index 80eb94a..92d86a3 100644 --- a/software/source/server/profiles/default.py +++ b/software/source/server/profiles/default.py @@ -3,9 +3,9 @@ from interpreter import interpreter # This is an Open Interpreter compatible profile. # Visit https://01.openinterpreter.com/profile for all options. 
-# 01 suports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
+# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
 # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
-interpreter.tts = "openai"
+interpreter.tts = "elevenlabs"
 
 # Connect your 01 to a language model
 interpreter.llm.model = "gpt-4-turbo"
diff --git a/software/source/server/profiles/fast.py b/software/source/server/profiles/fast.py
index 1fe274b..c8317b4 100644
--- a/software/source/server/profiles/fast.py
+++ b/software/source/server/profiles/fast.py
@@ -3,7 +3,7 @@ from interpreter import interpreter
 # This is an Open Interpreter compatible profile.
 # Visit https://01.openinterpreter.com/profile for all options.
 
-# 01 suports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
+# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
 # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
 interpreter.tts = "elevenlabs"
 
@@ -16,27 +16,9 @@ interpreter.llm.context_window = 2048
 interpreter.llm.max_tokens = 4096
 interpreter.llm.temperature = 0.8
 
-# interpreter.llm.api_key = os.environ["GROQ_API_KEY"]
-
 interpreter.computer.import_computer_api = False
 
 interpreter.auto_run = True
 interpreter.system_message = (
     "You are a helpful assistant that can answer questions and help with tasks."
 )
-
-# TODO: include other options in comments in the profiles for tts
-# direct people to the profiles directory to make changes to the interpreter profile
-# this should be made explicit on the docs
-
-"""
-    llm_service: str = "litellm",
-    model: str = "gpt-4",
-    llm_supports_vision: bool = False,
-    llm_supports_functions: bool = False,
-    context_window: int = 2048,
-    max_tokens: int = 4096,
-    temperature: float = 0.8,
-    tts_service: str = "elevenlabs",
-    stt_service: str = "openai",
-"""
diff --git a/software/source/server/profiles/local.py b/software/source/server/profiles/local.py
index de58f75..c7db1e5 100644
--- a/software/source/server/profiles/local.py
+++ b/software/source/server/profiles/local.py
@@ -1,6 +1,6 @@
 from interpreter import interpreter
 
-# 01 suports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
+# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
 # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
 interpreter.tts = "coqui"
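
As the updated README and async_server.py comments describe, each interpreter instance served by the 01 is defined as a profile. For reference, a minimal custom profile might look like the sketch below; the file name `custom.py` is hypothetical, and every setting simply mirrors the options already shown in `default.py` and `fast.py` above:

```python
# software/source/server/profiles/custom.py (hypothetical example profile)
from interpreter import interpreter

# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
interpreter.tts = "openai"

# Connect your 01 to a language model
interpreter.llm.model = "gpt-4-turbo"
interpreter.llm.context_window = 2048
interpreter.llm.max_tokens = 4096
interpreter.llm.temperature = 0.8

interpreter.auto_run = True
interpreter.system_message = (
    "You are a helpful assistant that can answer questions and help with tasks."
)
```

To serve it, swap the import at the top of `async_server.py`, e.g. `from .profiles.custom import interpreter as base_interpreter`.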