diff --git a/README.md b/README.md index c29099b..6ab2af8 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,9 @@ If you want to run local speech-to-text using Whisper, you must install Rust. Fo ## Customizations -To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in `i.py`. This file sets up an interpreter, and is powered by Open Interpreter. +To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in the `profiles` directory under the `server` directory. Each profile file in that directory sets up an interpreter, and is powered by Open Interpreter. + +To specify the text-to-speech service for the 01 `base_device.py`, set `interpreter.tts` to "openai" for OpenAI, "elevenlabs" for ElevenLabs, or "coqui" for Coqui (local) in a profile. ## Ubuntu Dependencies diff --git a/software/source/clients/base_device.py b/software/source/clients/base_device.py index 6d2cdfd..88eac6b 100644 --- a/software/source/clients/base_device.py +++ b/software/source/clients/base_device.py @@ -91,7 +91,6 @@ class Device: self.server_url = "" self.ctrl_pressed = False self.tts_service = "" - self.playback_latency = None def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX): """Captures an image from the specified camera device and saves it to a temporary file. 
Adds the image to the captured_images list.""" @@ -165,10 +164,6 @@ class Device: while True: try: audio = await self.audiosegments.get() - if self.playback_latency and isinstance(audio, bytes): - elapsed_time = time.time() - self.playback_latency - print(f"Time from request to playback: {elapsed_time} seconds") - self.playback_latency = None if self.tts_service == "elevenlabs": mpv_process.stdin.write(audio) # type: ignore @@ -224,7 +219,6 @@ class Device: stream.stop_stream() stream.close() print("Recording stopped.") - self.playback_latency = time.time() duration = wav_file.getnframes() / RATE if duration < 0.3: diff --git a/software/source/server/async_interpreter.py b/software/source/server/async_interpreter.py index 25b0720..1251923 100644 --- a/software/source/server/async_interpreter.py +++ b/software/source/server/async_interpreter.py @@ -22,11 +22,6 @@ import os class AsyncInterpreter: def __init__(self, interpreter): - self.stt_latency = None - self.tts_latency = None - self.interpreter_latency = None - self.time_from_first_yield_to_first_put = None - self.interpreter = interpreter # STT @@ -128,9 +123,7 @@ class AsyncInterpreter: # Experimental: The AI voice sounds better with replacements like these, but it should happen at the TTS layer # content = content.replace(". ", ". ... ").replace(", ", ", ... ").replace("!", "! ... ").replace("?", "? ... 
") - print("yielding ", content) - if self.time_from_first_yield_to_first_put is None: - self.time_from_first_yield_to_first_put = time.time() + # print("yielding ", content) yield content @@ -162,9 +155,6 @@ class AsyncInterpreter: ) # Send a completion signal - end_interpreter = time.time() - self.interpreter_latency = end_interpreter - start_interpreter - print("INTERPRETER LATENCY", self.interpreter_latency) # self.add_to_output_queue_sync({"role": "server","type": "completion", "content": "DONE"}) async def run(self): @@ -179,11 +169,7 @@ class AsyncInterpreter: while not self._input_queue.empty(): input_queue.append(self._input_queue.get()) - start_stt = time.time() message = self.stt.text() - end_stt = time.time() - self.stt_latency = end_stt - start_stt - print("STT LATENCY", self.stt_latency) print(message) @@ -210,23 +196,11 @@ class AsyncInterpreter: "end": True, } ) - end_tts = time.time() - self.tts_latency = end_tts - self.tts.stream_start_time - print("TTS LATENCY", self.tts_latency) self.tts.stop() break async def _on_tts_chunk_async(self, chunk): - print("adding chunk to queue") - if ( - self.time_from_first_yield_to_first_put is not None - and self.time_from_first_yield_to_first_put != 0 - ): - print( - "time from first yield to first put is ", - time.time() - self.time_from_first_yield_to_first_put, - ) - self.time_from_first_yield_to_first_put = 0 + # print("adding chunk to queue") await self._add_to_queue(self._output_queue, chunk) def on_tts_chunk(self, chunk): @@ -234,8 +208,5 @@ class AsyncInterpreter: asyncio.run(self._on_tts_chunk_async(chunk)) async def output(self): - print("outputting chunks") + # print("outputting chunks") return await self._output_queue.get() - - def shutdown(self): - self.stt.shutdown() diff --git a/software/source/server/async_server.py b/software/source/server/async_server.py index 139bbcc..53f38ed 100644 --- a/software/source/server/async_server.py +++ b/software/source/server/async_server.py @@ -1,9 +1,13 @@ -# 
TODO: import from the profiles directory the interpreter that should be served!! +# import from the profiles directory the interpreter to be served -from .profiles.fast import interpreter as base_interpreter +# add other profiles to the directory to define other interpreter instances and import them here +# {.profiles.fast: optimizes for STT/TTS latency with the fastest models } +# {.profiles.local: uses local models and local STT/TTS } +# {.profiles.default: uses default interpreter settings with optimized TTS latency } +# from .profiles.fast import interpreter as base_interpreter # from .profiles.local import interpreter as base_interpreter -# from .profiles.default import interpreter as base_interpreter +from .profiles.default import interpreter as base_interpreter import asyncio import traceback diff --git a/software/source/server/conftest.py b/software/source/server/conftest.py index 82dacde..badf160 100644 --- a/software/source/server/conftest.py +++ b/software/source/server/conftest.py @@ -1,3 +1,5 @@ +# tests currently hang after completion + """ import pytest import signal diff --git a/software/source/server/profiles/default.py b/software/source/server/profiles/default.py index 80eb94a..92d86a3 100644 --- a/software/source/server/profiles/default.py +++ b/software/source/server/profiles/default.py @@ -3,9 +3,9 @@ from interpreter import interpreter # This is an Open Interpreter compatible profile. # Visit https://01.openinterpreter.com/profile for all options. 
-# 01 suports OpenAI, ElevenLabs, and Coqui (Local) TTS providers +# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"} -interpreter.tts = "openai" +interpreter.tts = "elevenlabs" # Connect your 01 to a language model interpreter.llm.model = "gpt-4-turbo" diff --git a/software/source/server/profiles/fast.py b/software/source/server/profiles/fast.py index 1fe274b..c8317b4 100644 --- a/software/source/server/profiles/fast.py +++ b/software/source/server/profiles/fast.py @@ -3,7 +3,7 @@ from interpreter import interpreter # This is an Open Interpreter compatible profile. # Visit https://01.openinterpreter.com/profile for all options. -# 01 suports OpenAI, ElevenLabs, and Coqui (Local) TTS providers +# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"} interpreter.tts = "elevenlabs" @@ -16,27 +16,9 @@ interpreter.llm.context_window = 2048 interpreter.llm.max_tokens = 4096 interpreter.llm.temperature = 0.8 -# interpreter.llm.api_key = os.environ["GROQ_API_KEY"] - interpreter.computer.import_computer_api = False interpreter.auto_run = True interpreter.system_message = ( "You are a helpful assistant that can answer questions and help with tasks." 
) - -# TODO: include other options in comments in the profiles for tts -# direct people to the profiles directory to make changes to the interpreter profile -# this should be made explicit on the docs - -""" - llm_service: str = "litellm", - model: str = "gpt-4", - llm_supports_vision: bool = False, - llm_supports_functions: bool = False, - context_window: int = 2048, - max_tokens: int = 4096, - temperature: float = 0.8, - tts_service: str = "elevenlabs", - stt_service: str = "openai", -""" diff --git a/software/source/server/profiles/local.py b/software/source/server/profiles/local.py index de58f75..c7db1e5 100644 --- a/software/source/server/profiles/local.py +++ b/software/source/server/profiles/local.py @@ -1,6 +1,6 @@ from interpreter import interpreter -# 01 suports OpenAI, ElevenLabs, and Coqui (Local) TTS providers +# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"} interpreter.tts = "coqui"