From 72f41ad7606aafbdca48f06553d100e350c7c7f3 Mon Sep 17 00:00:00 2001 From: Ben Xu Date: Mon, 24 Jun 2024 09:17:20 -0400 Subject: [PATCH 1/6] add debug flag --- software/source/clients/base_device.py | 8 ++++ software/source/clients/linux/device.py | 3 +- software/source/clients/mac/device.py | 3 +- software/source/clients/windows/device.py | 3 +- software/source/server/async_interpreter.py | 46 +++++++++++++++++---- software/source/server/async_server.py | 19 ++++++--- software/start.py | 10 ++++- 7 files changed, 75 insertions(+), 17 deletions(-) diff --git a/software/source/clients/base_device.py b/software/source/clients/base_device.py index 88eac6b..34b96a1 100644 --- a/software/source/clients/base_device.py +++ b/software/source/clients/base_device.py @@ -91,6 +91,8 @@ class Device: self.server_url = "" self.ctrl_pressed = False self.tts_service = "" + self.debug = False + self.playback_latency = None def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX): """Captures an image from the specified camera device and saves it to a temporary file. 
Adds the image to the captured_images list.""" @@ -164,6 +166,10 @@ class Device: while True: try: audio = await self.audiosegments.get() + if self.debug and self.playback_latency and isinstance(audio, bytes): + elapsed_time = time.time() - self.playback_latency + print(f"Time from request to playback: {elapsed_time} seconds") + self.playback_latency = None if self.tts_service == "elevenlabs": mpv_process.stdin.write(audio) # type: ignore @@ -219,6 +225,8 @@ class Device: stream.stop_stream() stream.close() print("Recording stopped.") + if self.debug: + self.playback_latency = time.time() duration = wav_file.getnframes() / RATE if duration < 0.3: diff --git a/software/source/clients/linux/device.py b/software/source/clients/linux/device.py index 0fa0fed..36182fb 100644 --- a/software/source/clients/linux/device.py +++ b/software/source/clients/linux/device.py @@ -3,8 +3,9 @@ from ..base_device import Device device = Device() -def main(server_url): +def main(server_url, debug): device.server_url = server_url + device.debug = debug device.start() diff --git a/software/source/clients/mac/device.py b/software/source/clients/mac/device.py index 0fa0fed..36182fb 100644 --- a/software/source/clients/mac/device.py +++ b/software/source/clients/mac/device.py @@ -3,8 +3,9 @@ from ..base_device import Device device = Device() -def main(server_url): +def main(server_url, debug): device.server_url = server_url + device.debug = debug device.start() diff --git a/software/source/clients/windows/device.py b/software/source/clients/windows/device.py index 0fa0fed..36182fb 100644 --- a/software/source/clients/windows/device.py +++ b/software/source/clients/windows/device.py @@ -3,8 +3,9 @@ from ..base_device import Device device = Device() -def main(server_url): +def main(server_url, debug): device.server_url = server_url + device.debug = debug device.start() diff --git a/software/source/server/async_interpreter.py b/software/source/server/async_interpreter.py index 03a4249..7b3f6ae 
100644 --- a/software/source/server/async_interpreter.py +++ b/software/source/server/async_interpreter.py @@ -21,7 +21,13 @@ import os class AsyncInterpreter: - def __init__(self, interpreter): + def __init__(self, interpreter, debug): + self.stt_latency = None + self.tts_latency = None + self.interpreter_latency = None + self.tffytfp = None + self.debug = debug + self.interpreter = interpreter self.audio_chunks = [] @@ -126,6 +132,8 @@ class AsyncInterpreter: # Experimental: The AI voice sounds better with replacements like these, but it should happen at the TTS layer # content = content.replace(". ", ". ... ").replace(", ", ", ... ").replace("!", "! ... ").replace("?", "? ... ") # print("yielding ", content) + if self.time_from_first_yield_to_first_put is None: + self.time_from_first_yield_to_first_put = time.time() yield content @@ -157,6 +165,10 @@ class AsyncInterpreter: ) # Send a completion signal + if self.debug: + end_interpreter = time.time() + self.interpreter_latency = end_interpreter - start_interpreter + print("INTERPRETER LATENCY", self.interpreter_latency) # self.add_to_output_queue_sync({"role": "server","type": "completion", "content": "DONE"}) async def run(self): @@ -171,13 +183,20 @@ class AsyncInterpreter: while not self._input_queue.empty(): input_queue.append(self._input_queue.get()) - message = self.stt.text() - - if self.audio_chunks: - audio_bytes = bytearray(b"".join(self.audio_chunks)) - wav_file_path = bytes_to_wav(audio_bytes, "audio/raw") - print("wav_file_path ", wav_file_path) - self.audio_chunks = [] + if self.debug: + start_stt = time.time() + message = self.stt.text() + end_stt = time.time() + self.stt_latency = end_stt - start_stt + print("STT LATENCY", self.stt_latency) + + if self.audio_chunks: + audio_bytes = bytearray(b"".join(self.audio_chunks)) + wav_file_path = bytes_to_wav(audio_bytes, "audio/raw") + print("wav_file_path ", wav_file_path) + self.audio_chunks = [] + else: + message = self.stt.text() print(message) @@ 
-204,11 +223,22 @@ class AsyncInterpreter: "end": True, } ) + if self.debug: + end_tts = time.time() + self.tts_latency = end_tts - self.tts.stream_start_time + print("TTS LATENCY", self.tts_latency) self.tts.stop() + break async def _on_tts_chunk_async(self, chunk): # print("adding chunk to queue") + if self.debug and self.tffytfp is not None and self.tffytfp != 0: + print( + "time from first yield to first put is ", + time.time() - self.tffytfp, + ) + self.tffytfp = 0 await self._add_to_queue(self._output_queue, chunk) def on_tts_chunk(self, chunk): diff --git a/software/source/server/async_server.py b/software/source/server/async_server.py index 53f38ed..8bb91a3 100644 --- a/software/source/server/async_server.py +++ b/software/source/server/async_server.py @@ -12,7 +12,7 @@ from .profiles.default import interpreter as base_interpreter import asyncio import traceback import json -from fastapi import FastAPI, WebSocket +from fastapi import FastAPI, WebSocket, Depends from fastapi.responses import PlainTextResponse from uvicorn import Config, Server from .async_interpreter import AsyncInterpreter @@ -23,8 +23,6 @@ import os os.environ["STT_RUNNER"] = "server" os.environ["TTS_RUNNER"] = "server" -# interpreter.tts set in the profiles directory!!!! -interpreter = AsyncInterpreter(base_interpreter) app = FastAPI() @@ -37,15 +35,24 @@ app.add_middleware( ) +async def get_debug_flag(): + return app.state.debug + + @app.get("/ping") async def ping(): return PlainTextResponse("pong") @app.websocket("/") -async def websocket_endpoint(websocket: WebSocket): +async def websocket_endpoint( + websocket: WebSocket, debug: bool = Depends(get_debug_flag) +): await websocket.accept() + # interpreter.tts set in the profiles directory!!!! 
+ interpreter = AsyncInterpreter(base_interpreter, debug) + # Send the tts_service value to the client await websocket.send_text( json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts}) @@ -91,7 +98,9 @@ async def websocket_endpoint(websocket: WebSocket): await websocket.close() -async def main(server_host, server_port): +async def main(server_host, server_port, debug): + app.state.debug = debug + print(f"Starting server on {server_host}:{server_port}") config = Config(app, host=server_host, port=server_port, lifespan="on") server = Server(config) diff --git a/software/start.py b/software/start.py index f93d872..0808b0f 100644 --- a/software/start.py +++ b/software/start.py @@ -41,6 +41,11 @@ def run( qr: bool = typer.Option( False, "--qr", help="Display QR code to scan to connect to the server" ), + debug: bool = typer.Option( + False, + "--debug", + help="Print latency measurements and save microphone recordings locally for manual playback.", + ), ): _run( server=server, @@ -52,6 +57,7 @@ def run( server_url=server_url, client_type=client_type, qr=qr, + debug=debug, ) @@ -65,6 +71,7 @@ def _run( server_url: str = None, client_type: str = "auto", qr: bool = False, + debug: bool = False, ): system_type = platform.system() @@ -93,6 +100,7 @@ def _run( main( server_host, server_port, + debug, ), ), ) @@ -125,7 +133,7 @@ def _run( f".clients.{client_type}.device", package="source" ) - client_thread = threading.Thread(target=module.main, args=[server_url]) + client_thread = threading.Thread(target=module.main, args=[server_url, debug]) client_thread.start() try: From 1c4be961c278e75f1363aebdf4d249056e4ded2a Mon Sep 17 00:00:00 2001 From: Ben Xu Date: Mon, 24 Jun 2024 12:58:16 -0400 Subject: [PATCH 2/6] fix self.tffytfp in async interpreter --- software/source/server/async_interpreter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/software/source/server/async_interpreter.py b/software/source/server/async_interpreter.py index 
7b3f6ae..0cd2ef7 100644 --- a/software/source/server/async_interpreter.py +++ b/software/source/server/async_interpreter.py @@ -132,8 +132,8 @@ class AsyncInterpreter: # Experimental: The AI voice sounds better with replacements like these, but it should happen at the TTS layer # content = content.replace(". ", ". ... ").replace(", ", ", ... ").replace("!", "! ... ").replace("?", "? ... ") # print("yielding ", content) - if self.time_from_first_yield_to_first_put is None: - self.time_from_first_yield_to_first_put = time.time() + if self.tffytfp is None: + self.tffytfp = time.time() yield content From 0e68bb7125d063b170c2e9cdcc927481157771e6 Mon Sep 17 00:00:00 2001 From: Ben Xu Date: Mon, 24 Jun 2024 13:00:27 -0400 Subject: [PATCH 3/6] add docs fixes for esp32 and async interpreter --- README.md | 2 +- software/source/server/async_interpreter.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ebd8488..c14c382 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ If you want to run local speech-to-text using Whisper, you must install Rust. Fo To customize the behavior of the system, edit the [system message, model, skills library path,](https://docs.openinterpreter.com/settings/all-settings) etc. in the `profiles` directory under the `server` directory. This file sets up an interpreter, and is powered by Open Interpreter. -To specify the text-to-speech service for the 01 `base_device.py`, set `interpreter.tts` to either "openai" for OpenAI, "elevenlabs" for ElevenLabs, or "coqui" for Coqui (local) in a profile. For the 01 Light, set `SPEAKER_SAMPLE_RATE` to 24000 for Coqui (local) or 22050 for OpenAI TTS. We currently don't support ElevenLabs TTS on the 01 Light. +To specify the text-to-speech service for the 01 `base_device.py`, set `interpreter.tts` to either "openai" for OpenAI, "elevenlabs" for ElevenLabs, or "coqui" for Coqui (local) in a profile. 
For the 01 Light, set `SPEAKER_SAMPLE_RATE` in `client.ino` under the `esp32` client directory to 24000 for Coqui (local) or 22050 for OpenAI TTS. We currently don't support ElevenLabs TTS on the 01 Light. ## Ubuntu Dependencies diff --git a/software/source/server/async_interpreter.py b/software/source/server/async_interpreter.py index 0cd2ef7..d0bac0a 100644 --- a/software/source/server/async_interpreter.py +++ b/software/source/server/async_interpreter.py @@ -25,6 +25,7 @@ class AsyncInterpreter: self.stt_latency = None self.tts_latency = None self.interpreter_latency = None + # time from first yield to first put self.tffytfp = None self.debug = debug From fda23e95b22a679f6b1197c3d1f78d9caece8521 Mon Sep 17 00:00:00 2001 From: killian <63927363+KillianLucas@users.noreply.github.com> Date: Wed, 10 Jul 2024 10:56:54 -0700 Subject: [PATCH 4/6] Implemented `profiles` --- software/source/server/async_server.py | 34 ++++++++----------- software/source/server/tunnel.py | 9 +++-- software/start.py | 47 ++++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 24 deletions(-) diff --git a/software/source/server/async_server.py b/software/source/server/async_server.py index 8bb91a3..849f72d 100644 --- a/software/source/server/async_server.py +++ b/software/source/server/async_server.py @@ -1,14 +1,3 @@ -# import from the profiles directory the interpreter to be served - -# add other profiles to the directory to define other interpreter instances and import them here -# {.profiles.fast: optimizes for STT/TTS latency with the fastest models } -# {.profiles.local: uses local models and local STT/TTS } -# {.profiles.default: uses default interpreter settings with optimized TTS latency } - -# from .profiles.fast import interpreter as base_interpreter -# from .profiles.local import interpreter as base_interpreter -from .profiles.default import interpreter as base_interpreter - import asyncio import traceback import json @@ -19,6 +8,7 @@ from .async_interpreter import 
AsyncInterpreter from fastapi.middleware.cors import CORSMiddleware from typing import List, Dict, Any import os +import importlib.util os.environ["STT_RUNNER"] = "server" os.environ["TTS_RUNNER"] = "server" @@ -50,14 +40,6 @@ async def websocket_endpoint( ): await websocket.accept() - # interpreter.tts set in the profiles directory!!!! - interpreter = AsyncInterpreter(base_interpreter, debug) - - # Send the tts_service value to the client - await websocket.send_text( - json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts}) - ) - try: async def receive_input(): @@ -98,9 +80,21 @@ async def websocket_endpoint( await websocket.close() -async def main(server_host, server_port, debug): +async def main(server_host, server_port, profile, debug): + app.state.debug = debug + # Load the profile module from the provided path + spec = importlib.util.spec_from_file_location("profile", profile) + profile_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(profile_module) + + # Get the interpreter from the profile + interpreter = profile_module.interpreter + + # Make it async + interpreter = AsyncInterpreter(interpreter, debug) + print(f"Starting server on {server_host}:{server_port}") config = Config(app, host=server_host, port=server_port, lifespan="on") server = Server(config) diff --git a/software/source/server/tunnel.py b/software/source/server/tunnel.py index f25a0b3..a40c0f3 100644 --- a/software/source/server/tunnel.py +++ b/software/source/server/tunnel.py @@ -6,7 +6,7 @@ from ..utils.print_markdown import print_markdown def create_tunnel( - tunnel_method="ngrok", server_host="localhost", server_port=10001, qr=False + tunnel_method="ngrok", server_host="localhost", server_port=10001, qr=False, domain=None ): print_markdown("Exposing server to the internet...") @@ -99,8 +99,13 @@ def create_tunnel( # If ngrok is installed, start it on the specified port # process = subprocess.Popen(f'ngrok http {server_port} --log=stdout', 
shell=True, stdout=subprocess.PIPE) + + if domain: + domain = f"--domain={domain}" + else: + domain = "" process = subprocess.Popen( - f"ngrok http {server_port} --scheme http,https --log=stdout", + f"ngrok http {server_port} --scheme http,https {domain} --log=stdout", shell=True, stdout=subprocess.PIPE, ) diff --git a/software/start.py b/software/start.py index 0808b0f..28c5675 100644 --- a/software/start.py +++ b/software/start.py @@ -6,6 +6,7 @@ import os import importlib from source.server.tunnel import create_tunnel from source.server.async_server import main +import subprocess import signal @@ -41,11 +42,25 @@ def run( qr: bool = typer.Option( False, "--qr", help="Display QR code to scan to connect to the server" ), + domain: str = typer.Option( + None, "--domain", help="Connect ngrok to a custom domain" + ), + profiles: bool = typer.Option( + False, + "--profiles", + help="Opens the folder where this script is contained", + ), + profile: str = typer.Option( + "default.py", # default + "--profile", + help="Specify the path to the profile, or the name of the file if it's in the `profiles` directory (run `--profiles` to open the profiles directory)", + ), debug: bool = typer.Option( False, "--debug", help="Print latency measurements and save microphone recordings locally for manual playback.", ), + ): _run( server=server, @@ -58,6 +73,9 @@ def run( client_type=client_type, qr=qr, debug=debug, + domain=domain, + profiles=profiles, + profile=profile, ) @@ -72,8 +90,33 @@ def _run( client_type: str = "auto", qr: bool = False, debug: bool = False, + domain = None, + profiles = None, + profile = None, ): + profiles_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "source", "server", "profiles") + + if profiles: + if platform.system() == "Windows": + subprocess.Popen(['explorer', profiles_dir]) + elif platform.system() == "Darwin": + subprocess.Popen(['open', profiles_dir]) + elif platform.system() == "Linux": + subprocess.Popen(['xdg-open', 
profiles_dir]) + else: + subprocess.Popen(['open', profiles_dir]) + exit(0) + + if profile: + if not os.path.isfile(profile): + profile = os.path.join(profiles_dir, profile) + if not os.path.isfile(profile): + profile += ".py" + if not os.path.isfile(profile): + print(f"Invalid profile path: {profile}") + exit(1) + system_type = platform.system() if system_type == "Windows": server_host = "localhost" @@ -91,7 +134,6 @@ def _run( signal.signal(signal.SIGINT, handle_exit) if server: - # print(f"Starting server with mobile = {mobile}") loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) server_thread = threading.Thread( @@ -100,6 +142,7 @@ def _run( main( server_host, server_port, + profile, debug, ), ), @@ -108,7 +151,7 @@ def _run( if expose: tunnel_thread = threading.Thread( - target=create_tunnel, args=[tunnel_service, server_host, server_port, qr] + target=create_tunnel, args=[tunnel_service, server_host, server_port, qr, domain] ) tunnel_thread.start() From d13c0cf3a4303b3838f494397afd7d2d296de57a Mon Sep 17 00:00:00 2001 From: killian <63927363+KillianLucas@users.noreply.github.com> Date: Wed, 10 Jul 2024 11:08:37 -0700 Subject: [PATCH 5/6] Open Interpreter compatible `--profiles` --- software/source/server/async_server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/software/source/server/async_server.py b/software/source/server/async_server.py index 849f72d..0772301 100644 --- a/software/source/server/async_server.py +++ b/software/source/server/async_server.py @@ -92,6 +92,10 @@ async def main(server_host, server_port, profile, debug): # Get the interpreter from the profile interpreter = profile_module.interpreter + if not hasattr(interpreter, 'tts'): + print("Setting TTS provider to default: openai") + interpreter.tts = "openai" + # Make it async interpreter = AsyncInterpreter(interpreter, debug) From d8d7658f8a9b3c969efe77628cabde6b6b3271b0 Mon Sep 17 00:00:00 2001 From: killian <63927363+KillianLucas@users.noreply.github.com> Date: Wed, 
10 Jul 2024 11:14:27 -0700 Subject: [PATCH 6/6] Restored sending TTS service to client --- software/source/server/async_server.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/software/source/server/async_server.py b/software/source/server/async_server.py index 0772301..3897f56 100644 --- a/software/source/server/async_server.py +++ b/software/source/server/async_server.py @@ -40,6 +40,11 @@ async def websocket_endpoint( ): await websocket.accept() + # Send the tts_service value to the client + await websocket.send_text( + json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts}) + ) + try: async def receive_input():