diff --git a/software/source/clients/base_device.py b/software/source/clients/base_device.py index 923ee9e..b713601 100644 --- a/software/source/clients/base_device.py +++ b/software/source/clients/base_device.py @@ -152,6 +152,15 @@ class Device: async def play_audiosegments(self): """Plays them sequentially.""" + + mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"] + mpv_process = subprocess.Popen( + mpv_command, + stdin=subprocess.PIPE, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + while True: try: audio = await self.audiosegments.get() @@ -161,6 +170,10 @@ class Device: print(f"Time from request to playback: {elapsed_time} seconds") self.playback_latency = None + if audio is not None: + mpv_process.stdin.write(audio) # type: ignore + mpv_process.stdin.flush() # type: ignore + """ args = ["ffplay", "-autoexit", "-", "-nodisp"] proc = subprocess.Popen( args=args, @@ -171,7 +184,8 @@ class Device: out, err = proc.communicate(input=audio) proc.poll() - # play(audio) + play(audio) + """ # self.audiosegments.remove(audio) # await asyncio.sleep(0.1) except asyncio.exceptions.CancelledError: @@ -391,7 +405,7 @@ class Device: # signed 16-bit little-endian format sample_width=2, # 24,000 Hz frame rate - frame_rate=24000, + frame_rate=16000, # mono sound channels=1, ) @@ -400,6 +414,8 @@ class Device: # print("audio segment was created") await self.audiosegments.put(audio_bytes) + # await self.audiosegments.put(audio) + # Run the code if that's the client's job if os.getenv("CODE_RUNNER") == "client": if message["type"] == "code" and "end" in message: @@ -434,7 +450,8 @@ class Device: async def start_async(self): print("start async was called!!!!!") # Configuration for WebSocket - WS_URL = f"ws://{self.server_url}/ws" + WS_URL = f"ws://{self.server_url}" + # Start the WebSocket communication asyncio.create_task(self.websocket_communication(WS_URL)) diff --git a/software/source/server/async_interpreter.py b/software/source/server/async_interpreter.py index b4a11d3..209ff73 100644 --- a/software/source/server/async_interpreter.py +++ b/software/source/server/async_interpreter.py @@ -24,6 +24,7 @@ from RealtimeSTT import AudioToTextRecorder import time import asyncio import json +import os class AsyncInterpreter: @@ -47,9 +48,7 @@ class AsyncInterpreter: elif self.interpreter.tts == "gtts": engine = GTTSEngine() elif self.interpreter.tts == "elevenlabs": - engine = ElevenlabsEngine( - api_key="sk_077cb1cabdf67e62b85f8782e66e5d8e11f78b450c7ce171" - ) + engine = ElevenlabsEngine(api_key=os.environ["ELEVEN_LABS_API_KEY"]) elif self.interpreter.tts == "system": engine = SystemEngine() else: @@ -96,12 +95,12 @@ class AsyncInterpreter: pass if "start" in chunk: - print("input received") + # print("Starting STT") self.stt.start() self._last_lmc_start_flag = time.time() # self.interpreter.computer.terminal.stop() # Stop any code execution... maybe we should make interpreter.stop()? elif "end" in chunk: - print("running oi on input now") + # print("Running OI on input") asyncio.create_task(self.run()) else: await self._add_to_queue(self._input_queue, chunk) @@ -139,6 +138,7 @@ class AsyncInterpreter: print("STT LATENCY", self.stt_latency) # print(message) + end_interpreter = 0 # print(message) def generate(message): @@ -165,7 +165,7 @@ class AsyncInterpreter: # Experimental: The AI voice sounds better with replacements like these, but it should happen at the TTS layer # content = content.replace(". ", ". ... ").replace(", ", ", ... ").replace("!", "! ... ").replace("?", "? ... ") - print("yielding this", content) + # print("yielding this", content) yield content # Handle code blocks @@ -214,6 +214,7 @@ class AsyncInterpreter: while True: if self.tts.is_playing(): start_tts = time.time() + break await asyncio.sleep(0.1) while True: @@ -231,6 +232,7 @@ class AsyncInterpreter: end_tts = time.time() self.tts_latency = end_tts - start_tts print("TTS LATENCY", self.tts_latency) + self.tts.stop() break async def _on_tts_chunk_async(self, chunk): diff --git a/software/source/server/async_server.py b/software/source/server/async_server.py index ff8a466..cf80ea1 100644 --- a/software/source/server/async_server.py +++ b/software/source/server/async_server.py @@ -2,6 +2,7 @@ import asyncio import traceback import json from fastapi import FastAPI, WebSocket, Header +from fastapi.responses import PlainTextResponse from uvicorn import Config, Server from interpreter import interpreter as base_interpreter from .async_interpreter import AsyncInterpreter @@ -18,38 +19,25 @@ base_interpreter.system_message = ( "You are a helpful assistant that can answer questions and help with tasks." ) base_interpreter.computer.import_computer_api = False -base_interpreter.llm.model = "groq/mixtral-8x7b-32768" -base_interpreter.llm.api_key = ( - "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" -) +base_interpreter.llm.model = "groq/llama3-8b-8192" +base_interpreter.llm.api_key = os.environ["GROQ_API_KEY"] +print(base_interpreter.llm.api_key) base_interpreter.llm.supports_functions = False +base_interpreter.auto_run = True os.environ["STT_RUNNER"] = "server" os.environ["TTS_RUNNER"] = "server" # Parse command line arguments for port number +""" parser = argparse.ArgumentParser(description="FastAPI server.") parser.add_argument("--port", type=int, default=8000, help="Port to run on.") args = parser.parse_args() - +""" base_interpreter.tts = "elevenlabs" -async def main(): - """ - sentry_sdk.init( - dsn="https://a1465f62a31c7dfb23e1616da86341e9@o4506046614667264.ingest.us.sentry.io/4507374662385664", - enable_tracing=True, - # Set traces_sample_rate to 1.0 to capture 100% - # of transactions for performance monitoring. - traces_sample_rate=1.0, - # Set profiles_sample_rate to 1.0 to profile 100% - # of sampled transactions. - # We recommend adjusting this value in production. - profiles_sample_rate=1.0, - ) - """ - +async def main(server_host, server_port): interpreter = AsyncInterpreter(base_interpreter) app = FastAPI() @@ -62,6 +50,10 @@ async def main(): allow_headers=["*"], # Allow all headers ) + @app.get("/ping") + async def ping(): + return PlainTextResponse("pong") + @app.post("/load_chat") async def load_chat(messages: List[Dict[str, Any]]): interpreter.interpreter.messages = messages @@ -69,7 +61,7 @@ async def main(): print("🪼🪼🪼🪼🪼🪼 Messages loaded: ", interpreter.active_chat_messages) return {"status": "success"} - @app.websocket("/ws") + @app.websocket("/") async def websocket_endpoint(websocket: WebSocket): await websocket.accept() try: @@ -85,7 +77,7 @@ async def main(): await interpreter.input(data) elif "bytes" in data: await interpreter.input(data["bytes"]) - # print("SERVER FEEDING AUDIO") + # print("RECEIVED INPUT", data) elif "text" in data: # print("RECEIVED INPUT", data) await interpreter.input(data["text"]) @@ -111,7 +103,8 @@ async def main(): if not websocket.client_state == "DISCONNECTED": await websocket.close() - config = Config(app, host="0.0.0.0", port=8000, lifespan="on") + print(f"Starting server on {server_host}:{server_port}") + config = Config(app, host=server_host, port=server_port, lifespan="on") server = Server(config) await server.serve() diff --git a/software/start.py b/software/start.py index 5f99fef..7c5186e 100644 --- a/software/start.py +++ b/software/start.py @@ -22,7 +22,7 @@ def run( help="Specify the server host where the server will deploy", ), server_port: int = typer.Option( - 8000, + 10001, "--server-port", help="Specify the server port where the server will deploy", ), @@ -152,8 +152,8 @@ def _run( target=loop.run_until_complete, args=( main( - # server_host, - # server_port, + server_host, + server_port, # llm_service, # model, # llm_supports_vision, @@ -196,7 +196,7 @@ def _run( module = importlib.import_module( f".clients.{client_type}.device", package="source" ) - server_url = "0.0.0.0:8000" + # server_url = "0.0.0.0:8000" client_thread = threading.Thread(target=module.main, args=[server_url]) print("client thread started") client_thread.start()