diff --git a/.gitignore b/.gitignore index 91c9774..ffe6b33 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ ggml-*.bin - +OS/01/local_tts/* # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/OS/01/device.py b/OS/01/device.py index 154cc66..edf7a60 100644 --- a/OS/01/device.py +++ b/OS/01/device.py @@ -1,7 +1,6 @@ import asyncio import threading import os -import logging import pyaudio from starlette.websockets import WebSocket from queue import Queue @@ -21,8 +20,9 @@ from utils.kernel import put_kernel_messages_into_queue from stt import stt_wav import asyncio -# Configure logging -logging.basicConfig(format='%(message)s', level=logging.getLevelName(os.getenv('DEBUG_LEVEL', 'INFO').upper())) +from utils.logs import setup_logging +from utils.logs import logger +setup_logging() # Configuration for Audio Recording CHUNK = 1024 # Record in chunks of 1024 samples @@ -49,7 +49,7 @@ def record_audio(): """Record audio from the microphone and add it to the queue.""" stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) - logging.info("Recording started...") + logger.info("Recording started...") global RECORDING # Create a temporary WAV file to store the audio data @@ -67,7 +67,7 @@ def record_audio(): wav_file.close() stream.stop_stream() stream.close() - logging.info("Recording stopped.") + logger.info("Recording stopped.") duration = wav_file.getnframes() / RATE if duration < 0.3: @@ -118,7 +118,7 @@ def on_release(key): if key == keyboard.Key.space: toggle_recording(False) elif key == keyboard.Key.esc: - logging.info("Exiting...") + logger.info("Exiting...") os._exit(0) @@ -133,11 +133,12 @@ async def message_sender(websocket): async def websocket_communication(WS_URL): while True: try: - logging.info(f"Connecting to `{WS_URL}` ...") + logger.info(f"Connecting to `{WS_URL}` ...") headers = {"ngrok-skip-browser-warning": str(80), "User-Agent": "project01"} if os.getenv('NGROK_AUTHTOKEN') else 
{} async with websockets.connect(WS_URL, extra_headers=headers) as websocket: - logging.info("Press the spacebar to start/stop recording. Press ESC to exit.") + logger.info("Press the spacebar to start/stop recording. Press ESC to exit.") + asyncio.create_task(message_sender(websocket)) initial_message = {"role": None, "type": None, "format": None, "content": None} @@ -146,16 +147,18 @@ async def websocket_communication(WS_URL): while True: message = await websocket.recv() - logging.info(f"Got this message from the server: {type(message)} {message}") + logger.debug(f"Got this message from the server: {type(message)} {message}") if type(message) == str: message = json.loads(message) if message.get("end"): - logging.info(f"Complete message from the server: {message_so_far}") + logger.debug(f"Complete message from the server: {message_so_far}") + logger.info("\n") message_so_far = initial_message if "content" in message: + print(message['content'], end="", flush=True) if any(message_so_far[key] != message[key] for key in message_so_far if key != "content"): message_so_far = message else: @@ -181,6 +184,7 @@ async def websocket_communication(WS_URL): result = interpreter.computer.run(language, code) send_queue.put(result) + except Exception as e: logging.exception(f"An error occurred during websocket communication. 
{e}") logging.info(f"Connecting to `{WS_URL}`...") diff --git a/OS/01/server.py b/OS/01/server.py index b40c107..39a67b8 100644 --- a/OS/01/server.py +++ b/OS/01/server.py @@ -3,7 +3,6 @@ import ast import json import queue import os -import logging import traceback import re from fastapi import FastAPI @@ -20,8 +19,10 @@ from interpreter import interpreter import ngrok import signal -# Configure logging -logging.basicConfig(format='%(message)s', level=logging.getLevelName(os.getenv('DEBUG_LEVEL', 'INFO').upper())) +from utils.logs import setup_logging +from utils.logs import logger +setup_logging() + app = FastAPI() @@ -65,10 +66,10 @@ if os.getenv('CODE_RUNNER') == "device": to_device.put({"role": "assistant", "type": "code", "format": "python", "end": True}) # Stream the response - logging.info("Waiting for the device to respond...") + logger.info("Waiting for the device to respond...") while True: chunk = from_computer.get() - logging.info(f"Server received from device: {chunk}") + logger.info(f"Server received from device: {chunk}") if "end" in chunk: break yield chunk @@ -99,7 +100,7 @@ async def websocket_endpoint(websocket: WebSocket): await asyncio.gather(receive_task, send_task) except Exception as e: traceback.print_exc() - logging.info(f"Connection lost. Error: {e}") + logger.info(f"Connection lost. 
Error: {e}") async def receive_messages(websocket: WebSocket): while True: @@ -114,7 +115,7 @@ async def receive_messages(websocket: WebSocket): async def send_messages(websocket: WebSocket): while True: message = await to_device.get() - logging.debug(f"Sending to the device: {type(message)} {message}") + logger.debug(f"Sending to the device: {type(message)} {message}") await websocket.send_json(message) async def listener(): @@ -164,7 +165,7 @@ async def listener(): for chunk in interpreter.chat(messages, stream=True, display=False): - logging.debug("Got chunk:", chunk) + logger.debug("Got chunk:", chunk) # Send it to the user await to_device.put(chunk) @@ -200,7 +201,7 @@ async def listener(): with open(conversation_history_path, 'w') as file: json.dump(interpreter.messages, file, indent=4) - logging.info("New user message recieved. Breaking.") + logger.info("New user message recieved. Breaking.") break # Also check if there's any new computer messages @@ -209,7 +210,7 @@ async def listener(): with open(conversation_history_path, 'w') as file: json.dump(interpreter.messages, file, indent=4) - logging.info("New computer message recieved. Breaking.") + logger.info("New computer message recieved. 
Breaking.") break else: with open(conversation_history_path, 'w') as file: @@ -249,16 +250,16 @@ if __name__ == "__main__": # Set up Ngrok ngrok_auth_token = os.getenv('NGROK_AUTHTOKEN') if ngrok_auth_token is not None: - logging.info("Setting up Ngrok") + logger.info("Setting up Ngrok") ngrok_listener = await ngrok.forward(f"{parsed_url.hostname}:{parsed_url.port}", authtoken=ngrok_auth_token) ngrok_parsed_url = urllib.parse.urlparse(ngrok_listener.url()) # Setup SERVER_URL environment variable for device to use connection_url = f"wss://{ngrok_parsed_url.hostname}/" - logging.info(f"Ngrok established at {ngrok_parsed_url.geturl()}") - logging.info(f"\033[1mSERVER_CONNECTION_URL should be set to \"{connection_url}\"\033[0m") + logger.info(f"Ngrok established at {ngrok_parsed_url.geturl()}") + logger.info(f"\033[1mSERVER_CONNECTION_URL should be set to \"{connection_url}\"\033[0m") - logging.info("Starting `server.py`...") + logger.info("Starting `server.py`...") config = Config(app, host=parsed_url.hostname, port=parsed_url.port, lifespan='on') server = Server(config) diff --git a/OS/01/start.sh b/OS/01/start.sh index 71bc436..8107a17 100755 --- a/OS/01/start.sh +++ b/OS/01/start.sh @@ -10,6 +10,11 @@ export OPENAI_API_KEY="sk-..." # Uncomment following line with your Ngrok auth token (https://dashboard.ngrok.com/get-started/your-authtoken) # export NGROK_AUTHTOKEN="AUTH_TOKEN" +# For TTS, we use the en_US-lessac-medium voice model by default +# Please change the voice URL and voice name if you wish to use another voice +export PIPER_VOICE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/" +export PIPER_VOICE_NAME="en_US-lessac-medium.onnx" + # If SERVER_START, this is where we'll serve the server. # If DEVICE_START, this is where the device expects the server to be. export SERVER_URL=ws://localhost:8000/ @@ -26,12 +31,52 @@ export STT_RUNNER=device # If server, audio will be sent over websocket. 
export SERVER_EXPOSE_PUBLICALLY=False # Debug level -# export DEBUG_LEVEL=DEBUG -export DEBUG_LEVEL="INFO" +# export LOG_LEVEL=DEBUG +export LOG_LEVEL="INFO" ### SETUP +# if using local models, install the models / executables +if [[ "$ALL_LOCAL" == "True" ]]; then + OS=$(uname -s) + ARCH=$(uname -m) + if [ "$OS" = "Darwin" ]; then + OS="macos" + if [ "$ARCH" = "arm64" ]; then + ARCH="aarch64" + elif [ "$ARCH" = "x86_64" ]; then + ARCH="x64" + else + echo "Piper: unsupported architecture" + fi + fi + PIPER_ASSETNAME="piper_${OS}_${ARCH}.tar.gz" + PIPER_URL="https://github.com/rhasspy/piper/releases/latest/download/" + mkdir local_tts + cd local_tts + curl -OL "${PIPER_URL}${PIPER_ASSETNAME}" + tar -xvzf $PIPER_ASSETNAME + cd piper + if [ "$OS" = "macos" ]; then + if [ "$ARCH" = "x64" ]; then + softwareupdate --install-rosetta --agree-to-license + fi + PIPER_PHONEMIZE_ASSETNAME="piper-phonemize_${OS}_${ARCH}.tar.gz" + PIPER_PHONEMIZE_URL="https://github.com/rhasspy/piper-phonemize/releases/latest/download/" + + curl -OL "${PIPER_PHONEMIZE_URL}${PIPER_PHONEMIZE_ASSETNAME}" + tar -xvzf $PIPER_PHONEMIZE_ASSETNAME + curl -OL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}" + curl -OL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}.json" + PIPER_DIR=`pwd` + install_name_tool -change @rpath/libespeak-ng.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libespeak-ng.1.dylib" "${PIPER_DIR}/piper" + install_name_tool -change @rpath/libonnxruntime.1.14.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libonnxruntime.1.14.1.dylib" "${PIPER_DIR}/piper" + install_name_tool -change @rpath/libpiper_phonemize.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libpiper_phonemize.1.dylib" "${PIPER_DIR}/piper" + fi + cd ../.. +fi + # (for dev, reset the ports we were using) SERVER_PORT=$(echo $SERVER_URL | grep -oE "[0-9]+") diff --git a/OS/01/stt.py b/OS/01/stt.py index fef4ebb..aa14e24 100644 --- a/OS/01/stt.py +++ b/OS/01/stt.py @@ -4,7 +4,6 @@ Defines a function which takes a path to an audio file and turns it into text. 
from datetime import datetime import os -import logging import contextlib import tempfile import ffmpeg @@ -12,8 +11,9 @@ import subprocess import openai from openai import OpenAI -# Configure logging -logging.basicConfig(format='%(message)s', level=logging.getLevelName(os.getenv('DEBUG_LEVEL', 'INFO').upper())) +from utils.logs import setup_logging +from utils.logs import logger +setup_logging() client = OpenAI() @@ -85,10 +85,10 @@ def stt_wav(wav_file_path: str): response_format="text" ) except openai.BadRequestError as e: - logging.info(f"openai.BadRequestError: {e}") + logger.info(f"openai.BadRequestError: {e}") return None - logging.info(f"Transcription result: {transcript}") + logger.info(f"Transcription result: {transcript}") return transcript else: temp_dir = tempfile.gettempdir() diff --git a/OS/01/tts.py b/OS/01/tts.py index 024da65..e51972a 100644 --- a/OS/01/tts.py +++ b/OS/01/tts.py @@ -7,20 +7,37 @@ from openai import OpenAI from pydub import AudioSegment from pydub.playback import play from playsound import playsound +import os +import subprocess +import tempfile client = OpenAI() def tts(text, play_audio): - response = client.audio.speech.create( - model="tts-1", - voice="alloy", - input=text, - response_format="mp3" - ) - with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file: - response.stream_to_file(temp_file.name) - - if play_audio: - playsound(temp_file.name) - - return temp_file.read() + if os.getenv('ALL_LOCAL') == 'False': + response = client.audio.speech.create( + model="tts-1", + voice="alloy", + input=text, + response_format="mp3" + ) + with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file: + response.stream_to_file(temp_file.name) + + if play_audio: + playsound(temp_file.name) + + return temp_file.read() + else: + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file: + output_file = temp_file.name + piper_dir = os.path.join(os.path.dirname(__file__), 'local_tts', 'piper') + subprocess.run([ + 
os.path.join(piper_dir, 'piper'), + '--model', os.path.join(piper_dir, os.getenv('PIPER_VOICE_NAME')), + '--output_file', output_file + ], input=text, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + if play_audio: + playsound(temp_file.name) + return temp_file.read() diff --git a/OS/01/utils/kernel.py b/OS/01/utils/kernel.py index dc58272..7f1bb3c 100644 --- a/OS/01/utils/kernel.py +++ b/OS/01/utils/kernel.py @@ -1,11 +1,10 @@ import asyncio import subprocess import platform -import os -import logging -# Configure logging -logging.basicConfig(format='%(message)s', level=logging.getLevelName(os.getenv('DEBUG_LEVEL', 'INFO').upper())) +from utils.logs import setup_logging +from utils.logs import logger +setup_logging() def get_kernel_messages(): """ @@ -21,7 +20,7 @@ def get_kernel_messages(): with open('/var/log/dmesg', 'r') as file: return file.read() else: - logging.info("Unsupported platform.") + logger.info("Unsupported platform.") def custom_filter(message): # Check for {TO_INTERPRETER{ message here }TO_INTERPRETER} pattern @@ -33,7 +32,7 @@ def custom_filter(message): elif 'USB' in message: return message # Check for network related keywords - elif any(keyword in message for keyword in ['network', 'IP', 'internet', 'LAN', 'WAN', 'router', 'switch']): + elif any(keyword in message for keyword in ['network', 'IP', 'internet', 'LAN', 'WAN', 'router', 'switch']) and "networkStatusForFlags" not in message: return message else: return None diff --git a/OS/01/utils/logs.py b/OS/01/utils/logs.py new file mode 100644 index 0000000..a73dea1 --- /dev/null +++ b/OS/01/utils/logs.py @@ -0,0 +1,22 @@ +import os +import logging + +logger: logging.Logger = logging.getLogger("01") +root_logger: logging.Logger = logging.getLogger() + + +def _basic_config() -> None: + logging.basicConfig( + format="%(message)s" + ) + + +def setup_logging() -> None: + env = os.environ.get("LOG_LEVEL", "").upper() + if env == "DEBUG": + _basic_config() + 
logger.setLevel(logging.DEBUG) + root_logger.setLevel(logging.DEBUG) + elif env == "INFO": + _basic_config() + logger.setLevel(logging.INFO) diff --git a/README.md b/README.md index 24057cf..5350605 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ sudo apt-get install portaudio19-dev libav-tools ```bash python -m pip install -r requirements.txt ``` +NB: Depending on your local Python version, you may run into [this issue↗](https://github.com/TaylorSMarks/playsound/issues/150) when installing playsound. Workarounds are provided in the issue. If you want to run local speech-to-text from whisper, download the GGML Whisper model from [Huggingface](https://huggingface.co/ggerganov/whisper.cpp). Then in `OS/01/start.sh`, set `ALL_LOCAL=TRUE` and set `WHISPER_MODEL_PATH` to the path of the model. diff --git a/hardware/devices/jetson-nano/README.md b/hardware/devices/jetson-nano/README.md new file mode 100644 index 0000000..600bda4 --- /dev/null +++ b/hardware/devices/jetson-nano/README.md @@ -0,0 +1,22 @@ +# Development Setup for Jetson Nano + +1. Go through the tutorial here: https://developer.nvidia.com/embedded/learn/get-started-jetson-nano-devkit#intro + +2. At the end of that guide, you should have a Jetson running off a power supply or micro USB. + +3. Get network connectivity. The Jetson does not have a WiFi module so you will need to plug in Ethernet. + If you have a laptop, you can share internet access over Ethernet. + + To do this with Mac, do the following: + + a. Plug a cable from the Jetson Ethernet port to your Mac (you can use an Ethernet -> USB converter for your Mac). + + b. Go to General->Sharing, then click the little `(i)` icon next to "Internet Sharing", and check all the options. + + ![](mac-share-internet.png) + + c. Go back to General->Sharing, and turn on "Internet Sharing". + + ![](mac-share-internet-v2.png) + + d. Now the Jetson should have connectivity!
\ No newline at end of file diff --git a/hardware/devices/jetson-nano/mac-share-internet-v2.png b/hardware/devices/jetson-nano/mac-share-internet-v2.png new file mode 100644 index 0000000..74e1de4 Binary files /dev/null and b/hardware/devices/jetson-nano/mac-share-internet-v2.png differ diff --git a/hardware/devices/jetson-nano/mac-share-internet.png b/hardware/devices/jetson-nano/mac-share-internet.png new file mode 100644 index 0000000..51aaa5d Binary files /dev/null and b/hardware/devices/jetson-nano/mac-share-internet.png differ