From 6f84f5a6867db25cca7458362cc7d5642faceaeb Mon Sep 17 00:00:00 2001 From: Shiven Mian Date: Sat, 3 Feb 2024 17:52:13 -0800 Subject: [PATCH 1/5] fix: commented out rpio requirement --- OS/01/requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/OS/01/requirements.txt b/OS/01/requirements.txt index daef207..83d21e0 100644 --- a/OS/01/requirements.txt +++ b/OS/01/requirements.txt @@ -2,4 +2,6 @@ git+https://github.com/KillianLucas/open-interpreter.git redis fastapi uvicorn -RPi.GPIO \ No newline at end of file +websockets +python-dotenv +ffmpeg-python \ No newline at end of file From a3de4c1286a2a66ab736ed72645d177103236407 Mon Sep 17 00:00:00 2001 From: Shiven Mian Date: Sat, 3 Feb 2024 17:52:26 -0800 Subject: [PATCH 2/5] fix: add gitignore --- OS/01/.gitignore | 160 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 OS/01/.gitignore diff --git a/OS/01/.gitignore b/OS/01/.gitignore new file mode 100644 index 0000000..6769e21 --- /dev/null +++ b/OS/01/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file From f749cb878edb9244b84b8784c38e6c3aaf7f79f5 Mon Sep 17 00:00:00 2001 From: Shiven Mian Date: Sat, 3 Feb 2024 18:56:06 -0800 Subject: [PATCH 3/5] feat: added Whisper stt --- OS/01/assistant/listen.py | 52 ++++++++++++++ OS/01/assistant/stt.py | 52 ++++++++++++++ OS/01/requirements.txt | 3 + OS/01/user/record.py | 141 ++++++++++++++++++++++++++++++++++++++ OS/01/user/user.py | 13 ---- 5 files changed, 248 insertions(+), 13 deletions(-) create mode 100644 OS/01/assistant/listen.py create mode 100644 OS/01/user/record.py delete mode 100644 OS/01/user/user.py diff --git a/OS/01/assistant/listen.py b/OS/01/assistant/listen.py new file mode 100644 index 0000000..948ef9a --- /dev/null +++ b/OS/01/assistant/listen.py @@ -0,0 +1,52 @@ +from fastapi import FastAPI, WebSocket +import uvicorn +import json +from stt import stt +import tempfile + +app = FastAPI() + +@app.websocket("/user") +async def user(ws: WebSocket): + await ws.accept() + audio_file = bytearray() + mime_type = None + + try: + while True: + message = await ws.receive() + + if message['type'] == 'websocket.disconnect': + break + + if message['type'] == 'websocket.receive': + if 'text' in message: + control_message = json.loads(message['text']) + if control_message.get('action') == 'command' and control_message.get('state') == 'start' and 'mimeType' in control_message: + # This indicates the start of a new audio file + mime_type = control_message.get('mimeType') + elif control_message.get('action') == 'command' and control_message.get('state') == 'end': + # This indicates the end of the audio file + # Process the complete audio file here + transcription = stt(audio_file, mime_type) + await ws.send_json({"transcript": transcription}) + + print("SENT TRANSCRIPTION!") + + # Reset the bytearray for the next audio file + audio_file = bytearray() + mime_type = None + elif 'bytes' in message: + # If it's not a control message, it's part of the audio file + audio_file.extend(message['bytes']) + + except Exception as e: + print(f"WebSocket connection closed with exception: {e}") + finally: + await ws.close() + print("WebSocket connection closed") + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory(): + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/OS/01/assistant/stt.py b/OS/01/assistant/stt.py index e69de29..d52f260 100644 --- a/OS/01/assistant/stt.py +++ b/OS/01/assistant/stt.py @@ -0,0 +1,52 @@ +from datetime import datetime +import os +import contextlib +import tempfile +import ffmpeg +import subprocess + +from openai import OpenAI +client = OpenAI() + 
+def convert_mime_type_to_format(mime_type: str) -> str: + if mime_type == "audio/x-wav" or mime_type == "audio/wav": + return "wav" + if mime_type == "audio/webm": + return "webm" + + return mime_type + +@contextlib.contextmanager +def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str: + temp_dir = tempfile.gettempdir() + + # Create a temporary file with the appropriate extension + input_ext = convert_mime_type_to_format(mime_type) + input_path = os.path.join(temp_dir, f"input_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.{input_ext}") + with open(input_path, 'wb') as f: + f.write(audio) + + # Export to wav + output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav") + ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run() + + print(f"Temporary file path: {output_path}") + + try: + yield output_path + finally: + os.remove(input_path) + os.remove(output_path) + + +def stt(audio_bytes: bytearray, mime_type): + with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path: + audio_file = open(wav_file_path, "rb") + transcript = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="text" + ) + + print("Exciting transcription result:", transcript) + return transcript diff --git a/OS/01/requirements.txt b/OS/01/requirements.txt index 83d21e0..4500632 100644 --- a/OS/01/requirements.txt +++ b/OS/01/requirements.txt @@ -1,4 +1,7 @@ git+https://github.com/KillianLucas/open-interpreter.git +asyncio +pyaudio +pynput redis fastapi uvicorn diff --git a/OS/01/user/record.py b/OS/01/user/record.py new file mode 100644 index 0000000..f376e49 --- /dev/null +++ b/OS/01/user/record.py @@ -0,0 +1,141 @@ +""" +Handles everything the user interacts through. + +Connects to a websocket at /user. Sends shit to it, and displays/plays the shit it sends back. + +For now, just handles a spacebar being pressed— for the duration it's pressed, +it should record audio. + +SIMPLEST POSSIBLE: Sends that audio to OpenAI whisper, gets the transcript, +sends it to /user in LMC format (role: user, etc) + +MOST FUTUREPROOF: Streams chunks of audio to /user, which will then handle stt in stt.py. 
+"""
+
+import os
+import pyaudio
+import threading
+import asyncio
+import websockets
+import json
+from pynput import keyboard
+import wave
+import tempfile
+from datetime import datetime
+
+# Configuration
+chunk = 1024 # Record in chunks of 1024 samples
+sample_format = pyaudio.paInt16 # 16 bits per sample
+channels = 1 # Mono
+fs = 48000 # Sample rate
+
+p = pyaudio.PyAudio() # Create an interface to PortAudio
+frames = [] # Initialize array to store frames
+recording = False # Flag to control recording state
+
+ws_chunk_size = 4096 # Websocket stream chunk size
+
+async def start_recording():
+    global recording
+
+    if recording:
+        return # Avoid multiple starts
+    recording = True
+    frames.clear() # Clear existing frames
+
+    stream = p.open(format=sample_format,
+                    channels=channels,
+                    rate=fs,
+                    frames_per_buffer=chunk,
+                    input=True)
+
+    print("Recording started...")
+    async with websockets.connect("ws://localhost:8000/user") as websocket:
+        # Send the start command with mime type
+        await websocket.send(json.dumps({"action": "command", "state": "start", "mimeType": "audio/wav"}))
+        while recording:
+            data = stream.read(chunk)
+            frames.append(data)
+
+        stream.stop_stream()
+        stream.close()
+
+        try:
+            file_path = save_recording(frames)
+            with open(file_path, 'rb') as audio_file:
+                byte_chunk = audio_file.read(ws_chunk_size)
+                while byte_chunk:
+                    await websocket.send(byte_chunk)
+                    byte_chunk = audio_file.read(ws_chunk_size)
+        finally:
+            os.remove(file_path)
+
+        # Send the end command
+        await websocket.send(json.dumps({"action": "command", "state": "end"}))
+
+        # Receive a json message and then close the connection
+        message = await websocket.recv()
+        print("Received message:", json.loads(message))
+
+    print("Recording stopped.")
+
+def save_recording(frames) -> str:
+    # Save the recorded data as a WAV file
+    temp_dir = tempfile.gettempdir()
+
+    # Create a temporary file with the appropriate extension
+    output_path = os.path.join(temp_dir, f"input_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
+    with wave.open(output_path, 'wb') as wf:
+        wf.setnchannels(channels)
+        wf.setsampwidth(p.get_sample_size(sample_format))
+        wf.setframerate(fs)
+        wf.writeframes(b''.join(frames))
+
+    return output_path
+
+def start_recording_sync():
+    # Create a new event loop for the thread
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    # Run the asyncio event loop
+    loop.run_until_complete(start_recording())
+    loop.close()
+
+def stop_recording():
+    global recording
+    recording = False
+    print("Stopped recording")
+
+def toggle_recording():
+    global recording
+    if recording:
+        stop_recording()
+    else:
+        # Start recording in a new thread to avoid blocking
+        print("Starting recording")
+        threading.Thread(target=start_recording_sync).start()
+
+is_space_pressed = False # Flag to track the state of the spacebar
+
+def on_press(key):
+    global is_space_pressed
+    if key == keyboard.Key.space and not is_space_pressed:
+        is_space_pressed = True
+        toggle_recording()
+
+def on_release(key):
+    global is_space_pressed
+    if key == keyboard.Key.space and is_space_pressed:
+        is_space_pressed = False
+        stop_recording()
+    if key == keyboard.Key.esc:
+        # Stop listener
+        return False
+
+# Collect events until released
+with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
+    with tempfile.TemporaryDirectory():
+        print("Press the spacebar to start/stop recording. 
Press ESC to exit.") + listener.join() + +p.terminate() \ No newline at end of file diff --git a/OS/01/user/user.py b/OS/01/user/user.py deleted file mode 100644 index ee3529f..0000000 --- a/OS/01/user/user.py +++ /dev/null @@ -1,13 +0,0 @@ -""" -Handles everything the user interacts through. - -Connects to a websocket at /user. Sends shit to it, and displays/plays the shit it sends back. - -For now, just handles a spacebar being pressed— for the duration it's pressed, -it should record audio. - -SIMPLEST POSSIBLE: Sends that audio to OpenAI whisper, gets the transcript, -sends it to /user in LMC format (role: user, etc) - -MOST FUTUREPROOF: Streams chunks of audio to /user, which will then handle stt in stt.py. -""" \ No newline at end of file From 7a95ce4dd831a7f8599f4bf595057ab252497470 Mon Sep 17 00:00:00 2001 From: Shiven Mian Date: Sat, 3 Feb 2024 19:25:50 -0800 Subject: [PATCH 4/5] chore: add listen docstring --- OS/01/assistant/listen.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/OS/01/assistant/listen.py b/OS/01/assistant/listen.py index 948ef9a..44a089b 100644 --- a/OS/01/assistant/listen.py +++ b/OS/01/assistant/listen.py @@ -1,3 +1,8 @@ +""" +Listens to chunks of audio recorded by user. +Run `python listen.py` to start the server, then `cd user` and run `python record.py` to record audio. +""" + from fastapi import FastAPI, WebSocket import uvicorn import json From 5f1be31562a00f233c9ee2d53efdc4461b5d75d1 Mon Sep 17 00:00:00 2001 From: Shiven Mian Date: Sat, 3 Feb 2024 19:29:03 -0800 Subject: [PATCH 5/5] fix: add back user.py docstring --- OS/01/user/user.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 OS/01/user/user.py diff --git a/OS/01/user/user.py b/OS/01/user/user.py new file mode 100644 index 0000000..ee3529f --- /dev/null +++ b/OS/01/user/user.py @@ -0,0 +1,13 @@ +""" +Handles everything the user interacts through. + +Connects to a websocket at /user. Sends shit to it, and displays/plays the shit it sends back. + +For now, just handles a spacebar being pressed— for the duration it's pressed, +it should record audio. + +SIMPLEST POSSIBLE: Sends that audio to OpenAI whisper, gets the transcript, +sends it to /user in LMC format (role: user, etc) + +MOST FUTUREPROOF: Streams chunks of audio to /user, which will then handle stt in stt.py. +""" \ No newline at end of file
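
Note on exercising the websocket protocol introduced above: record.py and listen.py agree on a simple exchange over ws://localhost:8000/user — a JSON text frame {"action": "command", "state": "start", "mimeType": ...}, then the raw audio as binary frames, then {"action": "command", "state": "end"}, after which the server replies with {"transcript": ...}. The sketch below is a minimal, hypothetical client for driving that protocol with a pre-recorded file instead of a microphone; it is not part of the patch series, and it assumes the listen.py server is running locally and that a sample.wav file exists.

# test_user_ws.py -- hypothetical helper, not included in these patches.
# Streams an existing WAV file to the /user endpoint served by listen.py,
# using the same start/bytes/end sequence that record.py sends.
import asyncio
import json

import websockets  # already listed in OS/01/requirements.txt


async def send_wav(path="sample.wav"):  # sample.wav is an assumed local file
    async with websockets.connect("ws://localhost:8000/user") as ws:
        # Announce a new audio file and its mime type (text frame).
        await ws.send(json.dumps({"action": "command", "state": "start", "mimeType": "audio/wav"}))

        # Stream the file as binary frames, mirroring ws_chunk_size in record.py.
        with open(path, "rb") as f:
            while chunk := f.read(4096):
                await ws.send(chunk)

        # Mark the end of the file so the server runs stt() on what it buffered.
        await ws.send(json.dumps({"action": "command", "state": "end"}))

        # listen.py answers with a JSON object like {"transcript": "..."}.
        print(json.loads(await ws.recv()))


if __name__ == "__main__":
    asyncio.run(send_wav())

Under those assumptions, start listen.py first, then run the script from the same machine to get a Whisper transcript of the file back over the socket.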