Merge pull request #6 from shivenmian/u/shivenmian/user

feat: add user.py with record + whisper stt
2 years ago · 6e130d42ea
parent 50a8420cb7 5f1be31562
commit 6e130d42ea
5 changed files with 416 additions and 3 deletions
--- a/OS/01/.gitignore
+++ b/OS/01/.gitignore
@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
--- a/OS/01/assistant/listen.py
+++ b/OS/01/assistant/listen.py
@ -0,0 +1,57 @@
+"""
+Listens to chunks of audio recorded by user.
+Run `python listen.py` to start the server, then `cd user` and run `python record.py` to record audio.
+"""
+
+from fastapi import FastAPI, WebSocket
+import uvicorn
+import json
+from stt import stt
+import tempfile
+
+app = FastAPI()
+
+@app.websocket("/user")
+async def user(ws: WebSocket):
+    """Receive audio recordings over a websocket and reply with transcripts.
+
+    Protocol, as implemented below:
+      * text frame {"action": "command", "state": "start", "mimeType": ...}
+        begins a new recording and remembers its MIME type;
+      * binary frames are appended to the current recording;
+      * text frame {"action": "command", "state": "end"} runs stt() on the
+        buffered bytes and sends back {"transcript": ...}.
+
+    Any exception ends the session; it is printed rather than re-raised.
+    """
+    # Imported locally so this handler does not rely on the module's import block.
+    import asyncio
+
+    await ws.accept()
+    audio_file = bytearray()
+    mime_type = None
+    client_disconnected = False
+
+    try:
+        while True:
+            message = await ws.receive()
+
+            if message['type'] == 'websocket.disconnect':
+                client_disconnected = True
+                break
+
+            if message['type'] != 'websocket.receive':
+                continue
+
+            # ASGI allows both keys to be present with the unused one set to
+            # None, so look at the values instead of key membership.
+            text = message.get('text')
+            data = message.get('bytes')
+
+            if text is not None:
+                control_message = json.loads(text)
+                is_command = control_message.get('action') == 'command'
+                state = control_message.get('state')
+
+                if is_command and state == 'start' and 'mimeType' in control_message:
+                    # This indicates the start of a new audio file: drop any
+                    # bytes left over from an unfinished recording.
+                    mime_type = control_message.get('mimeType')
+                    audio_file = bytearray()
+                elif is_command and state == 'end':
+                    # This indicates the end of the audio file.
+                    # stt() is synchronous (ffmpeg + network call), so run it
+                    # in a worker thread to keep the event loop responsive.
+                    loop = asyncio.get_running_loop()
+                    transcription = await loop.run_in_executor(None, stt, audio_file, mime_type)
+                    await ws.send_json({"transcript": transcription})
+
+                    print("SENT TRANSCRIPTION!")
+
+                    # Reset the state for the next audio file
+                    audio_file = bytearray()
+                    mime_type = None
+            elif data is not None:
+                # If it's not a control message, it's part of the audio file
+                audio_file.extend(data)
+
+    except Exception as e:
+        print(f"WebSocket connection closed with exception: {e}")
+    finally:
+        # Only send a close frame when the peer has not already gone away.
+        if not client_disconnected:
+            try:
+                await ws.close()
+            except RuntimeError:
+                # NOTE(review): Starlette/uvicorn raise RuntimeError when a
+                # close is sent on a connection that is already closed; there
+                # is nothing left to clean up in that case.
+                pass
+        print("WebSocket connection closed")
+
+
+if __name__ == "__main__":
+    # Start the websocket server on port 8000, bound to 0.0.0.0.
+    # The previous TemporaryDirectory() wrapper was dropped: its directory was
+    # never referenced, so it only created and deleted an empty folder.
+    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/OS/01/assistant/stt.py
+++ b/OS/01/assistant/stt.py
@ -2,5 +2,55 @@
 Defines a function which takes a path to an audio file and turns it into text.
 """

-def stt(path_to_audio):
-    return text
+from datetime import datetime
+import os
+import contextlib
+import tempfile
+import ffmpeg
+import subprocess
+
+from openai import OpenAI
+# Shared API client used by stt(); created once when this module is imported.
+# NOTE(review): no arguments are passed, so credentials presumably come from
+# the environment (e.g. OPENAI_API_KEY) — confirm against the openai docs.
+client = OpenAI()
+
+def convert_mime_type_to_format(mime_type: str) -> str:
+    """Map an audio MIME type to the short format name used as a file extension.
+
+    Matching ignores case, surrounding whitespace and any parameters after
+    ";" (for example "audio/webm;codecs=opus" maps to "webm").
+
+    Args:
+        mime_type: MIME type string announced by the client.
+
+    Returns:
+        "wav" for audio/wav and audio/x-wav, "webm" for audio/webm; any other
+        value (including non-string values such as None) is returned unchanged.
+    """
+    if not isinstance(mime_type, str):
+        # Keep the pass-through behaviour for missing values.
+        return mime_type
+
+    # Strip parameters and normalise before comparing.
+    essence = mime_type.split(";", 1)[0].strip().lower()
+    if essence in ("audio/x-wav", "audio/wav"):
+        return "wav"
+    if essence == "audio/webm":
+        return "webm"
+
+    return mime_type
+
+@contextlib.contextmanager
+def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str):
+    """Context manager that converts raw audio bytes to a temporary WAV file.
+
+    The bytes are written to a scratch file in the system temp directory
+    (extension derived from `mime_type`), transcoded with ffmpeg using
+    acodec=pcm_s16le, ac=1, ar=16k, and the path of the resulting WAV file
+    is yielded. Both scratch files are removed when the context exits, also
+    when writing or conversion fails.
+
+    Args:
+        audio: Encoded audio data as received from the client.
+        mime_type: MIME type of `audio`; passed to convert_mime_type_to_format.
+
+    Yields:
+        Path of the converted WAV file; only valid inside the `with` block.
+
+    Raises:
+        Propagates any exception raised while writing the input file or
+        running ffmpeg.
+    """
+    temp_dir = tempfile.gettempdir()
+
+    # Build both paths up front so cleanup can always refer to them.
+    input_ext = convert_mime_type_to_format(mime_type)
+    input_path = os.path.join(temp_dir, f"input_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.{input_ext}")
+    output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
+
+    try:
+        with open(input_path, 'wb') as f:
+            f.write(audio)
+
+        # Export to wav
+        ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
+
+        print(f"Temporary file path: {output_path}")
+
+        yield output_path
+    finally:
+        # Either file may be missing if an earlier step failed; that must not
+        # mask the original error.
+        for path in (input_path, output_path):
+            with contextlib.suppress(FileNotFoundError):
+                os.remove(path)
+
+
+def stt(audio_bytes: bytearray, mime_type):
+    """Transcribe an in-memory audio recording to text.
+
+    The audio is converted to WAV via export_audio_to_wav_ffmpeg and sent to
+    the "whisper-1" transcription model with response_format="text".
+
+    Args:
+        audio_bytes: Encoded audio data.
+        mime_type: MIME type of `audio_bytes`.
+
+    Returns:
+        The transcript returned by the transcription API.
+    """
+    with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path:
+        # Close the handle before the surrounding context deletes the file.
+        with open(wav_file_path, "rb") as audio_file:
+            transcript = client.audio.transcriptions.create(
+                model="whisper-1",
+                file=audio_file,
+                response_format="text"
+            )
+
+        print("Exciting transcription result:", transcript)
+        return transcript
--- a/OS/01/requirements.txt
+++ b/OS/01/requirements.txt
@ -1,5 +1,10 @@
 git+https://github.com/KillianLucas/open-interpreter.git
+asyncio
+pyaudio
+pynput
 redis
 fastapi
 uvicorn
-RPi.GPIO
+websockets
+python-dotenv
+ffmpeg-python
--- a/OS/01/user/record.py
+++ b/OS/01/user/record.py
@ -0,0 +1,141 @@
+"""
+Handles everything the user interacts through.
+
+Connects to a websocket at /user. Sends audio to it, and displays/plays whatever it sends back.
+
+For now, just handles a spacebar being pressed— for the duration it's pressed,
+it should record audio.
+
+SIMPLEST POSSIBLE: Sends that audio to OpenAI whisper, gets the transcript,
+sends it to /user in LMC format (role: user, etc)
+
+MOST FUTUREPROOF: Streams chunks of audio to /user, which will then handle stt in stt.py.
+"""
+
+import os
+import pyaudio
+import threading
+import asyncio
+import websockets
+import json
+from pynput import keyboard
+import wave
+import tempfile
+from datetime import datetime
+
+# Configuration
+chunk = 1024  # Record in chunks of 1024 samples
+sample_format = pyaudio.paInt16  # 16 bits per sample
+channels = 1  # Mono (a single input channel)
+fs = 48000 # Sample rate passed to the input stream and the WAV header
+
+p = pyaudio.PyAudio()  # Create an interface to PortAudio
+frames = []  # Captured audio chunks; cleared at the start of each recording
+recording = False  # Flag to control recording state (set/cleared by the key handlers)
+
+ws_chunk_size = 4096 # Number of bytes read from the WAV file per websocket message
+
+async def start_recording():
+    """Record audio while the `recording` flag is set, then upload it for transcription.
+
+    Steps: open a PyAudio input stream, connect to ws://localhost:8000/user,
+    send the "start" command, capture chunks until another thread clears the
+    module-level `recording` flag, write the frames to a temporary WAV file,
+    send that file over the websocket in `ws_chunk_size` pieces, send the
+    "end" command and print the JSON reply.
+
+    Returns immediately when a recording is already in progress. On any
+    failure the input stream is released and `recording` is reset so the next
+    key press can start a fresh recording.
+    """
+    global recording
+
+    if recording:
+        return  # Avoid multiple starts
+    recording = True
+    frames.clear()  # Clear existing frames
+
+    stream = None
+    try:
+        stream = p.open(format=sample_format,
+                        channels=channels,
+                        rate=fs,
+                        frames_per_buffer=chunk,
+                        input=True)
+
+        print("Recording started...")
+        async with websockets.connect("ws://localhost:8000/user") as websocket:
+            # Send the start command with mime type
+            await websocket.send(json.dumps({"action": "command", "state": "start", "mimeType": "audio/wav"}))
+
+            # stream.read() blocks; running it in a worker thread keeps this
+            # event loop free to service the websocket during long recordings.
+            loop = asyncio.get_running_loop()
+            while recording:
+                data = await loop.run_in_executor(None, stream.read, chunk)
+                frames.append(data)
+
+            stream.stop_stream()
+            stream.close()
+            stream = None  # Already released; skip the cleanup below
+
+            # Create the file before entering try/finally so the cleanup never
+            # refers to an unbound name if saving fails.
+            file_path = save_recording(frames)
+            try:
+                with open(file_path, 'rb') as audio_file:
+                    for byte_chunk in iter(lambda: audio_file.read(ws_chunk_size), b''):
+                        await websocket.send(byte_chunk)
+            finally:
+                os.remove(file_path)
+
+            # Send the end command
+            await websocket.send(json.dumps({"action": "command", "state": "end"}))
+
+            # Receive a json message and then close the connection
+            message = await websocket.recv()
+            print("Received message:", json.loads(message))
+    finally:
+        # Whatever happened, leave the module ready for the next key press.
+        recording = False
+        if stream is not None:
+            stream.stop_stream()
+            stream.close()
+
+    print("Recording stopped.")
+
+def save_recording(frames) -> str:
+    """Write the captured audio chunks to a timestamped WAV file and return its path.
+
+    The file is created in the system temp directory and uses the module's
+    `channels`, `sample_format` and `fs` settings for its header.
+    """
+    timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')
+    wav_path = os.path.join(tempfile.gettempdir(), f"input_{timestamp}.wav")
+
+    pcm_data = b''.join(frames)
+    sample_width = p.get_sample_size(sample_format)
+
+    with wave.open(wav_path, 'wb') as wav_writer:
+        wav_writer.setnchannels(channels)
+        wav_writer.setsampwidth(sample_width)
+        wav_writer.setframerate(fs)
+        wav_writer.writeframes(pcm_data)
+
+    return wav_path
+
+def start_recording_sync():
+    """Run start_recording() to completion on a fresh event loop.
+
+    Intended as a thread target: asyncio.run creates a new event loop for the
+    calling thread and closes it again even when the coroutine raises.
+    """
+    asyncio.run(start_recording())
+
+def stop_recording():
+    """Clear the module-level `recording` flag so the capture loop ends."""
+    global recording
+    recording = False
+    print("Stopped recording")
+
+def toggle_recording():
+    """Start a recording if none is running, otherwise stop the current one."""
+    global recording
+    if not recording:
+        # Record on a separate thread so the keyboard listener is not blocked
+        print("Starting recording")
+        worker = threading.Thread(target=start_recording_sync)
+        worker.start()
+        return
+    stop_recording()
+
+is_space_pressed = False  # Flag to track the state of the spacebar
+
+def on_press(key):
+    """Key-down handler: toggle recording on the first spacebar press event.
+
+    Further press events are ignored while `is_space_pressed` is set.
+    """
+    global is_space_pressed
+    if is_space_pressed or key != keyboard.Key.space:
+        return
+    is_space_pressed = True
+    toggle_recording()
+
+def on_release(key):
+    """Key-up handler: stop recording when space is released; ESC ends the listener."""
+    global is_space_pressed
+    if is_space_pressed and key == keyboard.Key.space:
+        is_space_pressed = False
+        stop_recording()
+    # Returning False tells the listener to stop; None keeps it running
+    return False if key == keyboard.Key.esc else None
+
+# Collect events until released.
+# The former TemporaryDirectory() wrapper was dropped: its directory was never
+# used. PortAudio is now released even if join() is interrupted.
+try:
+    with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
+        print("Press the spacebar to start/stop recording. Press ESC to exit.")
+        listener.join()
+finally:
+    p.terminate()