parent a3de4c1286
commit f749cb878e
@@ -0,0 +1,52 @@
from fastapi import FastAPI, WebSocket
import uvicorn
import json
from stt import stt
import tempfile

app = FastAPI()

@app.websocket("/user")
async def user(ws: WebSocket):
    await ws.accept()
    audio_file = bytearray()
    mime_type = None

    try:
        while True:
            message = await ws.receive()

            if message['type'] == 'websocket.disconnect':
                break

            if message['type'] == 'websocket.receive':
                if 'text' in message:
                    control_message = json.loads(message['text'])
                    if control_message.get('action') == 'command' and control_message.get('state') == 'start' and 'mimeType' in control_message:
                        # This indicates the start of a new audio file
                        mime_type = control_message.get('mimeType')
                    elif control_message.get('action') == 'command' and control_message.get('state') == 'end':
                        # This indicates the end of the audio file
                        # Process the complete audio file here
                        transcription = stt(audio_file, mime_type)
                        await ws.send_json({"transcript": transcription})

                        print("SENT TRANSCRIPTION!")

                        # Reset the bytearray for the next audio file
                        audio_file = bytearray()
                        mime_type = None
                elif 'bytes' in message:
                    # If it's not a control message, it's part of the audio file
                    audio_file.extend(message['bytes'])

    except Exception as e:
        print(f"WebSocket connection closed with exception: {e}")
    finally:
        try:
            await ws.close()
        except RuntimeError:
            # The socket may already be closed if the client disconnected first
            pass
        print("WebSocket connection closed")


if __name__ == "__main__":
    with tempfile.TemporaryDirectory():
        uvicorn.run(app, host="0.0.0.0", port=8000)
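The handler above implies a simple wire protocol: JSON text frames mark the start (with a mimeType) and end of an utterance, and binary frames in between carry the raw audio. A minimal client session against this endpoint might look like the sketch below; the payloads mirror the recorder later in this commit, and "sample.wav" is a placeholder path, not a file in the repo.

# Minimal sketch of the /user protocol; mirrors the recorder later in this commit.
import asyncio, json, websockets

async def demo(audio_bytes: bytes):
    async with websockets.connect("ws://localhost:8000/user") as ws:
        await ws.send(json.dumps({"action": "command", "state": "start", "mimeType": "audio/wav"}))
        await ws.send(audio_bytes)  # binary frame: the audio payload
        await ws.send(json.dumps({"action": "command", "state": "end"}))
        print(await ws.recv())      # -> {"transcript": "..."}

# asyncio.run(demo(open("sample.wav", "rb").read()))  # "sample.wav" is hypothetical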
@@ -0,0 +1,52 @@
from datetime import datetime
import os
import contextlib
import tempfile
import ffmpeg
import subprocess

from openai import OpenAI
client = OpenAI()

def convert_mime_type_to_format(mime_type: str) -> str:
    if mime_type == "audio/x-wav" or mime_type == "audio/wav":
        return "wav"
    if mime_type == "audio/webm":
        return "webm"

    return mime_type

@contextlib.contextmanager
def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
    temp_dir = tempfile.gettempdir()

    # Create a temporary file with the appropriate extension
    input_ext = convert_mime_type_to_format(mime_type)
    input_path = os.path.join(temp_dir, f"input_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.{input_ext}")
    with open(input_path, 'wb') as f:
        f.write(audio)

    # Export to wav (16 kHz mono PCM)
    output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
    ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()

    print(f"Temporary file path: {output_path}")

    try:
        yield output_path
    finally:
        os.remove(input_path)
        os.remove(output_path)


def stt(audio_bytes: bytearray, mime_type):
    with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path:
        # Open with a context manager so the handle is closed before cleanup
        with open(wav_file_path, "rb") as audio_file:
            transcript = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="text"
            )

    print("Exciting transcription result:", transcript)
    return transcript
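For a quick standalone check of stt without the websocket server, one could feed it bytes read from any audio file; the path below is a placeholder, not something present in this commit.

# Hypothetical standalone check; "sample.webm" is a placeholder path.
if __name__ == "__main__":
    with open("sample.webm", "rb") as f:
        print(stt(bytearray(f.read()), "audio/webm"))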
@@ -0,0 +1,141 @@
"""
Handles everything the user interacts through.

Connects to a websocket at /user. Sends shit to it, and displays/plays the shit it sends back.

For now, just handles a spacebar being pressed; for the duration it's pressed,
it should record audio.

SIMPLEST POSSIBLE: Sends that audio to OpenAI whisper, gets the transcript,
sends it to /user in LMC format (role: user, etc)

MOST FUTUREPROOF: Streams chunks of audio to /user, which will then handle stt in stt.py.
"""

import os
import pyaudio
import threading
import asyncio
import websockets
import json
from pynput import keyboard
import wave
import tempfile
from datetime import datetime

# Configuration
chunk = 1024  # Record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 1  # Mono
fs = 48000  # Sample rate

p = pyaudio.PyAudio()  # Create an interface to PortAudio
frames = []  # Initialize array to store frames
recording = False  # Flag to control recording state

ws_chunk_size = 4096  # Websocket stream chunk size

async def start_recording():
    global recording

    if recording:
        return  # Avoid multiple starts
    recording = True
    frames.clear()  # Clear existing frames

    stream = p.open(format=sample_format,
                    channels=channels,
                    rate=fs,
                    frames_per_buffer=chunk,
                    input=True)

    print("Recording started...")
    async with websockets.connect("ws://localhost:8000/user") as websocket:
        # Send the start command with mime type
        await websocket.send(json.dumps({"action": "command", "state": "start", "mimeType": "audio/wav"}))
        while recording:
            data = stream.read(chunk)
            frames.append(data)

        stream.stop_stream()
        stream.close()

        try:
            file_path = save_recording(frames)
            with open(file_path, 'rb') as audio_file:
                byte_chunk = audio_file.read(ws_chunk_size)
                while byte_chunk:
                    await websocket.send(byte_chunk)
                    byte_chunk = audio_file.read(ws_chunk_size)
        finally:
            os.remove(file_path)

        # Send the end command
        await websocket.send(json.dumps({"action": "command", "state": "end"}))

        # Receive a json message and then close the connection
        message = await websocket.recv()
        print("Received message:", json.loads(message))

    print("Recording stopped.")

def save_recording(frames) -> str:
    # Save the recorded data as a WAV file
    temp_dir = tempfile.gettempdir()

    # Create a temporary file with the appropriate extension
    output_path = os.path.join(temp_dir, f"input_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
    with wave.open(output_path, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(p.get_sample_size(sample_format))
        wf.setframerate(fs)
        wf.writeframes(b''.join(frames))

    return output_path

def start_recording_sync():
    # Create a new event loop for the thread
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    # Run the asyncio event loop
    loop.run_until_complete(start_recording())
    loop.close()

def stop_recording():
    global recording
    recording = False
    print("Stopped recording")

def toggle_recording():
    global recording
    if recording:
        stop_recording()
    else:
        # Start recording in a new thread to avoid blocking
        print("Starting recording")
        threading.Thread(target=start_recording_sync).start()

is_space_pressed = False  # Flag to track the state of the spacebar

def on_press(key):
    global is_space_pressed
    if key == keyboard.Key.space and not is_space_pressed:
        is_space_pressed = True
        toggle_recording()

def on_release(key):
    global is_space_pressed
    if key == keyboard.Key.space and is_space_pressed:
        is_space_pressed = False
        stop_recording()
    if key == keyboard.Key.esc:
        # Stop listener
        return False

# Collect events until released
with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
    with tempfile.TemporaryDirectory():
        print("Press the spacebar to start/stop recording. Press ESC to exit.")
        listener.join()

p.terminate()
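Because recording is a plain module-level flag, the capture can also be driven programmatically rather than from the keyboard listener; a hypothetical sketch (assumes the server above is already running):

# Hypothetical: drive a ~2 second recording without the keyboard listener.
import time
toggle_recording()   # spawns the recording thread and opens the websocket
time.sleep(2)        # capture roughly two seconds of audio
stop_recording()     # clears the flag; the thread uploads the WAV and exits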
@@ -1,13 +0,0 @@
"""
Handles everything the user interacts through.

Connects to a websocket at /user. Sends shit to it, and displays/plays the shit it sends back.

For now, just handles a spacebar being pressed; for the duration it's pressed,
it should record audio.

SIMPLEST POSSIBLE: Sends that audio to OpenAI whisper, gets the transcript,
sends it to /user in LMC format (role: user, etc)

MOST FUTUREPROOF: Streams chunks of audio to /user, which will then handle stt in stt.py.
"""