diff --git a/.gitignore b/.gitignore index 6769e21..7834d47 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +ggml-*.bin + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -7,7 +9,7 @@ __pycache__/ *.so # Distribution / packaging -.Python +.Pythongit build/ develop-eggs/ dist/ diff --git a/OS/01/conversations/user.json b/OS/01/conversations/user.json index f0d4e3e..53d74b6 100644 --- a/OS/01/conversations/user.json +++ b/OS/01/conversations/user.json @@ -1 +1 @@ -[{"role": "user", "type": "message", "content": "This is a microphone. I also have an extra microphone.\n"}] \ No newline at end of file +[{"role": "user", "type": "message", "content": " Hello, how you doing?\n"}] \ No newline at end of file diff --git a/OS/_archive/core/stt/whisper-rust/.gitignore b/OS/01/local_stt/whisper-rust/.gitignore similarity index 100% rename from OS/_archive/core/stt/whisper-rust/.gitignore rename to OS/01/local_stt/whisper-rust/.gitignore diff --git a/OS/_archive/core/stt/whisper-rust/Cargo.lock b/OS/01/local_stt/whisper-rust/Cargo.lock similarity index 100% rename from OS/_archive/core/stt/whisper-rust/Cargo.lock rename to OS/01/local_stt/whisper-rust/Cargo.lock diff --git a/OS/_archive/core/stt/whisper-rust/Cargo.toml b/OS/01/local_stt/whisper-rust/Cargo.toml similarity index 100% rename from OS/_archive/core/stt/whisper-rust/Cargo.toml rename to OS/01/local_stt/whisper-rust/Cargo.toml diff --git a/OS/01/local_stt/whisper-rust/README.md b/OS/01/local_stt/whisper-rust/README.md new file mode 100644 index 0000000..a4c8c02 --- /dev/null +++ b/OS/01/local_stt/whisper-rust/README.md @@ -0,0 +1,7 @@ +# Setup + +To rebuild the `whisper-rust` executable, do the following: + +1. Install [Rust](https://www.rust-lang.org/tools/install), cmake, and Python dependencies `pip install -r requirements.txt`. +2. Go to this directory (`OS/01/local_stt/whisper-rust`) and run `cargo build --release`. +3. Move the `whisper-rust` executable from target/release to this directory. 
diff --git a/OS/_archive/core/stt/whisper-rust/src/main.rs b/OS/01/local_stt/whisper-rust/src/main.rs similarity index 100% rename from OS/_archive/core/stt/whisper-rust/src/main.rs rename to OS/01/local_stt/whisper-rust/src/main.rs diff --git a/OS/_archive/core/stt/whisper-rust/src/transcribe.rs b/OS/01/local_stt/whisper-rust/src/transcribe.rs similarity index 100% rename from OS/_archive/core/stt/whisper-rust/src/transcribe.rs rename to OS/01/local_stt/whisper-rust/src/transcribe.rs diff --git a/OS/01/local_stt/whisper-rust/whisper-rust b/OS/01/local_stt/whisper-rust/whisper-rust new file mode 100755 index 0000000..14a9042 Binary files /dev/null and b/OS/01/local_stt/whisper-rust/whisper-rust differ diff --git a/OS/01/start.sh b/OS/01/start.sh index e4a5df7..694c560 100755 --- a/OS/01/start.sh +++ b/OS/01/start.sh @@ -1,7 +1,9 @@ ### SETTINGS # If ALL_LOCAL is False, we'll use OpenAI's services +# If setting ALL_LOCAL to true, set the path to the WHISPER local model export ALL_LOCAL=False +# export WHISPER_MODEL_PATH=... # export OPENAI_API_KEY=sk-... # If SERVER_START, this is where we'll serve the server. 
diff --git a/OS/01/stt.py b/OS/01/stt.py index e4b282b..3d1ae2b 100644 --- a/OS/01/stt.py +++ b/OS/01/stt.py @@ -44,6 +44,28 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str: os.remove(input_path) os.remove(output_path) +def run_command(command): + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + return result.stdout, result.stderr + +def get_transcription_file(wav_file_path: str): + model_path = os.getenv("WHISPER_MODEL_PATH") + if not model_path: + raise EnvironmentError("WHISPER_MODEL_PATH environment variable is not set.") + + output, error = run_command([ + os.path.join(os.path.dirname(__file__), 'local_stt', 'whisper-rust', 'whisper-rust'), + '--model-path', model_path, + '--file-path', wav_file_path + ]) + + print("Exciting transcription result:", output) + return output + +def get_transcription_bytes(audio_bytes: bytearray, mime_type): + with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path: + return get_transcription_file(wav_file_path) + def stt_bytes(audio_bytes: bytearray, mime_type="audio/wav"): with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path: return stt_wav(wav_file_path) @@ -65,8 +87,12 @@ def stt_wav(wav_file_path: str): print("Transcription result:", transcript) return transcript else: - # Local whisper here, given `wav_file_path` - pass + temp_dir = tempfile.gettempdir() + output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav") + ffmpeg.input(wav_file_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run() + transcript = get_transcription_file(output_path) + print("Transcription result:", transcript) + return transcript def stt(input_data, mime_type="audio/wav"): if isinstance(input_data, str): diff --git a/OS/_archive/core/stt/whisper-rust/.env.example b/OS/_archive/core/stt/whisper-rust/.env.example deleted file mode 100644 index e85c4c7..0000000 --- 
a/OS/_archive/core/stt/whisper-rust/.env.example +++ /dev/null @@ -1 +0,0 @@ -WHISPER_MODEL_PATH=/path/to/ggml-tiny.en.bin \ No newline at end of file diff --git a/OS/_archive/core/stt/whisper-rust/README.md b/OS/_archive/core/stt/whisper-rust/README.md deleted file mode 100644 index e4c881b..0000000 --- a/OS/_archive/core/stt/whisper-rust/README.md +++ /dev/null @@ -1,9 +0,0 @@ - -# Setup - -1. Install [Rust](https://www.rust-lang.org/tools/install) and Python dependencies `pip install -r requirements.txt`. -2. Go to **core/stt** and run `cargo build --release`. -3. Download GGML Whisper model from [Huggingface](https://huggingface.co/ggerganov/whisper.cpp). -4. In core, copy `.env.example` to `.env` and put the path to model. -5. Run `python core/i_endpoint.py` to start the server. -6. Run `python core/test_cli.py PATH_TO_FILE` to test sending audio to service and getting transcription back over websocket. \ No newline at end of file diff --git a/OS/_archive/core/stt/whisper-rust/__init__.py b/OS/_archive/core/stt/whisper-rust/__init__.py deleted file mode 100644 index a7cf036..0000000 --- a/OS/_archive/core/stt/whisper-rust/__init__.py +++ /dev/null @@ -1,55 +0,0 @@ -from datetime import datetime -import os -import contextlib -import tempfile -import ffmpeg -import subprocess - -def convert_mime_type_to_format(mime_type: str) -> str: - if mime_type == "audio/x-wav" or mime_type == "audio/wav": - return "wav" - if mime_type == "audio/webm": - return "webm" - - return mime_type - -@contextlib.contextmanager -def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str: - temp_dir = tempfile.gettempdir() - - # Create a temporary file with the appropriate extension - input_ext = convert_mime_type_to_format(mime_type) - input_path = os.path.join(temp_dir, f"input_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.{input_ext}") - with open(input_path, 'wb') as f: - f.write(audio) - - # Export to wav - output_path = os.path.join(temp_dir, 
f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav") - ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run() - - print(f"Temporary file path: {output_path}") - - try: - yield output_path - finally: - os.remove(input_path) - os.remove(output_path) - -def run_command(command): - result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - return result.stdout, result.stderr - -def get_transcription(audio_bytes: bytearray, mime_type): - with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path: - model_path = os.getenv("WHISPER_MODEL_PATH") - if not model_path: - raise EnvironmentError("WHISPER_MODEL_PATH environment variable is not set.") - - output, error = run_command([ - os.path.join(os.path.dirname(__file__), 'whisper-rust', 'target', 'release', 'whisper-rust'), - '--model-path', model_path, - '--file-path', wav_file_path - ]) - - print("Exciting transcription result:", output) - return output \ No newline at end of file diff --git a/OS/_archive/core/stt/whisper-rust/test_cli.py b/OS/_archive/core/stt/whisper-rust/test_cli.py deleted file mode 100644 index 0569aff..0000000 --- a/OS/_archive/core/stt/whisper-rust/test_cli.py +++ /dev/null @@ -1,40 +0,0 @@ -import argparse -import asyncio -import websockets -import os -import json - -# Define the function to send audio file in chunks -async def send_audio_in_chunks(file_path, chunk_size=4096): - async with websockets.connect("ws://localhost:8000/a") as websocket: - # Send the start command with mime type - await websocket.send(json.dumps({"action": "command", "state": "start", "mimeType": "audio/webm"})) - - # Open the file in binary mode and send in chunks - with open(file_path, 'rb') as audio_file: - chunk = audio_file.read(chunk_size) - while chunk: - await websocket.send(chunk) - chunk = audio_file.read(chunk_size) - - # Send the end command - await websocket.send(json.dumps({"action": "command", "state": "end"})) - - # 
Receive a json message and then close the connection - message = await websocket.recv() - print("Received message:", json.loads(message)) - await websocket.close() - -# Parse command line arguments -parser = argparse.ArgumentParser(description="Send a webm audio file to the /a websocket endpoint and print the responses.") -parser.add_argument("file_path", help="The path to the webm audio file to send.") -args = parser.parse_args() - -# Check if the file exists -if not os.path.isfile(args.file_path): - print(args.file_path) - print("Error: The file does not exist.") - exit(1) - -# Run the asyncio event loop -asyncio.get_event_loop().run_until_complete(send_audio_in_chunks(args.file_path)) diff --git a/README.md b/README.md index 70e2393..72d8056 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,10 @@ Official repository for [The 01 Project](https://twitter.com/hellokillian/status python -m pip install -r requirements.txt ``` +3. **(optional) Download local audio models** + +If you want to run local speech-to-text from whisper, download the GGML Whisper model from [Huggingface](https://huggingface.co/ggerganov/whisper.cpp). Then in `OS/01/start.sh`, set `ALL_LOCAL=TRUE` and set `WHISPER_MODEL_PATH` to the path of the model. + ## Usage 1. **Navigate to the project directory.**