chore: integrated local whisper + restructuring

pull/11/head
Shiven Mian 11 months ago
parent 12df8bbfac
commit 23123dc549

4
.gitignore vendored

@ -1,3 +1,5 @@
ggml-*.bin
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
@ -7,7 +9,7 @@ __pycache__/
*.so *.so
# Distribution / packaging # Distribution / packaging
.Python .Python
build/ build/
develop-eggs/ develop-eggs/
dist/ dist/

@ -1 +1 @@
[{"role": "user", "type": "message", "content": "This is a microphone. I also have an extra microphone.\n"}] [{"role": "user", "type": "message", "content": " Hello, how you doing?\n"}]

@ -0,0 +1,7 @@
# Setup
To rebuild the `whisper-rust` executable, do the following:
1. Install [Rust](https://www.rust-lang.org/tools/install) and cmake, then install the Python dependencies with `pip install -r requirements.txt`.
2. Go to **core/stt** and run `cargo build --release`.
3. Move the `whisper-rust` executable from target/release to this directory.

@ -1,7 +1,9 @@
### SETTINGS ### SETTINGS
# If ALL_LOCAL is False, we'll use OpenAI's services # If ALL_LOCAL is False, we'll use OpenAI's services
# If you set ALL_LOCAL to True, also set WHISPER_MODEL_PATH to the path of the local Whisper model
export ALL_LOCAL=False export ALL_LOCAL=False
# export WHISPER_MODEL_PATH=...
# export OPENAI_API_KEY=sk-... # export OPENAI_API_KEY=sk-...
# If SERVER_START, this is where we'll serve the server. # If SERVER_START, this is where we'll serve the server.

@ -44,6 +44,28 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
os.remove(input_path) os.remove(input_path)
os.remove(output_path) os.remove(output_path)
def run_command(command):
    """Run *command* as a subprocess and return its captured text output.

    Args:
        command: The command handed to ``subprocess.run`` unchanged
            (callers in this file pass an argument list).

    Returns:
        A ``(stdout, stderr)`` tuple of decoded strings. The exit status is
        not checked here.
    """
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout, completed.stderr
def get_transcription_file(wav_file_path: str):
    """Transcribe a WAV file with the bundled ``whisper-rust`` executable.

    Args:
        wav_file_path: Path of the WAV file passed to the executable via
            ``--file-path``.

    Returns:
        The text the executable wrote to stdout.

    Raises:
        EnvironmentError: If ``WHISPER_MODEL_PATH`` is unset or empty.
    """
    whisper_model = os.getenv("WHISPER_MODEL_PATH")
    if not whisper_model:
        raise EnvironmentError("WHISPER_MODEL_PATH environment variable is not set.")

    # The executable is looked up relative to this module, under
    # local_stt/whisper-rust/.
    module_dir = os.path.dirname(__file__)
    whisper_binary = os.path.join(module_dir, 'local_stt', 'whisper-rust', 'whisper-rust')
    command = [whisper_binary, '--model-path', whisper_model, '--file-path', wav_file_path]

    # stderr from the subprocess is captured but not inspected.
    transcript, _ = run_command(command)
    print("Exciting transcription result:", transcript)
    return transcript
def get_transcription_bytes(audio_bytes: bytearray, mime_type):
    """Transcribe in-memory audio by converting it to a temporary WAV file.

    Args:
        audio_bytes: Raw audio data.
        mime_type: MIME type describing ``audio_bytes``; forwarded to
            ``export_audio_to_wav_ffmpeg``.

    Returns:
        Whatever ``get_transcription_file`` returns for the converted file.
    """
    # The transcription must happen while the context manager is still open,
    # because the path it yields is only handed out for the duration of the block.
    with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as converted_wav:
        transcript = get_transcription_file(converted_wav)
    return transcript
def stt_bytes(audio_bytes: bytearray, mime_type="audio/wav"): def stt_bytes(audio_bytes: bytearray, mime_type="audio/wav"):
with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path: with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path:
return stt_wav(wav_file_path) return stt_wav(wav_file_path)
@ -65,8 +87,12 @@ def stt_wav(wav_file_path: str):
print("Transcription result:", transcript) print("Transcription result:", transcript)
return transcript return transcript
else: else:
# Local whisper here, given `wav_file_path` temp_dir = tempfile.gettempdir()
pass output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
ffmpeg.input(wav_file_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
transcript = get_transcription_file(output_path)
print("Transcription result:", transcript)
return transcript
def stt(input_data, mime_type="audio/wav"): def stt(input_data, mime_type="audio/wav"):
if isinstance(input_data, str): if isinstance(input_data, str):

@ -1 +0,0 @@
WHISPER_MODEL_PATH=/path/to/ggml-tiny.en.bin

@ -1,9 +0,0 @@
# Setup
1. Install [Rust](https://www.rust-lang.org/tools/install) and Python dependencies `pip install -r requirements.txt`.
2. Go to **core/stt** and run `cargo build --release`.
3. Download GGML Whisper model from [Huggingface](https://huggingface.co/ggerganov/whisper.cpp).
4. In `core`, copy `.env.example` to `.env` and set `WHISPER_MODEL_PATH` to the path of the downloaded model.
5. Run `python core/i_endpoint.py` to start the server.
6. Run `python core/test_cli.py PATH_TO_FILE` to test sending audio to service and getting transcription back over websocket.

@ -1,55 +0,0 @@
from datetime import datetime
import os
import contextlib
import tempfile
import ffmpeg
import subprocess
def convert_mime_type_to_format(mime_type: str) -> str:
    """Map an audio MIME type to the short format name used as a file extension.

    Matching ignores letter case, surrounding whitespace and any MIME
    parameters, so ``"audio/webm;codecs=opus"`` resolves to ``"webm"`` instead
    of falling through and being returned whole.

    Args:
        mime_type: A MIME type string such as ``"audio/wav"``.

    Returns:
        ``"wav"`` for ``audio/wav`` and ``audio/x-wav``, ``"webm"`` for
        ``audio/webm``; any unrecognized value is returned unchanged.
    """
    known_formats = {
        "audio/x-wav": "wav",
        "audio/wav": "wav",
        "audio/webm": "webm",
    }
    # Only the bare "type/subtype" part identifies the container; parameters
    # after ';' (e.g. codec hints) must not defeat the lookup.
    base_type = mime_type.split(";", 1)[0].strip().lower()
    return known_formats.get(base_type, mime_type)
@contextlib.contextmanager
def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
    """Convert raw audio bytes to a temporary WAV file for the ``with`` body.

    The bytes are written to a temp input file, converted with ffmpeg to
    mono, 16 kHz, ``pcm_s16le`` WAV, and the path of the converted file is
    yielded. Both temp files are removed when the context exits, including
    when writing or converting fails part-way.

    Args:
        audio: Raw audio data to convert.
        mime_type: MIME type of ``audio``; selects the input file extension
            via ``convert_mime_type_to_format``.

    Yields:
        Path of the converted WAV file in the system temp directory.
    """
    temp_dir = tempfile.gettempdir()

    # Timestamped names keep successive conversions from overwriting each other.
    input_ext = convert_mime_type_to_format(mime_type)
    input_path = os.path.join(temp_dir, f"input_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.{input_ext}")
    output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")

    try:
        with open(input_path, 'wb') as f:
            f.write(audio)

        # Export to wav
        ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()

        print(f"Temporary file path: {output_path}")
        yield output_path
    finally:
        # Either file may be missing (conversion failed before creating the
        # output, or the caller already removed it); a missing file must not
        # mask the original error.
        for path in (input_path, output_path):
            with contextlib.suppress(FileNotFoundError):
                os.remove(path)
def run_command(command):
    """Run *command* as a subprocess and return its captured text output.

    Args:
        command: The command handed to ``subprocess.run`` unchanged
            (the caller in this file passes an argument list).

    Returns:
        A ``(stdout, stderr)`` tuple of decoded strings. The exit status is
        not checked here.
    """
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout, completed.stderr
def get_transcription(audio_bytes: bytearray, mime_type):
    """Transcribe in-memory audio with the locally built ``whisper-rust`` binary.

    The audio is first converted to a temporary WAV file, which is then
    passed to the executable together with the model named by the
    ``WHISPER_MODEL_PATH`` environment variable.

    Args:
        audio_bytes: Raw audio data.
        mime_type: MIME type describing ``audio_bytes``.

    Returns:
        The text the executable wrote to stdout.

    Raises:
        EnvironmentError: If ``WHISPER_MODEL_PATH`` is unset or empty.
    """
    with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path:
        whisper_model = os.getenv("WHISPER_MODEL_PATH")
        if not whisper_model:
            raise EnvironmentError("WHISPER_MODEL_PATH environment variable is not set.")

        # The binary is taken straight from the cargo release build directory
        # next to this module.
        whisper_binary = os.path.join(
            os.path.dirname(__file__), 'whisper-rust', 'target', 'release', 'whisper-rust'
        )

        # stderr from the subprocess is captured but not inspected.
        transcript, _ = run_command(
            [whisper_binary, '--model-path', whisper_model, '--file-path', wav_file_path]
        )
        print("Exciting transcription result:", transcript)
        return transcript

@ -1,40 +0,0 @@
import argparse
import asyncio
import websockets
import os
import json
# Define the function to send audio file in chunks
async def send_audio_in_chunks(file_path, chunk_size=4096):
    """Stream an audio file to the local ``/a`` websocket endpoint.

    Sends a JSON "start" command, then the file contents as binary frames of
    at most ``chunk_size`` bytes, then a JSON "end" command. One reply is
    read, parsed as JSON and printed before the connection is closed.

    Args:
        file_path: Path of the audio file to send.
        chunk_size: Number of bytes read from the file per frame.
    """
    start_command = json.dumps({"action": "command", "state": "start", "mimeType": "audio/webm"})
    end_command = json.dumps({"action": "command", "state": "end"})

    async with websockets.connect("ws://localhost:8000/a") as websocket:
        # Announce the upload together with its mime type.
        await websocket.send(start_command)

        # Read until the file returns an empty bytes object (end of file).
        with open(file_path, 'rb') as audio_file:
            for chunk in iter(lambda: audio_file.read(chunk_size), b""):
                await websocket.send(chunk)

        # Tell the server the upload is complete.
        await websocket.send(end_command)

        # Wait for a single JSON reply, then close the connection.
        reply = await websocket.recv()
        print("Received message:", json.loads(reply))

        await websocket.close()
# Parse command line arguments
parser = argparse.ArgumentParser(description="Send a webm audio file to the /a websocket endpoint and print the responses.")
parser.add_argument("file_path", help="The path to the webm audio file to send.")
args = parser.parse_args()

# Check if the file exists
if not os.path.isfile(args.file_path):
    print(args.file_path)
    print("Error: The file does not exist.")
    # `exit` is a helper installed by the `site` module and is not guaranteed
    # to exist; raising SystemExit gives the same exit status without it.
    raise SystemExit(1)

# asyncio.run creates the event loop, runs the coroutine to completion and
# closes the loop; calling get_event_loop() with no running loop is deprecated.
asyncio.run(send_audio_in_chunks(args.file_path))

@ -22,6 +22,10 @@ Official repository for [The 01 Project](https://twitter.com/hellokillian/status
python -m pip install -r requirements.txt python -m pip install -r requirements.txt
``` ```
3. **(optional) Download local audio models**
If you want to run local speech-to-text with Whisper, download a GGML Whisper model from [Huggingface](https://huggingface.co/ggerganov/whisper.cpp). Then, in `OS/01/start.sh`, set `ALL_LOCAL=True` and set `WHISPER_MODEL_PATH` to the path of the downloaded model.
## Usage ## Usage
1. **Navigate to the project directory.** 1. **Navigate to the project directory.**

Loading…
Cancel
Save