`bash OS/01/start.sh`

pull/11/head
killian 11 months ago
parent 43d3c4e266
commit 525fa27ceb

@ -6,7 +6,7 @@ Exposes a ws endpoint called /user. Things from there go into the queue. We also
In a while loop we watch the queue and handle it. In a while loop we watch the queue and handle it.
""" """
import os from starlette.websockets import WebSocketDisconnect
import ast import ast
import json import json
import time import time
@ -21,12 +21,12 @@ from starlette.websockets import WebSocket
from create_interpreter import create_interpreter from create_interpreter import create_interpreter
from stt import stt from stt import stt
from tts import tts from tts import tts
from pathlib import Path
# Create interpreter # Create interpreter
interpreter = create_interpreter() interpreter = create_interpreter()
script_dir = os.path.dirname(os.path.abspath(__file__)) conversation_history_path = Path(__file__).parent / 'conversations' / 'user.json'
conversation_history_path = os.path.join(script_dir, 'conversations', 'user.json')
# Create Queue objects # Create Queue objects
to_user = queue.Queue() to_user = queue.Queue()
@ -49,11 +49,16 @@ async def read_computer(item: dict):
async def websocket_endpoint(websocket: WebSocket): async def websocket_endpoint(websocket: WebSocket):
await websocket.accept() await websocket.accept()
while True: while True:
try:
data = await websocket.receive_json() data = await websocket.receive_json()
to_assistant.put(data) to_assistant.put(data)
while not to_user.empty(): while not to_user.empty():
message = to_user.get() message = to_user.get()
print("sending a message!")
await websocket.send_json(message) await websocket.send_json(message)
except WebSocketDisconnect:
pass
def queue_listener(): def queue_listener():
audio_file = bytearray() audio_file = bytearray()
@ -89,25 +94,32 @@ def queue_listener():
accumulated_text = "" accumulated_text = ""
for chunk in interpreter.chat(messages): for chunk in interpreter.chat(messages, stream=True):
# Send it to the user # Send it to the user
to_user.put(chunk) to_user.put(chunk)
# Speak full sentences out loud # Speak full sentences out loud
if chunk["type"] == "assistant": if chunk["role"] == "assistant" and "content" in chunk:
print("Chunk role is assistant and content is present in chunk.")
accumulated_text += chunk["content"] accumulated_text += chunk["content"]
print("Accumulated text: ", accumulated_text)
sentences = split_into_sentences(accumulated_text) sentences = split_into_sentences(accumulated_text)
print("Sentences after splitting: ", sentences)
if is_full_sentence(sentences[-1]): if is_full_sentence(sentences[-1]):
print("Last sentence is a full sentence.")
for sentence in sentences: for sentence in sentences:
for audio_chunk in tts(sentence): print("Streaming sentence: ", sentence)
to_user.put(audio_chunk) stream_tts_to_user(sentence)
accumulated_text = "" accumulated_text = ""
print("Reset accumulated text.")
else: else:
print("Last sentence is not a full sentence.")
for sentence in sentences[:-1]: for sentence in sentences[:-1]:
for audio_chunk in tts(sentence): print("Streaming sentence: ", sentence)
to_user.put(audio_chunk) stream_tts_to_user(sentence)
accumulated_text = sentences[-1] accumulated_text = sentences[-1]
print("Accumulated text is now the last sentence: ", accumulated_text)
# If we have a new message, save our progress and go back to the top # If we have a new message, save our progress and go back to the top
if not to_assistant.empty(): if not to_assistant.empty():
@ -115,6 +127,12 @@ def queue_listener():
json.dump(interpreter.messages, file) json.dump(interpreter.messages, file)
break break
def stream_tts_to_user(sentence):
    """Queue one sentence of synthesized speech for the user.

    Three messages are placed on ``to_user`` in order: a start marker, a
    message carrying the audio, and an end marker. Each is a dict with
    ``role="assistant"``, ``type="audio"`` and ``format="audio/mp3"``.

    Args:
        sentence: Text passed unchanged to ``tts``.
    """

    def audio_message(**extra):
        # Build a fresh dict per message so queued items never share state.
        return {"role": "assistant", "type": "audio", "format": "audio/mp3", **extra}

    # The start marker is queued before synthesis begins.
    to_user.put(audio_message(start=True))

    speech = tts(sentence)

    # The payload is the str() form of whatever tts returned.
    # NOTE(review): the receiving side presumably has to parse this text back
    # into bytes -- confirm against the client before changing the encoding.
    to_user.put(audio_message(content=str(speech)))

    to_user.put(audio_message(end=True))
# Create a thread for the queue listener # Create a thread for the queue listener
queue_thread = Thread(target=queue_listener) queue_thread = Thread(target=queue_listener)

@ -38,8 +38,6 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav") output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run() ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
print(f"Temporary file path: {output_path}")
try: try:
yield output_path yield output_path
finally: finally:

@ -2,41 +2,18 @@
Defines a function which takes text and returns a path to an audio file. Defines a function which takes text and returns a path to an audio file.
""" """
from openai import OpenAI
import pydub
import pydub.playback
import tempfile import tempfile
import os from openai import OpenAI
from datetime import datetime
from io import BytesIO
client = OpenAI() client = OpenAI()
chunk_size = 1024
read_chunk_size = 4096
def tts(text): def tts(text):
response = client.audio.speech.create(
temp_dir = tempfile.gettempdir()
output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.mp3")
try:
with (
client.with_streaming_response.audio.speech.create(
model="tts-1", model="tts-1",
voice="alloy", voice="alloy",
input=text, input=text,
response_format='mp3', response_format="mp3"
speed=1.2) )
) as response: with tempfile.NamedTemporaryFile() as temp_file:
with open(output_path, 'wb') as f: response.stream_to_file(temp_file.name)
for chunk in response.iter_bytes(chunk_size): return temp_file.read()
f.write(chunk)
with open(output_path, 'rb') as f:
byte_chunk = f.read(read_chunk_size)
yield byte_chunk
seg = pydub.AudioSegment.from_mp3(output_path)
pydub.playback.play(seg)
finally:
os.remove(output_path)

@ -8,3 +8,4 @@ websockets
python-dotenv python-dotenv
ffmpeg-python ffmpeg-python
textual textual
pydub

@ -6,6 +6,8 @@ import pyaudio
from queue import Queue from queue import Queue
from pynput import keyboard from pynput import keyboard
import json import json
import pydub
import ast
# Configuration for Audio Recording # Configuration for Audio Recording
CHUNK = 1024 # Record in chunks of 1024 samples CHUNK = 1024 # Record in chunks of 1024 samples
@ -77,22 +79,50 @@ def toggle_recording(state):
async def websocket_communication(): async def websocket_communication():
"""Handle WebSocket communication and listen for incoming messages.""" """Handle WebSocket communication and listen for incoming messages."""
while True:
try:
async with websockets.connect(WS_URL) as websocket: async with websockets.connect(WS_URL) as websocket:
print("Press the spacebar to start/stop recording. Press ESC to exit.")
while True: while True:
# Send data from the queue to the server # Send data from the queue to the server
while not data_queue.empty(): while not data_queue.empty():
data = data_queue.get_nowait() data = data_queue.get_nowait()
print(f"Sending data to the server: {data}")
await websocket.send(json.dumps(data)) await websocket.send(json.dumps(data))
# Listen for incoming messages from the server # Listen for incoming messages from the server
try: try:
incoming_message = await asyncio.wait_for(websocket.recv(), timeout=1.0) chunk = await websocket.recv()
print(f"Received from server: {incoming_message}") print(f"Received from server: {str(chunk)[:100]}")
except asyncio.TimeoutError:
# No message received within timeout period if chunk["type"] == "audio":
pass print("Received audio data from server.")
if "start" in chunk:
print("Start of audio data received.")
audio_chunks = bytearray()
if "content" in chunk:
print("Audio content received.")
audio_chunks.extend(bytes(ast.literal_eval(chunk["content"])))
if "end" in chunk:
print("End of audio data received.")
with tempfile.NamedTemporaryFile(suffix=".mp3") as f:
f.write(audio_chunks)
f.seek(0)
seg = pydub.AudioSegment.from_mp3(f.name)
print("Playing received audio.")
pydub.playback.play(seg)
except Exception as e:
print(f"Error receiving data: {e}")
print("Sleeping for 0.05 seconds.")
await asyncio.sleep(0.05)
except Exception as e:
print(f"Websocket not ready, retrying... ({e})")
await asyncio.sleep(1)
await asyncio.sleep(0.1)
def on_press(key): def on_press(key):
@ -101,9 +131,12 @@ def on_press(key):
toggle_recording(True) toggle_recording(True)
def on_release(key): def on_release(key):
"""Detect spacebar release.""" """Detect spacebar release and ESC key press."""
if key == keyboard.Key.space: if key == keyboard.Key.space:
toggle_recording(False) toggle_recording(False)
elif key == keyboard.Key.esc:
print("Exiting...")
os._exit(0)
def main(): def main():
# Start the WebSocket communication in a separate asyncio event loop # Start the WebSocket communication in a separate asyncio event loop
@ -112,7 +145,7 @@ def main():
# Keyboard listener for spacebar press/release # Keyboard listener for spacebar press/release
with keyboard.Listener(on_press=on_press, on_release=on_release) as listener: with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
print("Press the spacebar to start/stop recording. Press ESC to exit.") print("In a moment, press the spacebar to start/stop recording. Press ESC to exit.")
listener.join() listener.join()
p.terminate() p.terminate()

Loading…
Cancel
Save