8th Architecture

pull/11/head
killian 11 months ago
parent e75fa90a48
commit 47b31fb5e1

@ -1 +1 @@
[{"role": "user", "type": "message", "content": "\ub2e4\uc74c \uc601\uc0c1\uc5d0\uc11c \ub9cc\ub098\uc694!\n"}] [{"role": "user", "type": "message", "content": "Yeah, it's explaining why you have to be a paramedic.\n"}, {"role": "user", "type": "message", "content": "\uc5b4\ub9b4\ub54c \uad1c\ucc2e\uc558\ub294\ub370 \uc544 \uadf8\ub798\uc11c \uc544\uce68\uc5d0 \uc9c4\uc9dc \uc548\uac00\uc9c0\uace0 \uc654\ub098\ubd10\uc694 \uc57c \ub098 \uc6ec\ub9cc\ud07c \ub9db\uc788\ub294\ub370\n"}, {"role": "user", "type": "message", "content": "Like, you'd have to go, like, out of houses.\n"}]

@ -6,6 +6,7 @@ from starlette.websockets import WebSocket
from queue import Queue from queue import Queue
from pynput import keyboard from pynput import keyboard
import json import json
import traceback
import websockets import websockets
import queue import queue
import pydub import pydub
@ -13,11 +14,13 @@ import ast
from pydub import AudioSegment from pydub import AudioSegment
from pydub.playback import play from pydub.playback import play
import io import io
import time
import wave import wave
import tempfile import tempfile
from datetime import datetime from datetime import datetime
from utils.check_filtered_kernel import check_filtered_kernel
from interpreter import interpreter # Just for code execution. Maybe we should let people do from interpreter.computer import run? from interpreter import interpreter # Just for code execution. Maybe we should let people do from interpreter.computer import run?
from utils.put_kernel_messages_into_queue import put_kernel_messages_into_queue
from stt import stt_wav
# Configuration for Audio Recording # Configuration for Audio Recording
CHUNK = 1024 # Record in chunks of 1024 samples CHUNK = 1024 # Record in chunks of 1024 samples
@ -36,6 +39,16 @@ if not WS_URL:
p = pyaudio.PyAudio() p = pyaudio.PyAudio()
def record_audio(): def record_audio():
if os.getenv('STT_RUNNER') == "server":
# STT will happen on the server. we're sending audio.
send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "start": True})
elif os.getenv('STT_RUNNER') == "device":
# STT will happen here, on the device. we're sending text.
send_queue.put({"role": "user", "type": "message", "start": True})
else:
raise Exception("STT_RUNNER must be set to either 'device' or 'server'.")
"""Record audio from the microphone and add it to the queue.""" """Record audio from the microphone and add it to the queue."""
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
print("Recording started...") print("Recording started...")
@ -65,7 +78,19 @@ def record_audio():
send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "content": str(byte_data)}) send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "content": str(byte_data)})
byte_data = audio_file.read(CHUNK) byte_data = audio_file.read(CHUNK)
send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "end": True}) if os.getenv('STT_RUNNER') == "device":
text = stt_wav(wav_path)
send_queue.put({"role": "user", "type": "message", "content": text})
if os.getenv('STT_RUNNER') == "server":
# STT will happen on the server. we sent audio.
send_queue.put({"role": "user", "type": "audio", "format": "audio/wav", "end": True})
elif os.getenv('STT_RUNNER') == "device":
# STT will happen here, on the device. we sent text.
send_queue.put({"role": "user", "type": "message", "end": True})
if os.path.exists(wav_path):
os.remove(wav_path)
def toggle_recording(state): def toggle_recording(state):
@ -114,11 +139,13 @@ async def websocket_communication(WS_URL):
async for message in websocket: async for message in websocket:
print(message)
if "content" in message_so_far: if "content" in message_so_far:
if any(message_so_far[key] != message[key] for key in message_so_far): if any(message_so_far[key] != message[key] for key in message_so_far):
message_so_far = message message_so_far = message
else: else:
message_so_far["content"] += message message_so_far["content"] += message["content"]
if message["type"] == "audio" and "content" in message: if message["type"] == "audio" and "content" in message:
audio_bytes = bytes(ast.literal_eval(message["content"])) audio_bytes = bytes(ast.literal_eval(message["content"]))
@ -140,21 +167,24 @@ async def websocket_communication(WS_URL):
result = interpreter.computer.run(language, code) result = interpreter.computer.run(language, code)
send_queue.put(result) send_queue.put(result)
except: except:
traceback.print_exc()
print(f"Connecting to `{WS_URL}`...") print(f"Connecting to `{WS_URL}`...")
await asyncio.sleep(2) await asyncio.sleep(2)
def main():
# Start the WebSocket communication in a separate asyncio event loop
ws_thread = threading.Thread(target=lambda: asyncio.run(websocket_communication(WS_URL)), daemon=True)
ws_thread.start()
# Keyboard listener for spacebar press/release if __name__ == "__main__":
with keyboard.Listener(on_press=on_press, on_release=on_release) as listener: async def main():
listener.join() # Start the WebSocket communication
asyncio.create_task(websocket_communication(WS_URL))
p.terminate() # Start watching the kernel if it's your job to do that
if os.getenv('CODE_RUNNER') == "device":
asyncio.create_task(put_kernel_messages_into_queue(send_queue))
if __name__ == "__main__": # Keyboard listener for spacebar press/release
main() listener = keyboard.Listener(on_press=on_press, on_release=on_release)
listener.start()
asyncio.run(main())
p.terminate()

@ -4,19 +4,22 @@ import json
import time import time
import queue import queue
import os import os
import traceback
from queue import Queue from queue import Queue
from threading import Thread from threading import Thread
import threading
import uvicorn import uvicorn
import re import re
from fastapi import FastAPI from fastapi import FastAPI
from threading import Thread from threading import Thread
from starlette.websockets import WebSocket from starlette.websockets import WebSocket
from stt import stt from stt import stt_bytes
from tts import tts from tts import tts
from pathlib import Path from pathlib import Path
import asyncio import asyncio
from i import configure_interpreter
import urllib.parse import urllib.parse
from utils.put_kernel_messages_into_queue import put_kernel_messages_into_queue
from i import configure_interpreter
from interpreter import interpreter from interpreter import interpreter
app = FastAPI() app = FastAPI()
@ -30,10 +33,10 @@ def is_full_sentence(text):
def split_into_sentences(text): def split_into_sentences(text):
return re.split(r'(?<=[.!?])\s+', text) return re.split(r'(?<=[.!?])\s+', text)
# Global queues # Queues
receive_queue = queue.Queue() from_computer = queue.Queue() # Just for computer messages from the device. Sync queue because interpreter.run is synchronous
send_queue = queue.Queue() from_user = asyncio.Queue() # Just for user messages from the device.
recieve_computer_queue = queue.Queue() # Just for computer messages from the device to_device = asyncio.Queue() # For messages we send.
# Switch code executor to device if that's set # Switch code executor to device if that's set
@ -56,14 +59,14 @@ if os.getenv('CODE_RUNNER') == "device":
# Unless it was just sent to the device, send it wrapped in flags # Unless it was just sent to the device, send it wrapped in flags
if not (interpreter.messages and interpreter.messages[-1] == message): if not (interpreter.messages and interpreter.messages[-1] == message):
send_queue.put({"role": "assistant", "type": "code", "format": "python", "start": True}) to_device.put({"role": "assistant", "type": "code", "format": "python", "start": True})
send_queue.put(message) to_device.put(message)
send_queue.put({"role": "assistant", "type": "code", "format": "python", "end": True}) to_device.put({"role": "assistant", "type": "code", "format": "python", "end": True})
# Stream the response # Stream the response
print("Waiting for the device to respond...") print("Waiting for the device to respond...")
while True: while True:
chunk = recieve_computer_queue.get() chunk = from_computer.get()
print("Server recieved from device:", chunk) print("Server recieved from device:", chunk)
if "end" in chunk: if "end" in chunk:
break break
@ -87,47 +90,52 @@ async def websocket_endpoint(websocket: WebSocket):
await websocket.accept() await websocket.accept()
receive_task = asyncio.create_task(receive_messages(websocket)) receive_task = asyncio.create_task(receive_messages(websocket))
send_task = asyncio.create_task(send_messages(websocket)) send_task = asyncio.create_task(send_messages(websocket))
await asyncio.gather(receive_task, send_task) try:
await asyncio.gather(receive_task, send_task)
except Exception as e:
traceback.print_exc()
print(f"Connection lost. Error: {e}")
async def receive_messages(websocket: WebSocket): async def receive_messages(websocket: WebSocket):
while True: while True:
data = await websocket.receive_text() data = await websocket.receive_json()
if type(data) == dict and data["role"] == "computer": if data["role"] == "computer":
recieve_computer_queue.put(data) # To be handled by interpreter.computer.run from_computer.put(data) # To be handled by interpreter.computer.run
elif data["role"] == "user":
await from_user.put(data)
else: else:
receive_queue.put(data) raise("Unknown role:", data)
async def send_messages(websocket: WebSocket): async def send_messages(websocket: WebSocket):
while True: while True:
message = await asyncio.get_event_loop().run_in_executor(None, send_queue.get) message = await to_device.get()
print(message) print(message)
await websocket.send_json(message) await websocket.send_json(message)
def queue_listener(): async def user_listener():
audio_file = bytearray() audio_bytes = bytearray()
while True: while True:
# Check 10x a second for new messages message = await from_user.get()
while receive_queue.empty():
time.sleep(0.1)
message = receive_queue.get()
message = json.loads(message)
# Hold the audio in a buffer. If it's ready (we got end flag, stt it) # Hold the audio in a buffer. If it's ready (we got end flag, stt it)
if message["type"] == "audio": if message["type"] == "audio":
if "content" in message: if "content" in message:
audio_file.extend(bytes(ast.literal_eval(message["content"]))) audio_bytes.extend(bytes(ast.literal_eval(message["content"])))
if "end" in message: if "end" in message:
content = stt(audio_file, message["format"]) content = stt_bytes(audio_bytes, message["format"])
if content == None: # If it was nothing / silence if content == None: # If it was nothing / silence
continue continue
audio_file = bytearray() audio_bytes = bytearray()
message = {"role": "user", "type": "message", "content": content} message = {"role": "user", "type": "message", "content": content}
else: else:
continue continue
# Ignore flags, we only needed them for audio ^
if "content" not in message:
continue
# Custom stop message will halt us # Custom stop message will halt us
if message.get("content") and message.get("content").lower().strip(".,!") == "stop": if message["content"].lower().strip(".,!") == "stop":
continue continue
# Load, append, and save conversation history # Load, append, and save conversation history
@ -142,53 +150,59 @@ def queue_listener():
for chunk in interpreter.chat(messages, stream=True): for chunk in interpreter.chat(messages, stream=True):
# Send it to the user # Send it to the user
send_queue.put(chunk) await to_device.put(chunk)
# Speak full sentences out loud # Speak full sentences out loud
if chunk["role"] == "assistant" and "content" in chunk: if chunk["role"] == "assistant" and "content" in chunk:
print("Chunk role is assistant and content is present in chunk.")
accumulated_text += chunk["content"] accumulated_text += chunk["content"]
print("Accumulated text: ", accumulated_text)
sentences = split_into_sentences(accumulated_text) sentences = split_into_sentences(accumulated_text)
print("Sentences after splitting: ", sentences)
if is_full_sentence(sentences[-1]): if is_full_sentence(sentences[-1]):
print("Last sentence is a full sentence.")
for sentence in sentences: for sentence in sentences:
print("Streaming sentence: ", sentence) await stream_or_play_tts(sentence)
stream_tts_to_user(sentence)
accumulated_text = "" accumulated_text = ""
print("Reset accumulated text.")
else: else:
print("Last sentence is not a full sentence.")
for sentence in sentences[:-1]: for sentence in sentences[:-1]:
print("Streaming sentence: ", sentence) await stream_or_play_tts(sentence)
stream_tts_to_user(sentence)
accumulated_text = sentences[-1] accumulated_text = sentences[-1]
print("Accumulated text is now the last sentence: ", accumulated_text)
# If we have a new message, save our progress and go back to the top # If we have a new message, save our progress and go back to the top
if not receive_queue.empty(): if not from_user.empty():
with open(conversation_history_path, 'w') as file: with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file) json.dump(interpreter.messages, file)
break break
def stream_tts_to_user(sentence): async def stream_or_play_tts(sentence):
send_queue.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "start": True})
audio_bytes = tts(sentence) if os.getenv('TTS_RUNNER') == "server":
send_queue.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "content": str(audio_bytes)}) tts(sentence, play_audio=True)
send_queue.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "end": True}) else:
await to_device.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "start": True})
audio_bytes = tts(sentence, play_audio=False)
await to_device.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "content": str(audio_bytes)})
await to_device.put({"role": "assistant", "type": "audio", "format": "audio/mp3", "end": True})
# Create a thread for the queue listener
queue_thread = Thread(target=queue_listener)
# Start the queue listener thread from uvicorn import Config, Server
queue_thread.start()
# Run the FastAPI app # Run the FastAPI app
if __name__ == "__main__": if __name__ == "__main__":
server_url = os.getenv('SERVER_URL')
if not server_url: async def main():
raise ValueError("The environment variable SERVER_URL is not set. Please set it to proceed.") # Start listening to the user
parsed_url = urllib.parse.urlparse(server_url) asyncio.create_task(user_listener())
print("Starting `server.py`...")
uvicorn.run(app, host=parsed_url.hostname, port=parsed_url.port) # Start watching the kernel if it's your job to do that
if os.getenv('CODE_RUNNER') == "server":
asyncio.create_task(put_kernel_messages_into_queue(from_computer))
server_url = os.getenv('SERVER_URL')
if not server_url:
raise ValueError("The environment variable SERVER_URL is not set. Please set it to proceed.")
parsed_url = urllib.parse.urlparse(server_url)
print("Starting `server.py`...")
config = Config(app, host=parsed_url.hostname, port=parsed_url.port, lifespan='on')
server = Server(config)
await server.serve()
asyncio.run(main())

@ -12,7 +12,7 @@ export DEVICE_START=True
# Control where various operations happen— can be `device` or `server`. # Control where various operations happen— can be `device` or `server`.
export CODE_RUNNER=server export CODE_RUNNER=server
export TTS_RUNNER=device # If server, audio will be sent over websocket. export TTS_RUNNER=server # If device, audio will be sent over websocket.
export STT_RUNNER=device # If server, audio will be sent over websocket. export STT_RUNNER=device # If server, audio will be sent over websocket.
# Will expose the server publically and display that URL. # Will expose the server publically and display that URL.
@ -22,10 +22,14 @@ export SERVER_EXPOSE_PUBLICALLY=False
# (for dev, reset the ports we were using) # (for dev, reset the ports we were using)
PORT=$(echo $SERVER_URL | grep -oE "[0-9]+") SERVER_PORT=$(echo $SERVER_URL | grep -oE "[0-9]+")
lsof -ti tcp:$PORT | xargs kill if [ -n "$SERVER_PORT" ]; then
PORT=$(echo $DEVICE_URL | grep -oE "[0-9]+") lsof -ti tcp:$SERVER_PORT | xargs kill
lsof -ti tcp:$PORT | xargs kill fi
DEVICE_PORT=$(echo $DEVICE_URL | grep -oE "[0-9]+")
if [ -n "$DEVICE_PORT" ]; then
lsof -ti tcp:$DEVICE_PORT | xargs kill
fi
# Check the current Python version # Check the current Python version
PYTHON_VERSION=$(python -V 2>&1 | cut -d " " -f 2 | cut -d "." -f 1-2) PYTHON_VERSION=$(python -V 2>&1 | cut -d " " -f 2 | cut -d "." -f 1-2)

@ -44,18 +44,21 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
os.remove(input_path) os.remove(input_path)
os.remove(output_path) os.remove(output_path)
def stt(audio_bytes: bytearray, mime_type): def stt_bytes(audio_bytes: bytearray, mime_type="audio/wav"):
with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path: with export_audio_to_wav_ffmpeg(audio_bytes, mime_type) as wav_file_path:
audio_file = open(wav_file_path, "rb") return stt_wav(wav_file_path)
try:
transcript = client.audio.transcriptions.create( def stt_wav(wav_file_path: str):
model="whisper-1", audio_file = open(wav_file_path, "rb")
file=audio_file, try:
response_format="text" transcript = client.audio.transcriptions.create(
) model="whisper-1",
except openai.BadRequestError as e: file=audio_file,
print("openai.BadRequestError:", e) response_format="text"
return None )
except openai.BadRequestError as e:
print("Exciting transcription result:", transcript) print("openai.BadRequestError:", e)
return transcript return None
print("Exciting transcription result:", transcript)
return transcript

@ -9,7 +9,7 @@ from pydub.playback import play
client = OpenAI() client = OpenAI()
def tts(text): def tts(text, play_audio):
response = client.audio.speech.create( response = client.audio.speech.create(
model="tts-1", model="tts-1",
voice="alloy", voice="alloy",
@ -19,9 +19,10 @@ def tts(text):
with tempfile.NamedTemporaryFile() as temp_file: with tempfile.NamedTemporaryFile() as temp_file:
response.stream_to_file(temp_file.name) response.stream_to_file(temp_file.name)
audio = AudioSegment.from_file(temp_file.name, format="mp3") if play_audio:
# Gradual fade in and out over 0.2 seconds audio = AudioSegment.from_file(temp_file.name, format="mp3")
audio = audio.fade_in(200).fade_out(200) # Gradual fade in and out over 0.2 seconds
play(audio) audio = audio.fade_in(200).fade_out(200)
play(audio)
return temp_file.read() return temp_file.read()

@ -1,5 +1,13 @@
"""
Watches the kernel. When it sees something that passes a filter,
it sends a POST request with that to /computer.
"""
import subprocess import subprocess
import time
import requests
import platform import platform
import os
def get_kernel_messages(): def get_kernel_messages():
""" """
@ -35,6 +43,7 @@ def custom_filter(message):
last_messages = "" last_messages = ""
def check_filtered_kernel(): def check_filtered_kernel():
while True:
messages = get_kernel_messages() messages = get_kernel_messages()
messages.replace(last_messages, "") messages.replace(last_messages, "")
messages = messages.split("\n") messages = messages.split("\n")
@ -43,4 +52,4 @@ def check_filtered_kernel():
for message in messages: for message in messages:
if custom_filter(message): if custom_filter(message):
filtered_messages.append(message) filtered_messages.append(message)
return filtered_messages return "\n".join(filtered_messages)

@ -0,0 +1,17 @@
from .check_filtered_kernel import check_filtered_kernel
import asyncio
async def put_kernel_messages_into_queue(queue):
    """Forward filtered kernel output to `queue`, polling forever.

    On each pass `check_filtered_kernel()` is called. When it returns
    something truthy, three messages are enqueued in order: a start flag,
    the text as console output, and an end flag. An `asyncio.Queue` has its
    `put` awaited; any other queue object has `put` called synchronously.
    The coroutine then sleeps for 5 seconds and repeats; it never returns.
    """
    while True:
        text = check_filtered_kernel()

        if text:
            # Fresh dicts on every pass so consumers never share message objects.
            console_messages = (
                {"role": "computer", "type": "console", "start": True},
                {"role": "computer", "type": "console", "format": "output", "content": text},
                {"role": "computer", "type": "console", "end": True},
            )
            needs_await = isinstance(queue, asyncio.Queue)
            for console_message in console_messages:
                if needs_await:
                    await queue.put(console_message)
                else:
                    queue.put(console_message)

        # NOTE(review): assumes the sleep runs on every pass, whether or not
        # text was found — confirm against the original indentation.
        await asyncio.sleep(5)
Loading…
Cancel
Save