parent fef311e5b3
commit 4640b4f1a0
@@ -0,0 +1,482 @@
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

import subprocess
import os
import sys
import asyncio
import threading
import pyaudio
from pynput import keyboard
import json
import traceback
import websockets
import queue
from pydub import AudioSegment
from pydub.playback import play
import time
import wave
import tempfile
from datetime import datetime
import cv2
import base64
import platform
from interpreter import (
    interpreter,
)  # Just for code execution. Maybe we should let people do from interpreter.computer import run?

# In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
from ..server.utils.kernel import put_kernel_messages_into_queue
from ..server.utils.get_system_info import get_system_info
from ..server.utils.process_utils import kill_process_tree

from ..server.utils.logs import setup_logging
from ..server.utils.logs import logger

setup_logging()

os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"

from ..utils.accumulator import Accumulator

accumulator = Accumulator()

# Configuration for Audio Recording
CHUNK = 1024  # Record in chunks of 1024 samples
FORMAT = pyaudio.paInt16  # 16 bits per sample
CHANNELS = 1  # Mono
RATE = 16000  # Sample rate
RECORDING = False  # Flag to control recording state
SPACEBAR_PRESSED = False  # Flag to track spacebar press state

# Camera configuration
CAMERA_ENABLED = os.getenv("CAMERA_ENABLED", False)
if type(CAMERA_ENABLED) == str:
    CAMERA_ENABLED = CAMERA_ENABLED.lower() == "true"
CAMERA_DEVICE_INDEX = int(os.getenv("CAMERA_DEVICE_INDEX", 0))
CAMERA_WARMUP_SECONDS = float(os.getenv("CAMERA_WARMUP_SECONDS", 0))
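
# A quick sanity check on what these settings imply (illustrative arithmetic only):
# 16,000 frames/s * 2 bytes/frame (paInt16) * 1 channel = 32,000 bytes/s of raw PCM,
# and each CHUNK of 1024 frames covers 1024 / 16000 ≈ 64 ms of audio.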

# Specify OS
current_platform = get_system_info()


def is_win11():
    return sys.getwindowsversion().build >= 22000


def is_win10():
    try:
        return (
            platform.system() == "Windows"
            and "10" in platform.version()
            and not is_win11()
        )
    except:
        return False


# Initialize PyAudio
p = pyaudio.PyAudio()

send_queue = queue.Queue()


class Device:
    def __init__(self):
        self.pressed_keys = set()
        self.captured_images = []
        self.audiosegments = asyncio.Queue()
        self.server_url = ""
        self.ctrl_pressed = False
        self.tts_service = ""
        self.debug = False
        self.playback_latency = None

    def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
        """Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
        image_path = None

        cap = cv2.VideoCapture(camera_index)
        ret, frame = cap.read()  # Capture a single frame to initialize the camera

        if CAMERA_WARMUP_SECONDS > 0:
            # Allow camera to warm up, then snap a picture again
            # This is a workaround for some cameras that don't return a properly exposed
            # picture immediately when they are first turned on
            time.sleep(CAMERA_WARMUP_SECONDS)
            ret, frame = cap.read()

        if ret:
            temp_dir = tempfile.gettempdir()
            image_path = os.path.join(
                temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png"
            )
            self.captured_images.append(image_path)
            cv2.imwrite(image_path, frame)
            logger.info(f"Camera image captured to {image_path}")
            logger.info(
                f"You now have {len(self.captured_images)} images which will be sent along with your next audio message."
            )
        else:
            logger.error(
                f"Error: Couldn't capture an image from camera ({camera_index})"
            )

        cap.release()

        return image_path

    def encode_image_to_base64(self, image_path):
        """Encodes an image file to a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def add_image_to_send_queue(self, image_path):
        """Encodes an image and adds an LMC message to the send queue with the image data."""
        base64_image = self.encode_image_to_base64(image_path)
        image_message = {
            "role": "user",
            "type": "image",
            "format": "base64.png",
            "content": base64_image,
        }
        send_queue.put(image_message)
        # Delete the image file from the file system after sending it
        os.remove(image_path)

    def queue_all_captured_images(self):
        """Queues all captured images to be sent."""
        for image_path in self.captured_images:
            self.add_image_to_send_queue(image_path)
        self.captured_images.clear()  # Clear the list after sending
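
    # Note: a receiver of the LMC image message built above can reverse the encoding
    # with, e.g. (illustrative, not part of this client):
    #     with open("received.png", "wb") as f:
    #         f.write(base64.b64decode(image_message["content"]))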

    async def play_audiosegments(self):
        """Plays queued audio segments sequentially."""

        if self.tts_service == "elevenlabs":
            print("Ensure `mpv` is installed to use `elevenlabs`.\n\n(On macOS, you can run `brew install mpv`.)")
            mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"]
            mpv_process = subprocess.Popen(
                mpv_command,
                stdin=subprocess.PIPE,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )

        while True:
            try:
                audio = await self.audiosegments.get()
                if self.debug and self.playback_latency and isinstance(audio, bytes):
                    elapsed_time = time.time() - self.playback_latency
                    print(f"Time from request to playback: {elapsed_time} seconds")
                    self.playback_latency = None

                if self.tts_service == "elevenlabs":
                    mpv_process.stdin.write(audio)  # type: ignore
                    mpv_process.stdin.flush()  # type: ignore
                else:
                    play(audio)

                await asyncio.sleep(0.1)
            except asyncio.exceptions.CancelledError:
                # This happens once at the start?
                pass
            except:
                logger.info(traceback.format_exc())
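
    # In the elevenlabs branch above, `fd://0` makes mpv read encoded audio from
    # stdin, so chunks can be streamed into the player as they arrive. The same
    # pattern works standalone (illustrative input bytes):
    #     proc = subprocess.Popen(
    #         ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"],
    #         stdin=subprocess.PIPE,
    #     )
    #     proc.stdin.write(audio_bytes)
    #     proc.stdin.close()
    #     proc.wait()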

    def record_audio(self):
        if os.getenv("STT_RUNNER") == "server":
            # STT will happen on the server. we're sending audio.
            send_queue.put(
                {"role": "user", "type": "audio", "format": "bytes.wav", "start": True}
            )
        elif os.getenv("STT_RUNNER") == "client":
            # STT will happen here, on the client. we're sending text.
            send_queue.put({"role": "user", "type": "message", "start": True})
        else:
            raise Exception("STT_RUNNER must be set to either 'client' or 'server'.")

        """Record audio from the microphone and add it to the queue."""
        stream = p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK,
        )
        print("Recording started...")
        global RECORDING

        # Create a temporary WAV file to store the audio data
        temp_dir = tempfile.gettempdir()
        wav_path = os.path.join(
            temp_dir, f"audio_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav"
        )
        wav_file = wave.open(wav_path, "wb")
        wav_file.setnchannels(CHANNELS)
        wav_file.setsampwidth(p.get_sample_size(FORMAT))
        wav_file.setframerate(RATE)

        while RECORDING:
            data = stream.read(CHUNK, exception_on_overflow=False)
            wav_file.writeframes(data)

        wav_file.close()
        stream.stop_stream()
        stream.close()
        print("Recording stopped.")
        if self.debug:
            self.playback_latency = time.time()

        duration = wav_file.getnframes() / RATE
        if duration < 0.3:
            # Just pressed it. Send stop message
            if os.getenv("STT_RUNNER") == "client":
                send_queue.put({"role": "user", "type": "message", "content": "stop"})
                send_queue.put({"role": "user", "type": "message", "end": True})
            else:
                send_queue.put(
                    {
                        "role": "user",
                        "type": "audio",
                        "format": "bytes.wav",
                        "content": "",
                    }
                )
                send_queue.put(
                    {
                        "role": "user",
                        "type": "audio",
                        "format": "bytes.wav",
                        "end": True,
                    }
                )
        else:
            self.queue_all_captured_images()

            if os.getenv("STT_RUNNER") == "client":
                # THIS DOES NOT WORK. We moved to this very cool stt_service, llm_service
                # way of doing things. stt_wav is not a thing anymore. Needs work to work

                # Run stt then send text
                text = stt_wav(wav_path)
                logger.debug(f"STT result: {text}")
                send_queue.put({"role": "user", "type": "message", "content": text})
                send_queue.put({"role": "user", "type": "message", "end": True})
            else:
                # Stream audio
                with open(wav_path, "rb") as audio_file:
                    byte_data = audio_file.read(CHUNK)
                    while byte_data:
                        send_queue.put(byte_data)
                        byte_data = audio_file.read(CHUNK)
                send_queue.put(
                    {
                        "role": "user",
                        "type": "audio",
                        "format": "bytes.wav",
                        "end": True,
                    }
                )

        if os.path.exists(wav_path):
            os.remove(wav_path)
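
    # The branch above frames an utterance on the wire as: one start-flag LMC
    # message, then the WAV file's bytes in CHUNK-sized pieces, then an end-flag
    # message. An illustrative consumer (names hypothetical) reassembles it like:
    #     buffer = bytearray()
    #     while True:
    #         item = q.get()
    #         if isinstance(item, bytes):
    #             buffer.extend(item)
    #         elif item.get("end"):
    #             break  # bytes(buffer) now holds the complete WAV contents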

    def toggle_recording(self, state):
        """Toggle the recording state."""
        global RECORDING, SPACEBAR_PRESSED
        if state and not SPACEBAR_PRESSED:
            SPACEBAR_PRESSED = True
            if not RECORDING:
                RECORDING = True
                threading.Thread(target=self.record_audio).start()
        elif not state and SPACEBAR_PRESSED:
            SPACEBAR_PRESSED = False
            RECORDING = False

    def on_press(self, key):
        """Detect spacebar press and Ctrl+C combination."""
        self.pressed_keys.add(key)  # Add the pressed key to the set

        if keyboard.Key.space in self.pressed_keys:
            self.toggle_recording(True)
        elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
            logger.info("Ctrl+C pressed. Exiting...")
            kill_process_tree()
            os._exit(0)

        # Windows alternative to the above
        if key == keyboard.Key.ctrl_l:
            self.ctrl_pressed = True

        try:
            if key.vk == 67 and self.ctrl_pressed:
                logger.info("Ctrl+C pressed. Exiting...")
                kill_process_tree()
                os._exit(0)
        # For non-character keys
        except:
            pass

    def on_release(self, key):
        """Detect spacebar release and 'c' key press for camera, and handle key release."""
        self.pressed_keys.discard(
            key
        )  # Remove the released key from the key press tracking set

        if key == keyboard.Key.ctrl_l:
            self.ctrl_pressed = False
        if key == keyboard.Key.space:
            self.toggle_recording(False)
        elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"):
            self.fetch_image_from_camera()

    async def message_sender(self, websocket):
        while True:
            message = await asyncio.get_event_loop().run_in_executor(
                None, send_queue.get
            )
            if isinstance(message, bytes):
                await websocket.send(message)
            else:
                await websocket.send(json.dumps(message))
            send_queue.task_done()
            await asyncio.sleep(0.01)
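
    # send_queue is a thread-safe queue.Queue fed from the recording thread, so its
    # blocking get() is handed to the default executor above rather than awaited
    # directly, which would stall the event loop. run_in_executor(None, fn) runs fn
    # in a worker thread and yields an awaitable for its result.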

    async def websocket_communication(self, WS_URL):
        show_connection_log = True

        async def exec_ws_communication(websocket):
            if CAMERA_ENABLED:
                print(
                    "\nHold the spacebar to start recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit."
                )
            else:
                print("\nHold the spacebar to start recording. Press CTRL-C to exit.")

            asyncio.create_task(self.message_sender(websocket))

            while True:
                await asyncio.sleep(0.01)
                chunk = await websocket.recv()

                logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
                # print("received chunk from server")

                if type(chunk) == str:
                    chunk = json.loads(chunk)

                if chunk.get("type") == "config":
                    self.tts_service = chunk.get("tts_service")
                    continue

                if self.tts_service == "elevenlabs":
                    message = chunk
                else:
                    message = accumulator.accumulate(chunk)

                if message == None:
                    # Will be None until we have a full message ready
                    continue

                # At this point, we have our message
                if isinstance(message, bytes) or (
                    message["type"] == "audio" and message["format"].startswith("bytes")
                ):
                    # Convert bytes to audio file
                    if self.tts_service == "elevenlabs":
                        audio_bytes = message
                        audio = audio_bytes
                    else:
                        audio_bytes = message["content"]

                        # Create an AudioSegment instance with the raw data
                        audio = AudioSegment(
                            # raw audio data (bytes)
                            data=audio_bytes,
                            # signed 16-bit little-endian format
                            sample_width=2,
                            # 22,050 Hz frame rate
                            frame_rate=22050,
                            # mono sound
                            channels=1,
                        )

                    await self.audiosegments.put(audio)

                # Run the code if that's the client's job
                if os.getenv("CODE_RUNNER") == "client":
                    if message["type"] == "code" and "end" in message:
                        language = message["format"]
                        code = message["content"]
                        result = interpreter.computer.run(language, code)
                        send_queue.put(result)

        if is_win10():
            logger.info("Windows 10 detected")
            # Workaround for Windows 10 not latching to the websocket server.
            # See https://github.com/OpenInterpreter/01/issues/197
            try:
                ws = websockets.connect(WS_URL)
                await exec_ws_communication(ws)
            except Exception as e:
                logger.error(f"Error while attempting to connect: {e}")
        else:
            while True:
                try:
                    async with websockets.connect(WS_URL) as websocket:
                        await exec_ws_communication(websocket)
                except:
                    logger.debug(traceback.format_exc())
                    if show_connection_log:
                        logger.info(f"Connecting to `{WS_URL}`...")
                        show_connection_log = False
                    await asyncio.sleep(2)
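
    # Note: accumulator.accumulate() returns None until a complete LMC message has
    # been assembled from the streamed chunks (a start-flagged header, content
    # chunks, then an end flag). The Accumulator implementation is not part of this
    # diff; that contract is inferred from the usage above.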

    async def start_async(self):
        # Configuration for WebSocket
        WS_URL = f"ws://{self.server_url}"
        # Start the WebSocket communication
        asyncio.create_task(self.websocket_communication(WS_URL))

        # Start watching the kernel if it's your job to do that
        if os.getenv("CODE_RUNNER") == "client":
            # client is not running code!
            asyncio.create_task(put_kernel_messages_into_queue(send_queue))

        asyncio.create_task(self.play_audiosegments())

        # If Raspberry Pi, add the button listener, otherwise use the spacebar
        if current_platform.startswith("raspberry-pi"):
            logger.info("Raspberry Pi detected, using button on GPIO pin 15")
            # Use GPIO pin 15
            pindef = ["gpiochip4", "15"]  # gpiofind PIN15
            print("PINDEF", pindef)

            # HACK: needs passwordless sudo
            process = await asyncio.create_subprocess_exec(
                "sudo", "gpiomon", "-brf", *pindef, stdout=asyncio.subprocess.PIPE
            )
            while True:
                line = await process.stdout.readline()
                if line:
                    line = line.decode().strip()
                    if "FALLING" in line:
                        self.toggle_recording(False)
                    elif "RISING" in line:
                        self.toggle_recording(True)
                else:
                    break
        else:
            # Keyboard listener for spacebar press/release
            listener = keyboard.Listener(
                on_press=self.on_press, on_release=self.on_release
            )
            listener.start()

    def start(self):
        if os.getenv("TEACH_MODE") != "True":
            asyncio.run(self.start_async())
            p.terminate()
@@ -1,482 +1,88 @@
import asyncio
import websockets
import pyaudio
from pynput import keyboard
import json

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RECORDING_RATE = 16000
PLAYBACK_RATE = 24000


class Device:
    def __init__(self):
        self.server_url = "0.0.0.0:10001"
        self.p = pyaudio.PyAudio()
        self.websocket = None
        self.recording = False
        self.input_stream = None
        self.output_stream = None

    async def connect_with_retry(self, max_retries=50, retry_delay=2):
        for attempt in range(max_retries):
            try:
                self.websocket = await websockets.connect(f"ws://{self.server_url}")
                print("Connected to server.")
                return
            except ConnectionRefusedError:
                print(f"Waiting for the server to be ready. Retrying in {retry_delay} seconds...")
                await asyncio.sleep(retry_delay)
        raise Exception("Failed to connect to the server after multiple attempts")

    async def send_audio(self):
        self.input_stream = self.p.open(format=FORMAT, channels=CHANNELS, rate=RECORDING_RATE, input=True, frames_per_buffer=CHUNK)
        while True:
            if self.recording:
                try:
                    # Send start flag
                    await self.websocket.send(json.dumps({"role": "user", "type": "audio", "format": "bytes.wav", "start": True}))
                    print("Sending audio start message")

                    while self.recording:
                        data = self.input_stream.read(CHUNK, exception_on_overflow=False)
                        await self.websocket.send(data)

                    # Send stop flag
                    await self.websocket.send(json.dumps({"role": "user", "type": "audio", "format": "bytes.wav", "end": True}))
                    print("Sending audio end message")
                except Exception as e:
                    print(f"Error in send_audio: {e}")
            await asyncio.sleep(0.01)

    async def receive_audio(self):
        self.output_stream = self.p.open(format=FORMAT, channels=CHANNELS, rate=PLAYBACK_RATE, output=True, frames_per_buffer=CHUNK)
        while True:
            try:
                data = await self.websocket.recv()
                if isinstance(data, bytes) and not self.recording:
                    self.output_stream.write(data)
            except Exception as e:
                print(f"Error in receive_audio: {e}")

    def on_press(self, key):
        if key == keyboard.Key.space and not self.recording:
            print("Space pressed, starting recording")
            self.recording = True

    def on_release(self, key):
        if key == keyboard.Key.space:
            print("Space released, stopping recording")
            self.recording = False
        elif key == keyboard.Key.esc:
            print("Esc pressed, stopping the program")
            return False

    async def main(self):
        await self.connect_with_retry()
        print("Hold spacebar to record. Press 'Esc' to quit.")
        listener = keyboard.Listener(on_press=self.on_press, on_release=self.on_release)
        listener.start()
        await asyncio.gather(self.send_audio(), self.receive_audio())

    def start(self):
        asyncio.run(self.main())


if __name__ == "__main__":
    device = Device()
    device.start()
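
The rewritten client interleaves JSON control frames and raw PCM frames on one
socket, so any receiver must branch on frame type. A minimal standalone sketch of
that dispatch, assuming a recent `websockets` release with single-argument handlers
(this is not the project's server, which follows below):

    import asyncio, json, websockets

    async def handler(websocket):
        async for frame in websocket:
            if isinstance(frame, bytes):
                pass  # raw PCM between the start/end flags: feed it to STT here
            else:
                msg = json.loads(frame)
                if msg.get("start"):
                    print("client started speaking")
                elif msg.get("end"):
                    print("client stopped speaking")

    async def main():
        async with websockets.serve(handler, "0.0.0.0", 10001):
            await asyncio.Future()  # run forever

    asyncio.run(main())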
@@ -0,0 +1,124 @@
import asyncio
import traceback
import json
from fastapi import FastAPI, WebSocket, Depends
from fastapi.responses import PlainTextResponse
from uvicorn import Config, Server
from .async_interpreter import AsyncInterpreter
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict, Any
import os
import importlib.util


os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],  # Allow all methods (GET, POST, etc.)
    allow_headers=["*"],  # Allow all headers
)


async def get_debug_flag():
    return app.state.debug


@app.get("/ping")
async def ping():
    return PlainTextResponse("pong")


@app.websocket("/")
async def websocket_endpoint(
    websocket: WebSocket, debug: bool = Depends(get_debug_flag)
):
    await websocket.accept()

    global global_interpreter
    interpreter = global_interpreter

    # Send the tts_service value to the client
    await websocket.send_text(
        json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts})
    )

    try:

        async def receive_input():
            while True:
                if websocket.client_state == "DISCONNECTED":
                    break

                data = await websocket.receive()

                await asyncio.sleep(0)

                if isinstance(data, bytes):
                    await interpreter.input(data)
                elif "bytes" in data:
                    await interpreter.input(data["bytes"])
                    # print("RECEIVED INPUT", data)
                elif "text" in data:
                    # print("RECEIVED INPUT", data)
                    await interpreter.input(data["text"])

        async def send_output():
            while True:
                output = await interpreter.output()

                await asyncio.sleep(0)

                if isinstance(output, bytes):
                    # print(f"Sending {len(output)} bytes of audio data.")
                    await websocket.send_bytes(output)

                elif isinstance(output, dict):
                    # print("sending text")
                    await websocket.send_text(json.dumps(output))

        await asyncio.gather(send_output(), receive_input())
    except Exception as e:
        print(f"WebSocket connection closed with exception: {e}")
        traceback.print_exc()
    finally:
        if not websocket.client_state == "DISCONNECTED":
            await websocket.close()


async def main(server_host, server_port, profile, debug):

    app.state.debug = debug

    # Load the profile module from the provided path
    spec = importlib.util.spec_from_file_location("profile", profile)
    profile_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(profile_module)

    # Get the interpreter from the profile
    interpreter = profile_module.interpreter

    if not hasattr(interpreter, 'tts'):
        print("Setting TTS provider to default: openai")
        interpreter.tts = "openai"

    # Make it async
    interpreter = AsyncInterpreter(interpreter, debug)

    global global_interpreter
    global_interpreter = interpreter

    print(f"Starting server on {server_host}:{server_port}")
    config = Config(app, host=server_host, port=server_port, lifespan="on")
    server = Server(config)
    await server.serve()


if __name__ == "__main__":
    asyncio.run(main())
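
For a quick smoke test of this endpoint, a sketch like the following should work
(the host/port are illustrative, and the payload mirrors the LMC shapes used in
this diff):

    import asyncio, json, websockets

    async def smoke_test():
        async with websockets.connect("ws://localhost:10001/") as ws:
            print("config:", await ws.recv())  # server sends the tts_service config first
            await ws.send(json.dumps({"role": "user", "type": "message", "start": True}))
            await ws.send(json.dumps({"role": "user", "type": "message", "content": "Hi!"}))
            await ws.send(json.dumps({"role": "user", "type": "message", "end": True}))
            while True:
                reply = await ws.recv()  # bytes = audio, str = JSON-encoded LMC message
                print(type(reply).__name__, reply if isinstance(reply, str) else len(reply))

    asyncio.run(smoke_test())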
@@ -1,124 +1,125 @@
import importlib
import traceback
import json
import os
from RealtimeTTS import TextToAudioStream, CoquiEngine, OpenAIEngine, ElevenlabsEngine
from RealtimeSTT import AudioToTextRecorder
import types
import time
import wave
import asyncio
from fastapi.responses import PlainTextResponse


def start_server(server_host, server_port, profile, debug):

    # Load the profile module from the provided path
    spec = importlib.util.spec_from_file_location("profile", profile)
    profile_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(profile_module)

    # Get the interpreter from the profile
    interpreter = profile_module.interpreter

    # STT
    interpreter.stt = AudioToTextRecorder(
        model="tiny.en", spinner=False, use_microphone=False
    )
    interpreter.stt.stop()  # It needs this for some reason

    # TTS
    if not hasattr(interpreter, 'tts'):
        print("Setting TTS provider to default: openai")
        interpreter.tts = "openai"
    if interpreter.tts == "coqui":
        engine = CoquiEngine()
    elif interpreter.tts == "openai":
        engine = OpenAIEngine(voice="onyx")
    elif interpreter.tts == "elevenlabs":
        engine = ElevenlabsEngine(api_key=os.environ["ELEVEN_LABS_API_KEY"])
        engine.set_voice("Michael")
    else:
        raise ValueError(f"Unsupported TTS engine: {interpreter.interpreter.tts}")
    interpreter.tts = TextToAudioStream(engine)

    # Misc Settings
    interpreter.verbose = debug
    interpreter.server.host = server_host
    interpreter.server.port = server_port

    interpreter.audio_chunks = []

    old_input = interpreter.input
    old_output = interpreter.output

    async def new_input(self, chunk):
        await asyncio.sleep(0)
        if isinstance(chunk, bytes):
            self.stt.feed_audio(chunk)
            self.audio_chunks.append(chunk)
        elif isinstance(chunk, dict):
            if "start" in chunk:
                self.stt.start()
                self.audio_chunks = []
                await old_input({"role": "user", "type": "message", "start": True})
            if "end" in chunk:
                self.stt.stop()
                content = self.stt.text()

                print("User: ", content)

                if False:
                    audio_bytes = bytearray(b"".join(self.audio_chunks))
                    with wave.open('audio.wav', 'wb') as wav_file:
                        wav_file.setnchannels(1)
                        wav_file.setsampwidth(2)  # Assuming 16-bit audio
                        wav_file.setframerate(16000)  # Assuming 16kHz sample rate
                        wav_file.writeframes(audio_bytes)
                    print(os.path.abspath('audio.wav'))

                await old_input({"role": "user", "type": "message", "content": content})
                await old_input({"role": "user", "type": "message", "end": True})

    async def new_output(self):
        while True:
            output = await old_output()
            # if output == {"role": "assistant", "type": "message", "start": True}:
            #     return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}

            if isinstance(output, bytes):
                return output

            await asyncio.sleep(0)

            delimiters = ".?!;,\n…)]}"

            if output["type"] == "message" and len(output.get("content", "")) > 0:
                self.tts.feed(output.get("content"))
                if not self.tts.is_playing() and any([c in delimiters for c in output.get("content")]):  # Start playing once the first delimiter is encountered.
                    self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=True, sentence_fragment_delimiters=delimiters)
                    return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}

            if output == {"role": "assistant", "type": "message", "end": True}:
                if not self.tts.is_playing():  # We put this here in case it never outputs a delimiter and never triggers play_async^
                    self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=True, sentence_fragment_delimiters=delimiters)
                    return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
                return {"role": "assistant", "type": "audio", "format": "bytes.wav", "end": True}

    def on_tts_chunk(self, chunk):
        self.output_queue.sync_q.put(chunk)

    # Wrap in voice interface
    interpreter.input = types.MethodType(new_input, interpreter)
    interpreter.output = types.MethodType(new_output, interpreter)
    interpreter.on_tts_chunk = types.MethodType(on_tts_chunk, interpreter)

    @interpreter.server.app.get("/ping")
    async def ping():
        return PlainTextResponse("pong")

    # Start server
    interpreter.server.run()
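
The wrapping at the end relies on types.MethodType to rebind free functions as
bound methods on one interpreter instance, which is why new_input/new_output take
self. A tiny standalone illustration of the pattern (class and names hypothetical):

    import types

    class Greeter:
        def greet(self):
            return "hello"

    g = Greeter()
    old_greet = g.greet  # keep the original bound method

    def loud_greet(self):
        return old_greet().upper() + "!"

    # Rebind the free function as a method on this one instance.
    g.greet = types.MethodType(loud_greet, g)
    print(g.greet())  # HELLO!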