Use official OI server. 3 second latency.

pull/266/merge
killian 6 months ago
parent fef311e5b3
commit 4640b4f1a0

1
.gitignore vendored

@ -7,6 +7,7 @@ __pycache__/
software/source/server/conversations/user.json software/source/server/conversations/user.json
software/source/server/tts/local_service/* software/source/server/tts/local_service/*
software/source/server/stt/local_service/* software/source/server/stt/local_service/*
software/models/*
# C extensions # C extensions
*.so *.so

5400
software/poetry.lock generated

File diff suppressed because one or more lines are too long

@ -14,8 +14,6 @@ readme = "../README.md"
python = ">=3.9,<3.12" python = ">=3.9,<3.12"
pyaudio = "^0.2.14" pyaudio = "^0.2.14"
pynput = "^1.7.6" pynput = "^1.7.6"
fastapi = "^0.110.0"
uvicorn = "^0.27.1"
websockets = "^12.0" websockets = "^12.0"
python-dotenv = "^1.0.1" python-dotenv = "^1.0.1"
ffmpeg-python = "^0.2.0" ffmpeg-python = "^0.2.0"
@ -25,7 +23,6 @@ ngrok = "^1.0.0"
simpleaudio = "^1.0.4" simpleaudio = "^1.0.4"
opencv-python = "^4.9.0.80" opencv-python = "^4.9.0.80"
psutil = "^5.9.8" psutil = "^5.9.8"
typer = "^0.9.0"
platformdirs = "^4.2.0" platformdirs = "^4.2.0"
rich = "^13.7.1" rich = "^13.7.1"
pytimeparse = "^1.1.8" pytimeparse = "^1.1.8"
@ -33,23 +30,25 @@ python-crontab = "^3.0.0"
inquirer = "^3.2.4" inquirer = "^3.2.4"
pyqrcode = "^1.2.1" pyqrcode = "^1.2.1"
realtimestt = "^0.1.16" realtimestt = "^0.1.16"
realtimetts = "^0.4.1" realtimetts = { version = "^0.4.2", extras = ["all"] }
keyboard = "^0.13.5" keyboard = "^0.13.5"
pyautogui = "^0.9.54" pyautogui = "^0.9.54"
ctranslate2 = "4.1.0" ctranslate2 = "4.1.0"
py3-tts = "^3.5" #py3-tts = "^3.5"
elevenlabs = "1.2.2" #elevenlabs = "1.2.2"
groq = "^0.5.0" groq = "^0.5.0"
open-interpreter = {git = "https://github.com/OpenInterpreter/open-interpreter.git", extras = ["os"]} open-interpreter = {git = "https://github.com/OpenInterpreter/open-interpreter.git", branch = "development", extras = ["os", "server"]}
litellm = "*" litellm = "*"
openai = "*" openai = "*"
pywebview = "*" pywebview = "*"
pyobjc = "*" pyobjc = "*"
sentry-sdk = "^2.4.0" sentry-sdk = "^2.4.0"
plyer = "^2.1.0" plyer = "^2.1.0"
pywinctl = "^0.3" pywinctl = "^0.3"
certifi = "^2024.7.4" certifi = "^2024.7.4"
pygame = "^2.6.0"
mpv = "^1.0.7"
[build-system] [build-system]
requires = ["poetry-core"] requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"

@ -0,0 +1,482 @@
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.
import subprocess
import os
import sys
import asyncio
import threading
import pyaudio
from pynput import keyboard
import json
import traceback
import websockets
import queue
from pydub import AudioSegment
from pydub.playback import play
import time
import wave
import tempfile
from datetime import datetime
import cv2
import base64
import platform
from interpreter import (
interpreter,
) # Just for code execution. Maybe we should let people do from interpreter.computer import run?
# In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
from ..server.utils.kernel import put_kernel_messages_into_queue
from ..server.utils.get_system_info import get_system_info
from ..server.utils.process_utils import kill_process_tree
from ..server.utils.logs import setup_logging
from ..server.utils.logs import logger
setup_logging()
os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"
from ..utils.accumulator import Accumulator
accumulator = Accumulator()
# Configuration for Audio Recording
CHUNK = 1024 # Record in chunks of 1024 samples
FORMAT = pyaudio.paInt16 # 16 bits per sample
CHANNELS = 1 # Mono
RATE = 16000 # Sample rate
RECORDING = False # Flag to control recording state
SPACEBAR_PRESSED = False # Flag to track spacebar press state
# Camera configuration
CAMERA_ENABLED = os.getenv("CAMERA_ENABLED", False)
if type(CAMERA_ENABLED) == str:
CAMERA_ENABLED = CAMERA_ENABLED.lower() == "true"
CAMERA_DEVICE_INDEX = int(os.getenv("CAMERA_DEVICE_INDEX", 0))
CAMERA_WARMUP_SECONDS = float(os.getenv("CAMERA_WARMUP_SECONDS", 0))
# Specify OS
current_platform = get_system_info()
def is_win11():
return sys.getwindowsversion().build >= 22000
def is_win10():
try:
return (
platform.system() == "Windows"
and "10" in platform.version()
and not is_win11()
)
except:
return False
# Initialize PyAudio
p = pyaudio.PyAudio()
send_queue = queue.Queue()
class Device:
def __init__(self):
self.pressed_keys = set()
self.captured_images = []
self.audiosegments = asyncio.Queue()
self.server_url = ""
self.ctrl_pressed = False
self.tts_service = ""
self.debug = False
self.playback_latency = None
def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
"""Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
image_path = None
cap = cv2.VideoCapture(camera_index)
ret, frame = cap.read() # Capture a single frame to initialize the camera
if CAMERA_WARMUP_SECONDS > 0:
# Allow camera to warm up, then snap a picture again
# This is a workaround for some cameras that don't return a properly exposed
# picture immediately when they are first turned on
time.sleep(CAMERA_WARMUP_SECONDS)
ret, frame = cap.read()
if ret:
temp_dir = tempfile.gettempdir()
image_path = os.path.join(
temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png"
)
self.captured_images.append(image_path)
cv2.imwrite(image_path, frame)
logger.info(f"Camera image captured to {image_path}")
logger.info(
f"You now have {len(self.captured_images)} images which will be sent along with your next audio message."
)
else:
logger.error(
f"Error: Couldn't capture an image from camera ({camera_index})"
)
cap.release()
return image_path
def encode_image_to_base64(self, image_path):
"""Encodes an image file to a base64 string."""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def add_image_to_send_queue(self, image_path):
"""Encodes an image and adds an LMC message to the send queue with the image data."""
base64_image = self.encode_image_to_base64(image_path)
image_message = {
"role": "user",
"type": "image",
"format": "base64.png",
"content": base64_image,
}
send_queue.put(image_message)
# Delete the image file from the file system after sending it
os.remove(image_path)
def queue_all_captured_images(self):
"""Queues all captured images to be sent."""
for image_path in self.captured_images:
self.add_image_to_send_queue(image_path)
self.captured_images.clear() # Clear the list after sending
async def play_audiosegments(self):
"""Plays them sequentially."""
if self.tts_service == "elevenlabs":
print("Ensure `mpv` in installed to use `elevenlabs`.\n\n(On macOSX, you can run `brew install mpv`.)")
mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"]
mpv_process = subprocess.Popen(
mpv_command,
stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
while True:
try:
audio = await self.audiosegments.get()
if self.debug and self.playback_latency and isinstance(audio, bytes):
elapsed_time = time.time() - self.playback_latency
print(f"Time from request to playback: {elapsed_time} seconds")
self.playback_latency = None
if self.tts_service == "elevenlabs":
mpv_process.stdin.write(audio) # type: ignore
mpv_process.stdin.flush() # type: ignore
else:
play(audio)
await asyncio.sleep(0.1)
except asyncio.exceptions.CancelledError:
# This happens once at the start?
pass
except:
logger.info(traceback.format_exc())
def record_audio(self):
if os.getenv("STT_RUNNER") == "server":
# STT will happen on the server. we're sending audio.
send_queue.put(
{"role": "user", "type": "audio", "format": "bytes.wav", "start": True}
)
elif os.getenv("STT_RUNNER") == "client":
# STT will happen here, on the client. we're sending text.
send_queue.put({"role": "user", "type": "message", "start": True})
else:
raise Exception("STT_RUNNER must be set to either 'client' or 'server'.")
"""Record audio from the microphone and add it to the queue."""
stream = p.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
)
print("Recording started...")
global RECORDING
# Create a temporary WAV file to store the audio data
temp_dir = tempfile.gettempdir()
wav_path = os.path.join(
temp_dir, f"audio_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav"
)
wav_file = wave.open(wav_path, "wb")
wav_file.setnchannels(CHANNELS)
wav_file.setsampwidth(p.get_sample_size(FORMAT))
wav_file.setframerate(RATE)
while RECORDING:
data = stream.read(CHUNK, exception_on_overflow=False)
wav_file.writeframes(data)
wav_file.close()
stream.stop_stream()
stream.close()
print("Recording stopped.")
if self.debug:
self.playback_latency = time.time()
duration = wav_file.getnframes() / RATE
if duration < 0.3:
# Just pressed it. Send stop message
if os.getenv("STT_RUNNER") == "client":
send_queue.put({"role": "user", "type": "message", "content": "stop"})
send_queue.put({"role": "user", "type": "message", "end": True})
else:
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"content": "",
}
)
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"end": True,
}
)
else:
self.queue_all_captured_images()
if os.getenv("STT_RUNNER") == "client":
# THIS DOES NOT WORK. We moved to this very cool stt_service, llm_service
# way of doing things. stt_wav is not a thing anymore. Needs work to work
# Run stt then send text
text = stt_wav(wav_path)
logger.debug(f"STT result: {text}")
send_queue.put({"role": "user", "type": "message", "content": text})
send_queue.put({"role": "user", "type": "message", "end": True})
else:
# Stream audio
with open(wav_path, "rb") as audio_file:
byte_data = audio_file.read(CHUNK)
while byte_data:
send_queue.put(byte_data)
byte_data = audio_file.read(CHUNK)
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"end": True,
}
)
if os.path.exists(wav_path):
os.remove(wav_path)
def toggle_recording(self, state):
"""Toggle the recording state."""
global RECORDING, SPACEBAR_PRESSED
if state and not SPACEBAR_PRESSED:
SPACEBAR_PRESSED = True
if not RECORDING:
RECORDING = True
threading.Thread(target=self.record_audio).start()
elif not state and SPACEBAR_PRESSED:
SPACEBAR_PRESSED = False
RECORDING = False
def on_press(self, key):
"""Detect spacebar press and Ctrl+C combination."""
self.pressed_keys.add(key) # Add the pressed key to the set
if keyboard.Key.space in self.pressed_keys:
self.toggle_recording(True)
elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
logger.info("Ctrl+C pressed. Exiting...")
kill_process_tree()
os._exit(0)
# Windows alternative to the above
if key == keyboard.Key.ctrl_l:
self.ctrl_pressed = True
try:
if key.vk == 67 and self.ctrl_pressed:
logger.info("Ctrl+C pressed. Exiting...")
kill_process_tree()
os._exit(0)
# For non-character keys
except:
pass
def on_release(self, key):
"""Detect spacebar release and 'c' key press for camera, and handle key release."""
self.pressed_keys.discard(
key
) # Remove the released key from the key press tracking set
if key == keyboard.Key.ctrl_l:
self.ctrl_pressed = False
if key == keyboard.Key.space:
self.toggle_recording(False)
elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"):
self.fetch_image_from_camera()
async def message_sender(self, websocket):
while True:
message = await asyncio.get_event_loop().run_in_executor(
None, send_queue.get
)
if isinstance(message, bytes):
await websocket.send(message)
else:
await websocket.send(json.dumps(message))
send_queue.task_done()
await asyncio.sleep(0.01)
async def websocket_communication(self, WS_URL):
show_connection_log = True
async def exec_ws_communication(websocket):
if CAMERA_ENABLED:
print(
"\nHold the spacebar to start recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit."
)
else:
print("\nHold the spacebar to start recording. Press CTRL-C to exit.")
asyncio.create_task(self.message_sender(websocket))
while True:
await asyncio.sleep(0.01)
chunk = await websocket.recv()
logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
# print("received chunk from server")
if type(chunk) == str:
chunk = json.loads(chunk)
if chunk.get("type") == "config":
self.tts_service = chunk.get("tts_service")
continue
if self.tts_service == "elevenlabs":
message = chunk
else:
message = accumulator.accumulate(chunk)
if message == None:
# Will be None until we have a full message ready
continue
# At this point, we have our message
if isinstance(message, bytes) or (
message["type"] == "audio" and message["format"].startswith("bytes")
):
# Convert bytes to audio file
if self.tts_service == "elevenlabs":
audio_bytes = message
audio = audio_bytes
else:
audio_bytes = message["content"]
# Create an AudioSegment instance with the raw data
audio = AudioSegment(
# raw audio data (bytes)
data=audio_bytes,
# signed 16-bit little-endian format
sample_width=2,
# 16,000 Hz frame rate
frame_rate=22050,
# mono sound
channels=1,
)
await self.audiosegments.put(audio)
# Run the code if that's the client's job
if os.getenv("CODE_RUNNER") == "client":
if message["type"] == "code" and "end" in message:
language = message["format"]
code = message["content"]
result = interpreter.computer.run(language, code)
send_queue.put(result)
if is_win10():
logger.info("Windows 10 detected")
# Workaround for Windows 10 not latching to the websocket server.
# See https://github.com/OpenInterpreter/01/issues/197
try:
ws = websockets.connect(WS_URL)
await exec_ws_communication(ws)
except Exception as e:
logger.error(f"Error while attempting to connect: {e}")
else:
while True:
try:
async with websockets.connect(WS_URL) as websocket:
await exec_ws_communication(websocket)
except:
logger.debug(traceback.format_exc())
if show_connection_log:
logger.info(f"Connecting to `{WS_URL}`...")
show_connection_log = False
await asyncio.sleep(2)
async def start_async(self):
# Configuration for WebSocket
WS_URL = f"ws://{self.server_url}"
# Start the WebSocket communication
asyncio.create_task(self.websocket_communication(WS_URL))
# Start watching the kernel if it's your job to do that
if os.getenv("CODE_RUNNER") == "client":
# client is not running code!
asyncio.create_task(put_kernel_messages_into_queue(send_queue))
asyncio.create_task(self.play_audiosegments())
# If Raspberry Pi, add the button listener, otherwise use the spacebar
if current_platform.startswith("raspberry-pi"):
logger.info("Raspberry Pi detected, using button on GPIO pin 15")
# Use GPIO pin 15
pindef = ["gpiochip4", "15"] # gpiofind PIN15
print("PINDEF", pindef)
# HACK: needs passwordless sudo
process = await asyncio.create_subprocess_exec(
"sudo", "gpiomon", "-brf", *pindef, stdout=asyncio.subprocess.PIPE
)
while True:
line = await process.stdout.readline()
if line:
line = line.decode().strip()
if "FALLING" in line:
self.toggle_recording(False)
elif "RISING" in line:
self.toggle_recording(True)
else:
break
else:
# Keyboard listener for spacebar press/release
listener = keyboard.Listener(
on_press=self.on_press, on_release=self.on_release
)
listener.start()
def start(self):
if os.getenv("TEACH_MODE") != "True":
asyncio.run(self.start_async())
p.terminate()

@ -1,482 +1,88 @@
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.
import subprocess
import os
import sys
import asyncio import asyncio
import threading import websockets
import pyaudio import pyaudio
from pynput import keyboard from pynput import keyboard
import json import json
import traceback
import websockets
import queue
from pydub import AudioSegment
from pydub.playback import play
import time
import wave
import tempfile
from datetime import datetime
import cv2
import base64
import platform
from interpreter import (
interpreter,
) # Just for code execution. Maybe we should let people do from interpreter.computer import run?
# In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
from ..server.utils.kernel import put_kernel_messages_into_queue
from ..server.utils.get_system_info import get_system_info
from ..server.utils.process_utils import kill_process_tree
from ..server.utils.logs import setup_logging
from ..server.utils.logs import logger
setup_logging()
os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"
from ..utils.accumulator import Accumulator
accumulator = Accumulator()
# Configuration for Audio Recording
CHUNK = 1024 # Record in chunks of 1024 samples
FORMAT = pyaudio.paInt16 # 16 bits per sample
CHANNELS = 1 # Mono
RATE = 16000 # Sample rate
RECORDING = False # Flag to control recording state
SPACEBAR_PRESSED = False # Flag to track spacebar press state
# Camera configuration
CAMERA_ENABLED = os.getenv("CAMERA_ENABLED", False)
if type(CAMERA_ENABLED) == str:
CAMERA_ENABLED = CAMERA_ENABLED.lower() == "true"
CAMERA_DEVICE_INDEX = int(os.getenv("CAMERA_DEVICE_INDEX", 0))
CAMERA_WARMUP_SECONDS = float(os.getenv("CAMERA_WARMUP_SECONDS", 0))
# Specify OS
current_platform = get_system_info()
def is_win11():
return sys.getwindowsversion().build >= 22000
def is_win10():
try:
return (
platform.system() == "Windows"
and "10" in platform.version()
and not is_win11()
)
except:
return False
# Initialize PyAudio
p = pyaudio.PyAudio()
send_queue = queue.Queue()
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RECORDING_RATE = 16000
PLAYBACK_RATE = 24000
class Device: class Device:
def __init__(self): def __init__(self):
self.pressed_keys = set() self.server_url = "0.0.0.0:10001"
self.captured_images = [] self.p = pyaudio.PyAudio()
self.audiosegments = asyncio.Queue() self.websocket = None
self.server_url = "" self.recording = False
self.ctrl_pressed = False self.input_stream = None
self.tts_service = "" self.output_stream = None
self.debug = False
self.playback_latency = None async def connect_with_retry(self, max_retries=50, retry_delay=2):
for attempt in range(max_retries):
def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
"""Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
image_path = None
cap = cv2.VideoCapture(camera_index)
ret, frame = cap.read() # Capture a single frame to initialize the camera
if CAMERA_WARMUP_SECONDS > 0:
# Allow camera to warm up, then snap a picture again
# This is a workaround for some cameras that don't return a properly exposed
# picture immediately when they are first turned on
time.sleep(CAMERA_WARMUP_SECONDS)
ret, frame = cap.read()
if ret:
temp_dir = tempfile.gettempdir()
image_path = os.path.join(
temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png"
)
self.captured_images.append(image_path)
cv2.imwrite(image_path, frame)
logger.info(f"Camera image captured to {image_path}")
logger.info(
f"You now have {len(self.captured_images)} images which will be sent along with your next audio message."
)
else:
logger.error(
f"Error: Couldn't capture an image from camera ({camera_index})"
)
cap.release()
return image_path
def encode_image_to_base64(self, image_path):
"""Encodes an image file to a base64 string."""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def add_image_to_send_queue(self, image_path):
"""Encodes an image and adds an LMC message to the send queue with the image data."""
base64_image = self.encode_image_to_base64(image_path)
image_message = {
"role": "user",
"type": "image",
"format": "base64.png",
"content": base64_image,
}
send_queue.put(image_message)
# Delete the image file from the file system after sending it
os.remove(image_path)
def queue_all_captured_images(self):
"""Queues all captured images to be sent."""
for image_path in self.captured_images:
self.add_image_to_send_queue(image_path)
self.captured_images.clear() # Clear the list after sending
async def play_audiosegments(self):
"""Plays them sequentially."""
if self.tts_service == "elevenlabs":
print("Ensure `mpv` in installed to use `elevenlabs`.\n\n(On macOSX, you can run `brew install mpv`.)")
mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"]
mpv_process = subprocess.Popen(
mpv_command,
stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
while True:
try: try:
audio = await self.audiosegments.get() self.websocket = await websockets.connect(f"ws://{self.server_url}")
if self.debug and self.playback_latency and isinstance(audio, bytes): print("Connected to server.")
elapsed_time = time.time() - self.playback_latency return
print(f"Time from request to playback: {elapsed_time} seconds") except ConnectionRefusedError:
self.playback_latency = None print(f"Waiting for the server to be ready. Retrying in {retry_delay} seconds...")
await asyncio.sleep(retry_delay)
if self.tts_service == "elevenlabs": raise Exception("Failed to connect to the server after multiple attempts")
mpv_process.stdin.write(audio) # type: ignore
mpv_process.stdin.flush() # type: ignore async def send_audio(self):
else: self.input_stream = self.p.open(format=FORMAT, channels=CHANNELS, rate=RECORDING_RATE, input=True, frames_per_buffer=CHUNK)
play(audio) while True:
if self.recording:
await asyncio.sleep(0.1)
except asyncio.exceptions.CancelledError:
# This happens once at the start?
pass
except:
logger.info(traceback.format_exc())
def record_audio(self):
if os.getenv("STT_RUNNER") == "server":
# STT will happen on the server. we're sending audio.
send_queue.put(
{"role": "user", "type": "audio", "format": "bytes.wav", "start": True}
)
elif os.getenv("STT_RUNNER") == "client":
# STT will happen here, on the client. we're sending text.
send_queue.put({"role": "user", "type": "message", "start": True})
else:
raise Exception("STT_RUNNER must be set to either 'client' or 'server'.")
"""Record audio from the microphone and add it to the queue."""
stream = p.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
)
print("Recording started...")
global RECORDING
# Create a temporary WAV file to store the audio data
temp_dir = tempfile.gettempdir()
wav_path = os.path.join(
temp_dir, f"audio_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav"
)
wav_file = wave.open(wav_path, "wb")
wav_file.setnchannels(CHANNELS)
wav_file.setsampwidth(p.get_sample_size(FORMAT))
wav_file.setframerate(RATE)
while RECORDING:
data = stream.read(CHUNK, exception_on_overflow=False)
wav_file.writeframes(data)
wav_file.close()
stream.stop_stream()
stream.close()
print("Recording stopped.")
if self.debug:
self.playback_latency = time.time()
duration = wav_file.getnframes() / RATE
if duration < 0.3:
# Just pressed it. Send stop message
if os.getenv("STT_RUNNER") == "client":
send_queue.put({"role": "user", "type": "message", "content": "stop"})
send_queue.put({"role": "user", "type": "message", "end": True})
else:
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"content": "",
}
)
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"end": True,
}
)
else:
self.queue_all_captured_images()
if os.getenv("STT_RUNNER") == "client":
# THIS DOES NOT WORK. We moved to this very cool stt_service, llm_service
# way of doing things. stt_wav is not a thing anymore. Needs work to work
# Run stt then send text
text = stt_wav(wav_path)
logger.debug(f"STT result: {text}")
send_queue.put({"role": "user", "type": "message", "content": text})
send_queue.put({"role": "user", "type": "message", "end": True})
else:
# Stream audio
with open(wav_path, "rb") as audio_file:
byte_data = audio_file.read(CHUNK)
while byte_data:
send_queue.put(byte_data)
byte_data = audio_file.read(CHUNK)
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"end": True,
}
)
if os.path.exists(wav_path):
os.remove(wav_path)
def toggle_recording(self, state):
"""Toggle the recording state."""
global RECORDING, SPACEBAR_PRESSED
if state and not SPACEBAR_PRESSED:
SPACEBAR_PRESSED = True
if not RECORDING:
RECORDING = True
threading.Thread(target=self.record_audio).start()
elif not state and SPACEBAR_PRESSED:
SPACEBAR_PRESSED = False
RECORDING = False
def on_press(self, key):
"""Detect spacebar press and Ctrl+C combination."""
self.pressed_keys.add(key) # Add the pressed key to the set
if keyboard.Key.space in self.pressed_keys:
self.toggle_recording(True)
elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
logger.info("Ctrl+C pressed. Exiting...")
kill_process_tree()
os._exit(0)
# Windows alternative to the above
if key == keyboard.Key.ctrl_l:
self.ctrl_pressed = True
try: try:
if key.vk == 67 and self.ctrl_pressed: # Send start flag
logger.info("Ctrl+C pressed. Exiting...") await self.websocket.send(json.dumps({"role": "user", "type": "audio", "format": "bytes.wav", "start": True}))
kill_process_tree() print("Sending audio start message")
os._exit(0)
# For non-character keys
except:
pass
def on_release(self, key): while self.recording:
"""Detect spacebar release and 'c' key press for camera, and handle key release.""" data = self.input_stream.read(CHUNK, exception_on_overflow=False)
self.pressed_keys.discard( await self.websocket.send(data)
key
) # Remove the released key from the key press tracking set
if key == keyboard.Key.ctrl_l: # Send stop flag
self.ctrl_pressed = False await self.websocket.send(json.dumps({"role": "user", "type": "audio", "format": "bytes.wav", "end": True}))
if key == keyboard.Key.space: print("Sending audio end message")
self.toggle_recording(False) except Exception as e:
elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"): print(f"Error in send_audio: {e}")
self.fetch_image_from_camera()
async def message_sender(self, websocket):
while True:
message = await asyncio.get_event_loop().run_in_executor(
None, send_queue.get
)
if isinstance(message, bytes):
await websocket.send(message)
else:
await websocket.send(json.dumps(message))
send_queue.task_done()
await asyncio.sleep(0.01) await asyncio.sleep(0.01)
async def websocket_communication(self, WS_URL): async def receive_audio(self):
show_connection_log = True self.output_stream = self.p.open(format=FORMAT, channels=CHANNELS, rate=PLAYBACK_RATE, output=True, frames_per_buffer=CHUNK)
async def exec_ws_communication(websocket):
if CAMERA_ENABLED:
print(
"\nHold the spacebar to start recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit."
)
else:
print("\nHold the spacebar to start recording. Press CTRL-C to exit.")
asyncio.create_task(self.message_sender(websocket))
while True: while True:
await asyncio.sleep(0.01)
chunk = await websocket.recv()
logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
# print("received chunk from server")
if type(chunk) == str:
chunk = json.loads(chunk)
if chunk.get("type") == "config":
self.tts_service = chunk.get("tts_service")
continue
if self.tts_service == "elevenlabs":
message = chunk
else:
message = accumulator.accumulate(chunk)
if message == None:
# Will be None until we have a full message ready
continue
# At this point, we have our message
if isinstance(message, bytes) or (
message["type"] == "audio" and message["format"].startswith("bytes")
):
# Convert bytes to audio file
if self.tts_service == "elevenlabs":
audio_bytes = message
audio = audio_bytes
else:
audio_bytes = message["content"]
# Create an AudioSegment instance with the raw data
audio = AudioSegment(
# raw audio data (bytes)
data=audio_bytes,
# signed 16-bit little-endian format
sample_width=2,
# 16,000 Hz frame rate
frame_rate=22050,
# mono sound
channels=1,
)
await self.audiosegments.put(audio)
# Run the code if that's the client's job
if os.getenv("CODE_RUNNER") == "client":
if message["type"] == "code" and "end" in message:
language = message["format"]
code = message["content"]
result = interpreter.computer.run(language, code)
send_queue.put(result)
if is_win10():
logger.info("Windows 10 detected")
# Workaround for Windows 10 not latching to the websocket server.
# See https://github.com/OpenInterpreter/01/issues/197
try: try:
ws = websockets.connect(WS_URL) data = await self.websocket.recv()
await exec_ws_communication(ws) if isinstance(data, bytes) and not self.recording:
self.output_stream.write(data)
except Exception as e: except Exception as e:
logger.error(f"Error while attempting to connect: {e}") print(f"Error in receive_audio: {e}")
else:
while True:
try:
async with websockets.connect(WS_URL) as websocket:
await exec_ws_communication(websocket)
except:
logger.debug(traceback.format_exc())
if show_connection_log:
logger.info(f"Connecting to `{WS_URL}`...")
show_connection_log = False
await asyncio.sleep(2)
async def start_async(self):
# Configuration for WebSocket
WS_URL = f"ws://{self.server_url}"
# Start the WebSocket communication
asyncio.create_task(self.websocket_communication(WS_URL))
# Start watching the kernel if it's your job to do that def on_press(self, key):
if os.getenv("CODE_RUNNER") == "client": if key == keyboard.Key.space and not self.recording:
# client is not running code! print("Space pressed, starting recording")
asyncio.create_task(put_kernel_messages_into_queue(send_queue)) self.recording = True
asyncio.create_task(self.play_audiosegments())
# If Raspberry Pi, add the button listener, otherwise use the spacebar def on_release(self, key):
if current_platform.startswith("raspberry-pi"): if key == keyboard.Key.space:
logger.info("Raspberry Pi detected, using button on GPIO pin 15") print("Space released, stopping recording")
# Use GPIO pin 15 self.recording = False
pindef = ["gpiochip4", "15"] # gpiofind PIN15 elif key == keyboard.Key.esc:
print("PINDEF", pindef) print("Esc pressed, stopping the program")
return False
# HACK: needs passwordless sudo async def main(self):
process = await asyncio.create_subprocess_exec( await self.connect_with_retry()
"sudo", "gpiomon", "-brf", *pindef, stdout=asyncio.subprocess.PIPE print("Hold spacebar to record. Press 'Esc' to quit.")
) listener = keyboard.Listener(on_press=self.on_press, on_release=self.on_release)
while True:
line = await process.stdout.readline()
if line:
line = line.decode().strip()
if "FALLING" in line:
self.toggle_recording(False)
elif "RISING" in line:
self.toggle_recording(True)
else:
break
else:
# Keyboard listener for spacebar press/release
listener = keyboard.Listener(
on_press=self.on_press, on_release=self.on_release
)
listener.start() listener.start()
await asyncio.gather(self.send_audio(), self.receive_audio())
def start(self): def start(self):
if os.getenv("TEACH_MODE") != "True": asyncio.run(self.main())
asyncio.run(self.start_async())
p.terminate() if __name__ == "__main__":
device = Device()
device.start()

@ -178,6 +178,7 @@ class AsyncInterpreter:
""" """
self.interpreter.messages = self.active_chat_messages self.interpreter.messages = self.active_chat_messages
self.stt.stop() self.stt.stop()
input_queue = [] input_queue = []

@ -0,0 +1,124 @@
import asyncio
import traceback
import json
from fastapi import FastAPI, WebSocket, Depends
from fastapi.responses import PlainTextResponse
from uvicorn import Config, Server
from .async_interpreter import AsyncInterpreter
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict, Any
import os
import importlib.util
os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"], # Allow all methods (GET, POST, etc.)
allow_headers=["*"], # Allow all headers
)
async def get_debug_flag():
return app.state.debug
@app.get("/ping")
async def ping():
return PlainTextResponse("pong")
@app.websocket("/")
async def websocket_endpoint(
websocket: WebSocket, debug: bool = Depends(get_debug_flag)
):
await websocket.accept()
global global_interpreter
interpreter = global_interpreter
# Send the tts_service value to the client
await websocket.send_text(
json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts})
)
try:
async def receive_input():
while True:
if websocket.client_state == "DISCONNECTED":
break
data = await websocket.receive()
await asyncio.sleep(0)
if isinstance(data, bytes):
await interpreter.input(data)
elif "bytes" in data:
await interpreter.input(data["bytes"])
# print("RECEIVED INPUT", data)
elif "text" in data:
# print("RECEIVED INPUT", data)
await interpreter.input(data["text"])
async def send_output():
while True:
output = await interpreter.output()
await asyncio.sleep(0)
if isinstance(output, bytes):
# print(f"Sending {len(output)} bytes of audio data.")
await websocket.send_bytes(output)
elif isinstance(output, dict):
# print("sending text")
await websocket.send_text(json.dumps(output))
await asyncio.gather(send_output(), receive_input())
except Exception as e:
print(f"WebSocket connection closed with exception: {e}")
traceback.print_exc()
finally:
if not websocket.client_state == "DISCONNECTED":
await websocket.close()
async def main(server_host, server_port, profile, debug):
app.state.debug = debug
# Load the profile module from the provided path
spec = importlib.util.spec_from_file_location("profile", profile)
profile_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(profile_module)
# Get the interpreter from the profile
interpreter = profile_module.interpreter
if not hasattr(interpreter, 'tts'):
print("Setting TTS provider to default: openai")
interpreter.tts = "openai"
# Make it async
interpreter = AsyncInterpreter(interpreter, debug)
global global_interpreter
global_interpreter = interpreter
print(f"Starting server on {server_host}:{server_port}")
config = Config(app, host=server_host, port=server_port, lifespan="on")
server = Server(config)
await server.serve()
if __name__ == "__main__":
asyncio.run(main())

@ -1,124 +1,125 @@
import asyncio import importlib
import traceback import traceback
import json import json
from fastapi import FastAPI, WebSocket, Depends
from fastapi.responses import PlainTextResponse
from uvicorn import Config, Server
from .async_interpreter import AsyncInterpreter
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict, Any
import os import os
import importlib.util from RealtimeTTS import TextToAudioStream, CoquiEngine, OpenAIEngine, ElevenlabsEngine
from RealtimeSTT import AudioToTextRecorder
import types
import time
os.environ["STT_RUNNER"] = "server" import wave
os.environ["TTS_RUNNER"] = "server" import asyncio
from fastapi.responses import PlainTextResponse
app = FastAPI()
app.add_middleware( def start_server(server_host, server_port, profile, debug):
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"], # Allow all methods (GET, POST, etc.)
allow_headers=["*"], # Allow all headers
)
# Load the profile module from the provided path
spec = importlib.util.spec_from_file_location("profile", profile)
profile_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(profile_module)
async def get_debug_flag(): # Get the interpreter from the profile
return app.state.debug interpreter = profile_module.interpreter
# STT
interpreter.stt = AudioToTextRecorder(
model="tiny.en", spinner=False, use_microphone=False
)
interpreter.stt.stop() # It needs this for some reason
@app.get("/ping")
async def ping():
return PlainTextResponse("pong")
# TTS
if not hasattr(interpreter, 'tts'):
print("Setting TTS provider to default: openai")
interpreter.tts = "openai"
if interpreter.tts == "coqui":
engine = CoquiEngine()
elif interpreter.tts == "openai":
engine = OpenAIEngine(voice="onyx")
elif interpreter.tts == "elevenlabs":
engine = ElevenlabsEngine(api_key=os.environ["ELEVEN_LABS_API_KEY"])
engine.set_voice("Michael")
else:
raise ValueError(f"Unsupported TTS engine: {interpreter.interpreter.tts}")
interpreter.tts = TextToAudioStream(engine)
@app.websocket("/") # Misc Settings
async def websocket_endpoint( interpreter.verbose = debug
websocket: WebSocket, debug: bool = Depends(get_debug_flag) interpreter.server.host = server_host
): interpreter.server.port = server_port
await websocket.accept()
global global_interpreter
interpreter = global_interpreter
# Send the tts_service value to the client interpreter.audio_chunks = []
await websocket.send_text(
json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts})
)
try:
async def receive_input(): old_input = interpreter.input
while True: old_output = interpreter.output
if websocket.client_state == "DISCONNECTED":
break
data = await websocket.receive()
async def new_input(self, chunk):
await asyncio.sleep(0) await asyncio.sleep(0)
if isinstance(chunk, bytes):
if isinstance(data, bytes): self.stt.feed_audio(chunk)
await interpreter.input(data) self.audio_chunks.append(chunk)
elif "bytes" in data: elif isinstance(chunk, dict):
await interpreter.input(data["bytes"]) if "start" in chunk:
# print("RECEIVED INPUT", data) self.stt.start()
elif "text" in data: self.audio_chunks = []
# print("RECEIVED INPUT", data) await old_input({"role": "user", "type": "message", "start": True})
await interpreter.input(data["text"]) if "end" in chunk:
self.stt.stop()
async def send_output(): content = self.stt.text()
print("User: ", content)
if False:
audio_bytes = bytearray(b"".join(self.audio_chunks))
with wave.open('audio.wav', 'wb') as wav_file:
wav_file.setnchannels(1)
wav_file.setsampwidth(2) # Assuming 16-bit audio
wav_file.setframerate(16000) # Assuming 16kHz sample rate
wav_file.writeframes(audio_bytes)
print(os.path.abspath('audio.wav'))
await old_input({"role": "user", "type": "message", "content": content})
await old_input({"role": "user", "type": "message", "end": True})
async def new_output(self):
while True: while True:
output = await interpreter.output() output = await old_output()
# if output == {"role": "assistant", "type": "message", "start": True}:
await asyncio.sleep(0) # return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
if isinstance(output, bytes): if isinstance(output, bytes):
# print(f"Sending {len(output)} bytes of audio data.") return output
await websocket.send_bytes(output)
elif isinstance(output, dict):
# print("sending text")
await websocket.send_text(json.dumps(output))
await asyncio.gather(send_output(), receive_input()) await asyncio.sleep(0)
except Exception as e:
print(f"WebSocket connection closed with exception: {e}")
traceback.print_exc()
finally:
if not websocket.client_state == "DISCONNECTED":
await websocket.close()
async def main(server_host, server_port, profile, debug):
app.state.debug = debug
# Load the profile module from the provided path
spec = importlib.util.spec_from_file_location("profile", profile)
profile_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(profile_module)
# Get the interpreter from the profile delimiters = ".?!;,\n…)]}"
interpreter = profile_module.interpreter
if not hasattr(interpreter, 'tts'): if output["type"] == "message" and len(output.get("content", "")) > 0:
print("Setting TTS provider to default: openai") self.tts.feed(output.get("content"))
interpreter.tts = "openai" if not self.tts.is_playing() and any([c in delimiters for c in output.get("content")]): # Start playing once the first delimiter is encountered.
self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=True, sentence_fragment_delimiters=delimiters)
return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
# Make it async if output == {"role": "assistant", "type": "message", "end": True}:
interpreter = AsyncInterpreter(interpreter, debug) if not self.tts.is_playing(): # We put this here in case it never outputs a delimiter and never triggers play_async^
self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=True, sentence_fragment_delimiters=delimiters)
return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
return {"role": "assistant", "type": "audio", "format": "bytes.wav", "end": True}
global global_interpreter def on_tts_chunk(self, chunk):
global_interpreter = interpreter self.output_queue.sync_q.put(chunk)
print(f"Starting server on {server_host}:{server_port}") # Wrap in voice interface
config = Config(app, host=server_host, port=server_port, lifespan="on") interpreter.input = types.MethodType(new_input, interpreter)
server = Server(config) interpreter.output = types.MethodType(new_output, interpreter)
await server.serve() interpreter.on_tts_chunk = types.MethodType(on_tts_chunk, interpreter)
@interpreter.server.app.get("/ping")
async def ping():
return PlainTextResponse("pong")
if __name__ == "__main__": # Start server
asyncio.run(main()) interpreter.server.run()

@ -1,4 +1,5 @@
from interpreter import interpreter from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
# This is an Open Interpreter compatible profile. # This is an Open Interpreter compatible profile.
# Visit https://01.openinterpreter.com/profile for all options. # Visit https://01.openinterpreter.com/profile for all options.

@ -1,4 +1,5 @@
from interpreter import interpreter from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
# This is an Open Interpreter compatible profile. # This is an Open Interpreter compatible profile.
# Visit https://01.openinterpreter.com/profile for all options. # Visit https://01.openinterpreter.com/profile for all options.
@ -7,14 +8,12 @@ from interpreter import interpreter
# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"} # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
interpreter.tts = "elevenlabs" interpreter.tts = "elevenlabs"
# 01 Language Model Config. interpreter.llm.model = "groq/llama3-70b-8192"
interpreter.llm_service = "litellm"
interpreter.llm.model = "groq/llama3-8b-8192"
interpreter.llm.supports_vision = False interpreter.llm.supports_vision = False
interpreter.llm.supports_functions = False interpreter.llm.supports_functions = False
interpreter.llm.context_window = 2048 interpreter.llm.context_window = 8000
interpreter.llm.max_tokens = 4096 interpreter.llm.max_tokens = 1000
interpreter.llm.temperature = 0.8 interpreter.llm.temperature = 0
interpreter.computer.import_computer_api = False interpreter.computer.import_computer_api = False

@ -1,12 +1,10 @@
from interpreter import interpreter from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers # 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"} # {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
interpreter.tts = "coqui" interpreter.tts = "coqui"
# Local setup
interpreter.local_setup()
interpreter.system_message = """You are an AI assistant that writes markdown code snippets to answer the user's request. You speak very concisely and quickly, you say nothing irrelevant to the user's request. For example: interpreter.system_message = """You are an AI assistant that writes markdown code snippets to answer the user's request. You speak very concisely and quickly, you say nothing irrelevant to the user's request. For example:
User: Open the chrome app. User: Open the chrome app.
@ -17,22 +15,57 @@ webbrowser.open('https://chrome.google.com')
``` ```
User: The code you ran produced no output. Was this expected, or are we finished? User: The code you ran produced no output. Was this expected, or are we finished?
Assistant: No further action is required; the provided snippet opens Chrome. Assistant: No further action is required; the provided snippet opens Chrome.
User: How large are all the files on my desktop combined?
Assistant: I will sum up the file sizes of every file on your desktop.
```python
import os
import string
from pathlib import Path
# Get the user's home directory in a cross-platform way
home_dir = Path.home()
# Define the path to the desktop
desktop_dir = home_dir / 'Desktop'
# Initialize a variable to store the total size
total_size = 0
Now, your turn:""" # Loop through all files on the desktop
for file in desktop_dir.iterdir():
# Add the file size to the total
total_size += file.stat().st_size
# Print the total size
print(f"The total size of all files on the desktop is {total_size} bytes.")
```
User: I executed that code. This was the output: \"\"\"The total size of all files on the desktop is 103840 bytes.\"\"\"\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.
Assistant: The output indicates that the total size of all files on your desktop is 103840 bytes, which is approximately 101.4 KB or 0.1 MB. We are finished.
NEVER use placeholders, NEVER say "path/to/desktop", NEVER say "path/to/file". Always specify exact paths, and use cross-platform ways of determining the desktop, documents, cwd, etc. folders.
Now, your turn:""".strip()
# Message templates # Message templates
interpreter.code_output_template = '''I executed that code. This was the output: """{content}"""\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.''' interpreter.code_output_template = '''I executed that code. This was the output: """{content}"""\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.'''
interpreter.empty_code_output_template = "The code above was executed on my machine. It produced no text output. What's next (if anything, or are we done?)" interpreter.empty_code_output_template = "The code above was executed on my machine. It produced no text output. What's next (if anything, or are we done?)"
interpreter.code_output_sender = "user" interpreter.code_output_sender = "user"
# LLM settings
interpreter.llm.model = "ollama/codestral"
interpreter.llm.supports_functions = False
interpreter.llm.execution_instructions = False
interpreter.llm.load()
# Computer settings # Computer settings
interpreter.computer.import_computer_api = False interpreter.computer.import_computer_api = False
# Misc settings # Misc settings
interpreter.auto_run = False interpreter.auto_run = True
interpreter.offline = True interpreter.offline = True
interpreter.max_output = 600
# Final message # Final message
interpreter.display_message( interpreter.display_message(
f"> Model set to `{interpreter.llm.model}`\n\n**Open Interpreter** will require approval before running code.\n\nUse `interpreter -y` to bypass this.\n\nPress `CTRL-C` to exit.\n" "> Local model set to `Codestral`, Local TTS set to `Coqui`.\n"
) )

@ -5,7 +5,7 @@ import threading
import os import os
import importlib import importlib
from source.server.tunnel import create_tunnel from source.server.tunnel import create_tunnel
from source.server.async_server import main from source.server.async_server import start_server
import subprocess import subprocess
import signal import signal
@ -134,18 +134,14 @@ def _run(
signal.signal(signal.SIGINT, handle_exit) signal.signal(signal.SIGINT, handle_exit)
if server: if server:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
server_thread = threading.Thread( server_thread = threading.Thread(
target=loop.run_until_complete, target=start_server,
args=( args=(
main(
server_host, server_host,
server_port, server_port,
profile, profile,
debug, debug,
), ),
),
) )
server_thread.start() server_thread.start()

Loading…
Cancel
Save