Use official OI server. 3 second latency.

pull/266/merge
killian 6 months ago
parent fef311e5b3
commit 4640b4f1a0

1
.gitignore vendored

@ -7,6 +7,7 @@ __pycache__/
software/source/server/conversations/user.json
software/source/server/tts/local_service/*
software/source/server/stt/local_service/*
software/models/*
# C extensions
*.so

5400
software/poetry.lock generated

File diff suppressed because one or more lines are too long

@ -14,8 +14,6 @@ readme = "../README.md"
python = ">=3.9,<3.12"
pyaudio = "^0.2.14"
pynput = "^1.7.6"
fastapi = "^0.110.0"
uvicorn = "^0.27.1"
websockets = "^12.0"
python-dotenv = "^1.0.1"
ffmpeg-python = "^0.2.0"
@ -25,7 +23,6 @@ ngrok = "^1.0.0"
simpleaudio = "^1.0.4"
opencv-python = "^4.9.0.80"
psutil = "^5.9.8"
typer = "^0.9.0"
platformdirs = "^4.2.0"
rich = "^13.7.1"
pytimeparse = "^1.1.8"
@ -33,23 +30,25 @@ python-crontab = "^3.0.0"
inquirer = "^3.2.4"
pyqrcode = "^1.2.1"
realtimestt = "^0.1.16"
realtimetts = "^0.4.1"
realtimetts = { version = "^0.4.2", extras = ["all"] }
keyboard = "^0.13.5"
pyautogui = "^0.9.54"
ctranslate2 = "4.1.0"
py3-tts = "^3.5"
elevenlabs = "1.2.2"
#py3-tts = "^3.5"
#elevenlabs = "1.2.2"
groq = "^0.5.0"
open-interpreter = {git = "https://github.com/OpenInterpreter/open-interpreter.git", extras = ["os"]}
open-interpreter = {git = "https://github.com/OpenInterpreter/open-interpreter.git", branch = "development", extras = ["os", "server"]}
litellm = "*"
openai = "*"
pywebview = "*"
pyobjc = "*"
sentry-sdk = "^2.4.0"
plyer = "^2.1.0"
pywinctl = "^0.3"
certifi = "^2024.7.4"
pygame = "^2.6.0"
mpv = "^1.0.7"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

@ -0,0 +1,482 @@
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.
import subprocess
import os
import sys
import asyncio
import threading
import pyaudio
from pynput import keyboard
import json
import traceback
import websockets
import queue
from pydub import AudioSegment
from pydub.playback import play
import time
import wave
import tempfile
from datetime import datetime
import cv2
import base64
import platform
from interpreter import (
interpreter,
) # Just for code execution. Maybe we should let people do from interpreter.computer import run?
# In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
from ..server.utils.kernel import put_kernel_messages_into_queue
from ..server.utils.get_system_info import get_system_info
from ..server.utils.process_utils import kill_process_tree
from ..server.utils.logs import setup_logging
from ..server.utils.logs import logger
setup_logging()
os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"
from ..utils.accumulator import Accumulator
accumulator = Accumulator()
# Configuration for Audio Recording
CHUNK = 1024 # Record in chunks of 1024 samples
FORMAT = pyaudio.paInt16 # 16 bits per sample
CHANNELS = 1 # Mono
RATE = 16000 # Sample rate
RECORDING = False # Flag to control recording state
SPACEBAR_PRESSED = False # Flag to track spacebar press state
# Camera configuration
CAMERA_ENABLED = os.getenv("CAMERA_ENABLED", False)
if type(CAMERA_ENABLED) == str:
CAMERA_ENABLED = CAMERA_ENABLED.lower() == "true"
CAMERA_DEVICE_INDEX = int(os.getenv("CAMERA_DEVICE_INDEX", 0))
CAMERA_WARMUP_SECONDS = float(os.getenv("CAMERA_WARMUP_SECONDS", 0))
# Specify OS
current_platform = get_system_info()
def is_win11():
return sys.getwindowsversion().build >= 22000
def is_win10():
try:
return (
platform.system() == "Windows"
and "10" in platform.version()
and not is_win11()
)
except:
return False
# Initialize PyAudio
p = pyaudio.PyAudio()
send_queue = queue.Queue()
class Device:
def __init__(self):
self.pressed_keys = set()
self.captured_images = []
self.audiosegments = asyncio.Queue()
self.server_url = ""
self.ctrl_pressed = False
self.tts_service = ""
self.debug = False
self.playback_latency = None
def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
"""Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
image_path = None
cap = cv2.VideoCapture(camera_index)
ret, frame = cap.read() # Capture a single frame to initialize the camera
if CAMERA_WARMUP_SECONDS > 0:
# Allow camera to warm up, then snap a picture again
# This is a workaround for some cameras that don't return a properly exposed
# picture immediately when they are first turned on
time.sleep(CAMERA_WARMUP_SECONDS)
ret, frame = cap.read()
if ret:
temp_dir = tempfile.gettempdir()
image_path = os.path.join(
temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png"
)
self.captured_images.append(image_path)
cv2.imwrite(image_path, frame)
logger.info(f"Camera image captured to {image_path}")
logger.info(
f"You now have {len(self.captured_images)} images which will be sent along with your next audio message."
)
else:
logger.error(
f"Error: Couldn't capture an image from camera ({camera_index})"
)
cap.release()
return image_path
def encode_image_to_base64(self, image_path):
"""Encodes an image file to a base64 string."""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def add_image_to_send_queue(self, image_path):
"""Encodes an image and adds an LMC message to the send queue with the image data."""
base64_image = self.encode_image_to_base64(image_path)
image_message = {
"role": "user",
"type": "image",
"format": "base64.png",
"content": base64_image,
}
send_queue.put(image_message)
# Delete the image file from the file system after sending it
os.remove(image_path)
def queue_all_captured_images(self):
"""Queues all captured images to be sent."""
for image_path in self.captured_images:
self.add_image_to_send_queue(image_path)
self.captured_images.clear() # Clear the list after sending
async def play_audiosegments(self):
"""Plays them sequentially."""
if self.tts_service == "elevenlabs":
print("Ensure `mpv` in installed to use `elevenlabs`.\n\n(On macOSX, you can run `brew install mpv`.)")
mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"]
mpv_process = subprocess.Popen(
mpv_command,
stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
while True:
try:
audio = await self.audiosegments.get()
if self.debug and self.playback_latency and isinstance(audio, bytes):
elapsed_time = time.time() - self.playback_latency
print(f"Time from request to playback: {elapsed_time} seconds")
self.playback_latency = None
if self.tts_service == "elevenlabs":
mpv_process.stdin.write(audio) # type: ignore
mpv_process.stdin.flush() # type: ignore
else:
play(audio)
await asyncio.sleep(0.1)
except asyncio.exceptions.CancelledError:
# This happens once at the start?
pass
except:
logger.info(traceback.format_exc())
def record_audio(self):
if os.getenv("STT_RUNNER") == "server":
# STT will happen on the server. we're sending audio.
send_queue.put(
{"role": "user", "type": "audio", "format": "bytes.wav", "start": True}
)
elif os.getenv("STT_RUNNER") == "client":
# STT will happen here, on the client. we're sending text.
send_queue.put({"role": "user", "type": "message", "start": True})
else:
raise Exception("STT_RUNNER must be set to either 'client' or 'server'.")
"""Record audio from the microphone and add it to the queue."""
stream = p.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
)
print("Recording started...")
global RECORDING
# Create a temporary WAV file to store the audio data
temp_dir = tempfile.gettempdir()
wav_path = os.path.join(
temp_dir, f"audio_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav"
)
wav_file = wave.open(wav_path, "wb")
wav_file.setnchannels(CHANNELS)
wav_file.setsampwidth(p.get_sample_size(FORMAT))
wav_file.setframerate(RATE)
while RECORDING:
data = stream.read(CHUNK, exception_on_overflow=False)
wav_file.writeframes(data)
wav_file.close()
stream.stop_stream()
stream.close()
print("Recording stopped.")
if self.debug:
self.playback_latency = time.time()
duration = wav_file.getnframes() / RATE
if duration < 0.3:
# Just pressed it. Send stop message
if os.getenv("STT_RUNNER") == "client":
send_queue.put({"role": "user", "type": "message", "content": "stop"})
send_queue.put({"role": "user", "type": "message", "end": True})
else:
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"content": "",
}
)
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"end": True,
}
)
else:
self.queue_all_captured_images()
if os.getenv("STT_RUNNER") == "client":
# THIS DOES NOT WORK. We moved to this very cool stt_service, llm_service
# way of doing things. stt_wav is not a thing anymore. Needs work to work
# Run stt then send text
text = stt_wav(wav_path)
logger.debug(f"STT result: {text}")
send_queue.put({"role": "user", "type": "message", "content": text})
send_queue.put({"role": "user", "type": "message", "end": True})
else:
# Stream audio
with open(wav_path, "rb") as audio_file:
byte_data = audio_file.read(CHUNK)
while byte_data:
send_queue.put(byte_data)
byte_data = audio_file.read(CHUNK)
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"end": True,
}
)
if os.path.exists(wav_path):
os.remove(wav_path)
def toggle_recording(self, state):
"""Toggle the recording state."""
global RECORDING, SPACEBAR_PRESSED
if state and not SPACEBAR_PRESSED:
SPACEBAR_PRESSED = True
if not RECORDING:
RECORDING = True
threading.Thread(target=self.record_audio).start()
elif not state and SPACEBAR_PRESSED:
SPACEBAR_PRESSED = False
RECORDING = False
def on_press(self, key):
"""Detect spacebar press and Ctrl+C combination."""
self.pressed_keys.add(key) # Add the pressed key to the set
if keyboard.Key.space in self.pressed_keys:
self.toggle_recording(True)
elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
logger.info("Ctrl+C pressed. Exiting...")
kill_process_tree()
os._exit(0)
# Windows alternative to the above
if key == keyboard.Key.ctrl_l:
self.ctrl_pressed = True
try:
if key.vk == 67 and self.ctrl_pressed:
logger.info("Ctrl+C pressed. Exiting...")
kill_process_tree()
os._exit(0)
# For non-character keys
except:
pass
def on_release(self, key):
"""Detect spacebar release and 'c' key press for camera, and handle key release."""
self.pressed_keys.discard(
key
) # Remove the released key from the key press tracking set
if key == keyboard.Key.ctrl_l:
self.ctrl_pressed = False
if key == keyboard.Key.space:
self.toggle_recording(False)
elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"):
self.fetch_image_from_camera()
async def message_sender(self, websocket):
while True:
message = await asyncio.get_event_loop().run_in_executor(
None, send_queue.get
)
if isinstance(message, bytes):
await websocket.send(message)
else:
await websocket.send(json.dumps(message))
send_queue.task_done()
await asyncio.sleep(0.01)
async def websocket_communication(self, WS_URL):
show_connection_log = True
async def exec_ws_communication(websocket):
if CAMERA_ENABLED:
print(
"\nHold the spacebar to start recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit."
)
else:
print("\nHold the spacebar to start recording. Press CTRL-C to exit.")
asyncio.create_task(self.message_sender(websocket))
while True:
await asyncio.sleep(0.01)
chunk = await websocket.recv()
logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
# print("received chunk from server")
if type(chunk) == str:
chunk = json.loads(chunk)
if chunk.get("type") == "config":
self.tts_service = chunk.get("tts_service")
continue
if self.tts_service == "elevenlabs":
message = chunk
else:
message = accumulator.accumulate(chunk)
if message == None:
# Will be None until we have a full message ready
continue
# At this point, we have our message
if isinstance(message, bytes) or (
message["type"] == "audio" and message["format"].startswith("bytes")
):
# Convert bytes to audio file
if self.tts_service == "elevenlabs":
audio_bytes = message
audio = audio_bytes
else:
audio_bytes = message["content"]
# Create an AudioSegment instance with the raw data
audio = AudioSegment(
# raw audio data (bytes)
data=audio_bytes,
# signed 16-bit little-endian format
sample_width=2,
# 16,000 Hz frame rate
frame_rate=22050,
# mono sound
channels=1,
)
await self.audiosegments.put(audio)
# Run the code if that's the client's job
if os.getenv("CODE_RUNNER") == "client":
if message["type"] == "code" and "end" in message:
language = message["format"]
code = message["content"]
result = interpreter.computer.run(language, code)
send_queue.put(result)
if is_win10():
logger.info("Windows 10 detected")
# Workaround for Windows 10 not latching to the websocket server.
# See https://github.com/OpenInterpreter/01/issues/197
try:
ws = websockets.connect(WS_URL)
await exec_ws_communication(ws)
except Exception as e:
logger.error(f"Error while attempting to connect: {e}")
else:
while True:
try:
async with websockets.connect(WS_URL) as websocket:
await exec_ws_communication(websocket)
except:
logger.debug(traceback.format_exc())
if show_connection_log:
logger.info(f"Connecting to `{WS_URL}`...")
show_connection_log = False
await asyncio.sleep(2)
async def start_async(self):
# Configuration for WebSocket
WS_URL = f"ws://{self.server_url}"
# Start the WebSocket communication
asyncio.create_task(self.websocket_communication(WS_URL))
# Start watching the kernel if it's your job to do that
if os.getenv("CODE_RUNNER") == "client":
# client is not running code!
asyncio.create_task(put_kernel_messages_into_queue(send_queue))
asyncio.create_task(self.play_audiosegments())
# If Raspberry Pi, add the button listener, otherwise use the spacebar
if current_platform.startswith("raspberry-pi"):
logger.info("Raspberry Pi detected, using button on GPIO pin 15")
# Use GPIO pin 15
pindef = ["gpiochip4", "15"] # gpiofind PIN15
print("PINDEF", pindef)
# HACK: needs passwordless sudo
process = await asyncio.create_subprocess_exec(
"sudo", "gpiomon", "-brf", *pindef, stdout=asyncio.subprocess.PIPE
)
while True:
line = await process.stdout.readline()
if line:
line = line.decode().strip()
if "FALLING" in line:
self.toggle_recording(False)
elif "RISING" in line:
self.toggle_recording(True)
else:
break
else:
# Keyboard listener for spacebar press/release
listener = keyboard.Listener(
on_press=self.on_press, on_release=self.on_release
)
listener.start()
def start(self):
if os.getenv("TEACH_MODE") != "True":
asyncio.run(self.start_async())
p.terminate()

@ -1,482 +1,88 @@
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.
import subprocess
import os
import sys
import asyncio
import threading
import websockets
import pyaudio
from pynput import keyboard
import json
import traceback
import websockets
import queue
from pydub import AudioSegment
from pydub.playback import play
import time
import wave
import tempfile
from datetime import datetime
import cv2
import base64
import platform
from interpreter import (
interpreter,
) # Just for code execution. Maybe we should let people do from interpreter.computer import run?
# In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
from ..server.utils.kernel import put_kernel_messages_into_queue
from ..server.utils.get_system_info import get_system_info
from ..server.utils.process_utils import kill_process_tree
from ..server.utils.logs import setup_logging
from ..server.utils.logs import logger
setup_logging()
os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"
from ..utils.accumulator import Accumulator
accumulator = Accumulator()
# Configuration for Audio Recording
CHUNK = 1024 # Record in chunks of 1024 samples
FORMAT = pyaudio.paInt16 # 16 bits per sample
CHANNELS = 1 # Mono
RATE = 16000 # Sample rate
RECORDING = False # Flag to control recording state
SPACEBAR_PRESSED = False # Flag to track spacebar press state
# Camera configuration
CAMERA_ENABLED = os.getenv("CAMERA_ENABLED", False)
if type(CAMERA_ENABLED) == str:
CAMERA_ENABLED = CAMERA_ENABLED.lower() == "true"
CAMERA_DEVICE_INDEX = int(os.getenv("CAMERA_DEVICE_INDEX", 0))
CAMERA_WARMUP_SECONDS = float(os.getenv("CAMERA_WARMUP_SECONDS", 0))
# Specify OS
current_platform = get_system_info()
def is_win11():
return sys.getwindowsversion().build >= 22000
def is_win10():
try:
return (
platform.system() == "Windows"
and "10" in platform.version()
and not is_win11()
)
except:
return False
# Initialize PyAudio
p = pyaudio.PyAudio()
send_queue = queue.Queue()
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RECORDING_RATE = 16000
PLAYBACK_RATE = 24000
class Device:
def __init__(self):
self.pressed_keys = set()
self.captured_images = []
self.audiosegments = asyncio.Queue()
self.server_url = ""
self.ctrl_pressed = False
self.tts_service = ""
self.debug = False
self.playback_latency = None
def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
"""Captures an image from the specified camera device and saves it to a temporary file. Adds the image to the captured_images list."""
image_path = None
cap = cv2.VideoCapture(camera_index)
ret, frame = cap.read() # Capture a single frame to initialize the camera
if CAMERA_WARMUP_SECONDS > 0:
# Allow camera to warm up, then snap a picture again
# This is a workaround for some cameras that don't return a properly exposed
# picture immediately when they are first turned on
time.sleep(CAMERA_WARMUP_SECONDS)
ret, frame = cap.read()
if ret:
temp_dir = tempfile.gettempdir()
image_path = os.path.join(
temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png"
)
self.captured_images.append(image_path)
cv2.imwrite(image_path, frame)
logger.info(f"Camera image captured to {image_path}")
logger.info(
f"You now have {len(self.captured_images)} images which will be sent along with your next audio message."
)
else:
logger.error(
f"Error: Couldn't capture an image from camera ({camera_index})"
)
cap.release()
return image_path
def encode_image_to_base64(self, image_path):
"""Encodes an image file to a base64 string."""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def add_image_to_send_queue(self, image_path):
"""Encodes an image and adds an LMC message to the send queue with the image data."""
base64_image = self.encode_image_to_base64(image_path)
image_message = {
"role": "user",
"type": "image",
"format": "base64.png",
"content": base64_image,
}
send_queue.put(image_message)
# Delete the image file from the file system after sending it
os.remove(image_path)
def queue_all_captured_images(self):
"""Queues all captured images to be sent."""
for image_path in self.captured_images:
self.add_image_to_send_queue(image_path)
self.captured_images.clear() # Clear the list after sending
async def play_audiosegments(self):
"""Plays them sequentially."""
if self.tts_service == "elevenlabs":
print("Ensure `mpv` in installed to use `elevenlabs`.\n\n(On macOSX, you can run `brew install mpv`.)")
mpv_command = ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"]
mpv_process = subprocess.Popen(
mpv_command,
stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
while True:
self.server_url = "0.0.0.0:10001"
self.p = pyaudio.PyAudio()
self.websocket = None
self.recording = False
self.input_stream = None
self.output_stream = None
async def connect_with_retry(self, max_retries=50, retry_delay=2):
for attempt in range(max_retries):
try:
audio = await self.audiosegments.get()
if self.debug and self.playback_latency and isinstance(audio, bytes):
elapsed_time = time.time() - self.playback_latency
print(f"Time from request to playback: {elapsed_time} seconds")
self.playback_latency = None
if self.tts_service == "elevenlabs":
mpv_process.stdin.write(audio) # type: ignore
mpv_process.stdin.flush() # type: ignore
else:
play(audio)
await asyncio.sleep(0.1)
except asyncio.exceptions.CancelledError:
# This happens once at the start?
pass
except:
logger.info(traceback.format_exc())
def record_audio(self):
if os.getenv("STT_RUNNER") == "server":
# STT will happen on the server. we're sending audio.
send_queue.put(
{"role": "user", "type": "audio", "format": "bytes.wav", "start": True}
)
elif os.getenv("STT_RUNNER") == "client":
# STT will happen here, on the client. we're sending text.
send_queue.put({"role": "user", "type": "message", "start": True})
else:
raise Exception("STT_RUNNER must be set to either 'client' or 'server'.")
"""Record audio from the microphone and add it to the queue."""
stream = p.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK,
)
print("Recording started...")
global RECORDING
# Create a temporary WAV file to store the audio data
temp_dir = tempfile.gettempdir()
wav_path = os.path.join(
temp_dir, f"audio_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav"
)
wav_file = wave.open(wav_path, "wb")
wav_file.setnchannels(CHANNELS)
wav_file.setsampwidth(p.get_sample_size(FORMAT))
wav_file.setframerate(RATE)
while RECORDING:
data = stream.read(CHUNK, exception_on_overflow=False)
wav_file.writeframes(data)
wav_file.close()
stream.stop_stream()
stream.close()
print("Recording stopped.")
if self.debug:
self.playback_latency = time.time()
duration = wav_file.getnframes() / RATE
if duration < 0.3:
# Just pressed it. Send stop message
if os.getenv("STT_RUNNER") == "client":
send_queue.put({"role": "user", "type": "message", "content": "stop"})
send_queue.put({"role": "user", "type": "message", "end": True})
else:
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"content": "",
}
)
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"end": True,
}
)
else:
self.queue_all_captured_images()
if os.getenv("STT_RUNNER") == "client":
# THIS DOES NOT WORK. We moved to this very cool stt_service, llm_service
# way of doing things. stt_wav is not a thing anymore. Needs work to work
# Run stt then send text
text = stt_wav(wav_path)
logger.debug(f"STT result: {text}")
send_queue.put({"role": "user", "type": "message", "content": text})
send_queue.put({"role": "user", "type": "message", "end": True})
else:
# Stream audio
with open(wav_path, "rb") as audio_file:
byte_data = audio_file.read(CHUNK)
while byte_data:
send_queue.put(byte_data)
byte_data = audio_file.read(CHUNK)
send_queue.put(
{
"role": "user",
"type": "audio",
"format": "bytes.wav",
"end": True,
}
)
if os.path.exists(wav_path):
os.remove(wav_path)
def toggle_recording(self, state):
"""Toggle the recording state."""
global RECORDING, SPACEBAR_PRESSED
if state and not SPACEBAR_PRESSED:
SPACEBAR_PRESSED = True
if not RECORDING:
RECORDING = True
threading.Thread(target=self.record_audio).start()
elif not state and SPACEBAR_PRESSED:
SPACEBAR_PRESSED = False
RECORDING = False
def on_press(self, key):
"""Detect spacebar press and Ctrl+C combination."""
self.pressed_keys.add(key) # Add the pressed key to the set
if keyboard.Key.space in self.pressed_keys:
self.toggle_recording(True)
elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char("c")} <= self.pressed_keys:
logger.info("Ctrl+C pressed. Exiting...")
kill_process_tree()
os._exit(0)
# Windows alternative to the above
if key == keyboard.Key.ctrl_l:
self.ctrl_pressed = True
try:
if key.vk == 67 and self.ctrl_pressed:
logger.info("Ctrl+C pressed. Exiting...")
kill_process_tree()
os._exit(0)
# For non-character keys
except:
pass
def on_release(self, key):
"""Detect spacebar release and 'c' key press for camera, and handle key release."""
self.pressed_keys.discard(
key
) # Remove the released key from the key press tracking set
if key == keyboard.Key.ctrl_l:
self.ctrl_pressed = False
if key == keyboard.Key.space:
self.toggle_recording(False)
elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char("c"):
self.fetch_image_from_camera()
async def message_sender(self, websocket):
self.websocket = await websockets.connect(f"ws://{self.server_url}")
print("Connected to server.")
return
except ConnectionRefusedError:
print(f"Waiting for the server to be ready. Retrying in {retry_delay} seconds...")
await asyncio.sleep(retry_delay)
raise Exception("Failed to connect to the server after multiple attempts")
async def send_audio(self):
self.input_stream = self.p.open(format=FORMAT, channels=CHANNELS, rate=RECORDING_RATE, input=True, frames_per_buffer=CHUNK)
while True:
message = await asyncio.get_event_loop().run_in_executor(
None, send_queue.get
)
if isinstance(message, bytes):
await websocket.send(message)
else:
await websocket.send(json.dumps(message))
send_queue.task_done()
if self.recording:
try:
# Send start flag
await self.websocket.send(json.dumps({"role": "user", "type": "audio", "format": "bytes.wav", "start": True}))
print("Sending audio start message")
while self.recording:
data = self.input_stream.read(CHUNK, exception_on_overflow=False)
await self.websocket.send(data)
# Send stop flag
await self.websocket.send(json.dumps({"role": "user", "type": "audio", "format": "bytes.wav", "end": True}))
print("Sending audio end message")
except Exception as e:
print(f"Error in send_audio: {e}")
await asyncio.sleep(0.01)
async def websocket_communication(self, WS_URL):
show_connection_log = True
async def exec_ws_communication(websocket):
if CAMERA_ENABLED:
print(
"\nHold the spacebar to start recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit."
)
else:
print("\nHold the spacebar to start recording. Press CTRL-C to exit.")
asyncio.create_task(self.message_sender(websocket))
while True:
await asyncio.sleep(0.01)
chunk = await websocket.recv()
logger.debug(f"Got this message from the server: {type(chunk)} {chunk}")
# print("received chunk from server")
if type(chunk) == str:
chunk = json.loads(chunk)
if chunk.get("type") == "config":
self.tts_service = chunk.get("tts_service")
continue
if self.tts_service == "elevenlabs":
message = chunk
else:
message = accumulator.accumulate(chunk)
if message == None:
# Will be None until we have a full message ready
continue
# At this point, we have our message
if isinstance(message, bytes) or (
message["type"] == "audio" and message["format"].startswith("bytes")
):
# Convert bytes to audio file
if self.tts_service == "elevenlabs":
audio_bytes = message
audio = audio_bytes
else:
audio_bytes = message["content"]
# Create an AudioSegment instance with the raw data
audio = AudioSegment(
# raw audio data (bytes)
data=audio_bytes,
# signed 16-bit little-endian format
sample_width=2,
# 16,000 Hz frame rate
frame_rate=22050,
# mono sound
channels=1,
)
await self.audiosegments.put(audio)
# Run the code if that's the client's job
if os.getenv("CODE_RUNNER") == "client":
if message["type"] == "code" and "end" in message:
language = message["format"]
code = message["content"]
result = interpreter.computer.run(language, code)
send_queue.put(result)
if is_win10():
logger.info("Windows 10 detected")
# Workaround for Windows 10 not latching to the websocket server.
# See https://github.com/OpenInterpreter/01/issues/197
async def receive_audio(self):
self.output_stream = self.p.open(format=FORMAT, channels=CHANNELS, rate=PLAYBACK_RATE, output=True, frames_per_buffer=CHUNK)
while True:
try:
ws = websockets.connect(WS_URL)
await exec_ws_communication(ws)
data = await self.websocket.recv()
if isinstance(data, bytes) and not self.recording:
self.output_stream.write(data)
except Exception as e:
logger.error(f"Error while attempting to connect: {e}")
else:
while True:
try:
async with websockets.connect(WS_URL) as websocket:
await exec_ws_communication(websocket)
except:
logger.debug(traceback.format_exc())
if show_connection_log:
logger.info(f"Connecting to `{WS_URL}`...")
show_connection_log = False
await asyncio.sleep(2)
async def start_async(self):
# Configuration for WebSocket
WS_URL = f"ws://{self.server_url}"
# Start the WebSocket communication
asyncio.create_task(self.websocket_communication(WS_URL))
# Start watching the kernel if it's your job to do that
if os.getenv("CODE_RUNNER") == "client":
# client is not running code!
asyncio.create_task(put_kernel_messages_into_queue(send_queue))
asyncio.create_task(self.play_audiosegments())
print(f"Error in receive_audio: {e}")
# If Raspberry Pi, add the button listener, otherwise use the spacebar
if current_platform.startswith("raspberry-pi"):
logger.info("Raspberry Pi detected, using button on GPIO pin 15")
# Use GPIO pin 15
pindef = ["gpiochip4", "15"] # gpiofind PIN15
print("PINDEF", pindef)
def on_press(self, key):
if key == keyboard.Key.space and not self.recording:
print("Space pressed, starting recording")
self.recording = True
# HACK: needs passwordless sudo
process = await asyncio.create_subprocess_exec(
"sudo", "gpiomon", "-brf", *pindef, stdout=asyncio.subprocess.PIPE
)
while True:
line = await process.stdout.readline()
if line:
line = line.decode().strip()
if "FALLING" in line:
self.toggle_recording(False)
elif "RISING" in line:
self.toggle_recording(True)
else:
break
else:
# Keyboard listener for spacebar press/release
listener = keyboard.Listener(
on_press=self.on_press, on_release=self.on_release
)
listener.start()
def on_release(self, key):
if key == keyboard.Key.space:
print("Space released, stopping recording")
self.recording = False
elif key == keyboard.Key.esc:
print("Esc pressed, stopping the program")
return False
async def main(self):
await self.connect_with_retry()
print("Hold spacebar to record. Press 'Esc' to quit.")
listener = keyboard.Listener(on_press=self.on_press, on_release=self.on_release)
listener.start()
await asyncio.gather(self.send_audio(), self.receive_audio())
def start(self):
if os.getenv("TEACH_MODE") != "True":
asyncio.run(self.start_async())
p.terminate()
asyncio.run(self.main())
if __name__ == "__main__":
device = Device()
device.start()

@ -178,6 +178,7 @@ class AsyncInterpreter:
"""
self.interpreter.messages = self.active_chat_messages
self.stt.stop()
input_queue = []

@ -0,0 +1,124 @@
import asyncio
import traceback
import json
from fastapi import FastAPI, WebSocket, Depends
from fastapi.responses import PlainTextResponse
from uvicorn import Config, Server
from .async_interpreter import AsyncInterpreter
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict, Any
import os
import importlib.util
os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"], # Allow all methods (GET, POST, etc.)
allow_headers=["*"], # Allow all headers
)
async def get_debug_flag():
return app.state.debug
@app.get("/ping")
async def ping():
return PlainTextResponse("pong")
@app.websocket("/")
async def websocket_endpoint(
websocket: WebSocket, debug: bool = Depends(get_debug_flag)
):
await websocket.accept()
global global_interpreter
interpreter = global_interpreter
# Send the tts_service value to the client
await websocket.send_text(
json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts})
)
try:
async def receive_input():
while True:
if websocket.client_state == "DISCONNECTED":
break
data = await websocket.receive()
await asyncio.sleep(0)
if isinstance(data, bytes):
await interpreter.input(data)
elif "bytes" in data:
await interpreter.input(data["bytes"])
# print("RECEIVED INPUT", data)
elif "text" in data:
# print("RECEIVED INPUT", data)
await interpreter.input(data["text"])
async def send_output():
while True:
output = await interpreter.output()
await asyncio.sleep(0)
if isinstance(output, bytes):
# print(f"Sending {len(output)} bytes of audio data.")
await websocket.send_bytes(output)
elif isinstance(output, dict):
# print("sending text")
await websocket.send_text(json.dumps(output))
await asyncio.gather(send_output(), receive_input())
except Exception as e:
print(f"WebSocket connection closed with exception: {e}")
traceback.print_exc()
finally:
if not websocket.client_state == "DISCONNECTED":
await websocket.close()
async def main(server_host, server_port, profile, debug):
app.state.debug = debug
# Load the profile module from the provided path
spec = importlib.util.spec_from_file_location("profile", profile)
profile_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(profile_module)
# Get the interpreter from the profile
interpreter = profile_module.interpreter
if not hasattr(interpreter, 'tts'):
print("Setting TTS provider to default: openai")
interpreter.tts = "openai"
# Make it async
interpreter = AsyncInterpreter(interpreter, debug)
global global_interpreter
global_interpreter = interpreter
print(f"Starting server on {server_host}:{server_port}")
config = Config(app, host=server_host, port=server_port, lifespan="on")
server = Server(config)
await server.serve()
if __name__ == "__main__":
asyncio.run(main())

@ -1,100 +1,16 @@
import asyncio
import importlib
import traceback
import json
from fastapi import FastAPI, WebSocket, Depends
from fastapi.responses import PlainTextResponse
from uvicorn import Config, Server
from .async_interpreter import AsyncInterpreter
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict, Any
import os
import importlib.util
os.environ["STT_RUNNER"] = "server"
os.environ["TTS_RUNNER"] = "server"
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"], # Allow all methods (GET, POST, etc.)
allow_headers=["*"], # Allow all headers
)
async def get_debug_flag():
return app.state.debug
@app.get("/ping")
async def ping():
return PlainTextResponse("pong")
@app.websocket("/")
async def websocket_endpoint(
websocket: WebSocket, debug: bool = Depends(get_debug_flag)
):
await websocket.accept()
global global_interpreter
interpreter = global_interpreter
# Send the tts_service value to the client
await websocket.send_text(
json.dumps({"type": "config", "tts_service": interpreter.interpreter.tts})
)
try:
async def receive_input():
while True:
if websocket.client_state == "DISCONNECTED":
break
data = await websocket.receive()
await asyncio.sleep(0)
if isinstance(data, bytes):
await interpreter.input(data)
elif "bytes" in data:
await interpreter.input(data["bytes"])
# print("RECEIVED INPUT", data)
elif "text" in data:
# print("RECEIVED INPUT", data)
await interpreter.input(data["text"])
async def send_output():
while True:
output = await interpreter.output()
await asyncio.sleep(0)
if isinstance(output, bytes):
# print(f"Sending {len(output)} bytes of audio data.")
await websocket.send_bytes(output)
elif isinstance(output, dict):
# print("sending text")
await websocket.send_text(json.dumps(output))
await asyncio.gather(send_output(), receive_input())
except Exception as e:
print(f"WebSocket connection closed with exception: {e}")
traceback.print_exc()
finally:
if not websocket.client_state == "DISCONNECTED":
await websocket.close()
async def main(server_host, server_port, profile, debug):
from RealtimeTTS import TextToAudioStream, CoquiEngine, OpenAIEngine, ElevenlabsEngine
from RealtimeSTT import AudioToTextRecorder
import types
import time
import wave
import asyncio
from fastapi.responses import PlainTextResponse
app.state.debug = debug
def start_server(server_host, server_port, profile, debug):
# Load the profile module from the provided path
spec = importlib.util.spec_from_file_location("profile", profile)
@ -104,21 +20,106 @@ async def main(server_host, server_port, profile, debug):
# Get the interpreter from the profile
interpreter = profile_module.interpreter
# STT
interpreter.stt = AudioToTextRecorder(
model="tiny.en", spinner=False, use_microphone=False
)
interpreter.stt.stop() # It needs this for some reason
# TTS
if not hasattr(interpreter, 'tts'):
print("Setting TTS provider to default: openai")
interpreter.tts = "openai"
# Make it async
interpreter = AsyncInterpreter(interpreter, debug)
global global_interpreter
global_interpreter = interpreter
print(f"Starting server on {server_host}:{server_port}")
config = Config(app, host=server_host, port=server_port, lifespan="on")
server = Server(config)
await server.serve()
if __name__ == "__main__":
asyncio.run(main())
if interpreter.tts == "coqui":
engine = CoquiEngine()
elif interpreter.tts == "openai":
engine = OpenAIEngine(voice="onyx")
elif interpreter.tts == "elevenlabs":
engine = ElevenlabsEngine(api_key=os.environ["ELEVEN_LABS_API_KEY"])
engine.set_voice("Michael")
else:
raise ValueError(f"Unsupported TTS engine: {interpreter.interpreter.tts}")
interpreter.tts = TextToAudioStream(engine)
# Misc Settings
interpreter.verbose = debug
interpreter.server.host = server_host
interpreter.server.port = server_port
interpreter.audio_chunks = []
old_input = interpreter.input
old_output = interpreter.output
async def new_input(self, chunk):
await asyncio.sleep(0)
if isinstance(chunk, bytes):
self.stt.feed_audio(chunk)
self.audio_chunks.append(chunk)
elif isinstance(chunk, dict):
if "start" in chunk:
self.stt.start()
self.audio_chunks = []
await old_input({"role": "user", "type": "message", "start": True})
if "end" in chunk:
self.stt.stop()
content = self.stt.text()
print("User: ", content)
if False:
audio_bytes = bytearray(b"".join(self.audio_chunks))
with wave.open('audio.wav', 'wb') as wav_file:
wav_file.setnchannels(1)
wav_file.setsampwidth(2) # Assuming 16-bit audio
wav_file.setframerate(16000) # Assuming 16kHz sample rate
wav_file.writeframes(audio_bytes)
print(os.path.abspath('audio.wav'))
await old_input({"role": "user", "type": "message", "content": content})
await old_input({"role": "user", "type": "message", "end": True})
async def new_output(self):
while True:
output = await old_output()
# if output == {"role": "assistant", "type": "message", "start": True}:
# return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
if isinstance(output, bytes):
return output
await asyncio.sleep(0)
delimiters = ".?!;,\n…)]}"
if output["type"] == "message" and len(output.get("content", "")) > 0:
self.tts.feed(output.get("content"))
if not self.tts.is_playing() and any([c in delimiters for c in output.get("content")]): # Start playing once the first delimiter is encountered.
self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=True, sentence_fragment_delimiters=delimiters)
return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
if output == {"role": "assistant", "type": "message", "end": True}:
if not self.tts.is_playing(): # We put this here in case it never outputs a delimiter and never triggers play_async^
self.tts.play_async(on_audio_chunk=self.on_tts_chunk, muted=True, sentence_fragment_delimiters=delimiters)
return {"role": "assistant", "type": "audio", "format": "bytes.wav", "start": True}
return {"role": "assistant", "type": "audio", "format": "bytes.wav", "end": True}
def on_tts_chunk(self, chunk):
self.output_queue.sync_q.put(chunk)
# Wrap in voice interface
interpreter.input = types.MethodType(new_input, interpreter)
interpreter.output = types.MethodType(new_output, interpreter)
interpreter.on_tts_chunk = types.MethodType(on_tts_chunk, interpreter)
@interpreter.server.app.get("/ping")
async def ping():
return PlainTextResponse("pong")
# Start server
interpreter.server.run()

@ -1,4 +1,5 @@
from interpreter import interpreter
from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
# This is an Open Interpreter compatible profile.
# Visit https://01.openinterpreter.com/profile for all options.

@ -1,4 +1,5 @@
from interpreter import interpreter
from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
# This is an Open Interpreter compatible profile.
# Visit https://01.openinterpreter.com/profile for all options.
@ -7,14 +8,12 @@ from interpreter import interpreter
# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
interpreter.tts = "elevenlabs"
# 01 Language Model Config.
interpreter.llm_service = "litellm"
interpreter.llm.model = "groq/llama3-8b-8192"
interpreter.llm.model = "groq/llama3-70b-8192"
interpreter.llm.supports_vision = False
interpreter.llm.supports_functions = False
interpreter.llm.context_window = 2048
interpreter.llm.max_tokens = 4096
interpreter.llm.temperature = 0.8
interpreter.llm.context_window = 8000
interpreter.llm.max_tokens = 1000
interpreter.llm.temperature = 0
interpreter.computer.import_computer_api = False

@ -1,38 +1,71 @@
from interpreter import interpreter
from interpreter import AsyncInterpreter
interpreter = AsyncInterpreter()
# 01 supports OpenAI, ElevenLabs, and Coqui (Local) TTS providers
# {OpenAI: "openai", ElevenLabs: "elevenlabs", Coqui: "coqui"}
interpreter.tts = "coqui"
# Local setup
interpreter.local_setup()
interpreter.system_message = """You are an AI assistant that writes markdown code snippets to answer the user's request. You speak very concisely and quickly, you say nothing irrelevant to the user's request. For example:
User: Open the chrome app.
Assistant: On it.
Assistant: On it.
```python
import webbrowser
webbrowser.open('https://chrome.google.com')
```
User: The code you ran produced no output. Was this expected, or are we finished?
Assistant: No further action is required; the provided snippet opens Chrome.
User: How large are all the files on my desktop combined?
Assistant: I will sum up the file sizes of every file on your desktop.
```python
import os
import string
from pathlib import Path
# Get the user's home directory in a cross-platform way
home_dir = Path.home()
# Define the path to the desktop
desktop_dir = home_dir / 'Desktop'
# Initialize a variable to store the total size
total_size = 0
Now, your turn:"""
# Loop through all files on the desktop
for file in desktop_dir.iterdir():
# Add the file size to the total
total_size += file.stat().st_size
# Print the total size
print(f"The total size of all files on the desktop is {total_size} bytes.")
```
User: I executed that code. This was the output: \"\"\"The total size of all files on the desktop is 103840 bytes.\"\"\"\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.
Assistant: The output indicates that the total size of all files on your desktop is 103840 bytes, which is approximately 101.4 KB or 0.1 MB. We are finished.
NEVER use placeholders, NEVER say "path/to/desktop", NEVER say "path/to/file". Always specify exact paths, and use cross-platform ways of determining the desktop, documents, cwd, etc. folders.
Now, your turn:""".strip()
# Message templates
interpreter.code_output_template = '''I executed that code. This was the output: """{content}"""\n\nWhat does this output mean (I can't understand it, please help) / what code needs to be run next (if anything, or are we done)? I can't replace any placeholders.'''
interpreter.empty_code_output_template = "The code above was executed on my machine. It produced no text output. What's next (if anything, or are we done?)"
interpreter.code_output_sender = "user"
# LLM settings
interpreter.llm.model = "ollama/codestral"
interpreter.llm.supports_functions = False
interpreter.llm.execution_instructions = False
interpreter.llm.load()
# Computer settings
interpreter.computer.import_computer_api = False
# Misc settings
interpreter.auto_run = False
interpreter.auto_run = True
interpreter.offline = True
interpreter.max_output = 600
# Final message
interpreter.display_message(
f"> Model set to `{interpreter.llm.model}`\n\n**Open Interpreter** will require approval before running code.\n\nUse `interpreter -y` to bypass this.\n\nPress `CTRL-C` to exit.\n"
"> Local model set to `Codestral`, Local TTS set to `Coqui`.\n"
)

@ -5,7 +5,7 @@ import threading
import os
import importlib
from source.server.tunnel import create_tunnel
from source.server.async_server import main
from source.server.async_server import start_server
import subprocess
import signal
@ -134,17 +134,13 @@ def _run(
signal.signal(signal.SIGINT, handle_exit)
if server:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
server_thread = threading.Thread(
target=loop.run_until_complete,
target=start_server,
args=(
main(
server_host,
server_port,
profile,
debug,
),
server_host,
server_port,
profile,
debug,
),
)
server_thread.start()

Loading…
Cancel
Save