Merge branch 'main' into feature/replace-ngrok-with-open-source

# Conflicts:
#	01OS/.env.example
#	01OS/poetry.lock
#	01OS/pyproject.toml
pull/44/head
Tom Chapin 11 months ago
commit 70734561e3

@ -18,8 +18,8 @@ PIPER_VOICE_NAME="en_US-lessac-medium.onnx"
# If SERVER_START, this is where we'll serve the server.
# If CLIENT_START, this is where the client expects the server to be.
# SERVER_CONNECTION_URL=ws://localhost:8000/
SERVER_CONNECTION_URL=ws://localhost:8000/
# SERVER_URL=ws://localhost:8000/
SERVER_URL=ws://0.0.0.0:8000/
SERVER_START=True
CLIENT_START=True
@ -35,7 +35,7 @@ TTS_RUNNER=server # If client, audio will be sent over websocket.
STT_RUNNER=client # If server, audio will be sent over websocket.
# Image capture settings
CAMERA_ENABLED=True
CAMERA_ENABLED=False
# Camera device selection (Typically 0 for built-in, 1 for USB)
CAMERA_DEVICE_INDEX=0

@ -48,7 +48,7 @@ RECORDING = False # Flag to control recording state
SPACEBAR_PRESSED = False # Flag to track spacebar press state
# Camera configuration
CAMERA_ENABLED = bool(os.getenv('CAMERA_ENABLED', False))
# The camera is enabled only when the CAMERA_ENABLED environment variable is
# the text "true" (case-insensitive). The fallback must be a string: os.getenv
# returns the default unchanged, and calling .lower() on the bool False raises
# AttributeError whenever the variable is not set.
CAMERA_ENABLED = os.getenv('CAMERA_ENABLED', 'False').lower() == "true"
CAMERA_DEVICE_INDEX = int(os.getenv('CAMERA_DEVICE_INDEX', 0))
CAMERA_WARMUP_SECONDS = float(os.getenv('CAMERA_WARMUP_SECONDS', 0))
@ -269,9 +269,20 @@ class Device:
if message["type"] == "audio" and message["format"].startswith("bytes"):
# Convert bytes to audio file
# Format will be bytes.wav or bytes.opus
audio_bytes = io.BytesIO(message["content"])
audio = AudioSegment.from_file(audio_bytes, codec=message["format"].split(".")[1])
audio_bytes = message["content"]
# Create an AudioSegment instance with the raw data
audio = AudioSegment(
# raw audio data (bytes)
data=audio_bytes,
# signed 16-bit little-endian format
sample_width=2,
# 16,000 Hz frame rate
frame_rate=16000,
# mono sound
channels=1
)
self.audiosegments.append(audio)
@ -291,9 +302,9 @@ class Device:
async def start_async(self):
# Configuration for WebSocket
WS_URL = os.getenv('SERVER_CONNECTION_URL')
WS_URL = os.getenv('SERVER_URL')
if not WS_URL:
raise ValueError("The environment variable SERVER_CONNECTION_URL is not set. Please set it to proceed.")
raise ValueError("The environment variable SERVER_URL is not set. Please set it to proceed.")
# Start the WebSocket communication
asyncio.create_task(self.websocket_communication(WS_URL))

@ -0,0 +1,11 @@
# ESP32 Playback
To set up audio recording + playback on the ESP32 (M5 Atom), do the following:
1. Open Arduino IDE, and open the `playback/playback.ino` file
2. Go to Tools -> Board -> Boards Manager, search "esp32", then install the boards by Arduino and Espressif
3. Go to Tools -> Manage Libraries, then install the following:
- M5Atom by M5Stack
- WebSockets by Markus Sattler
4. The board needs to connect to WiFi. In `playback.ino`, set `COMPUTER_IP` (near line 15) to your computer's IP address, and put your WiFi network name and password in the `WiFi.begin(...)` call (near line 180). To find your computer's IP address on macOS, run `ipconfig getifaddr en0` in a terminal window.
5. To flash the .ino to the board, connect the board to the USB port, select the port from the dropdown on the IDE, then select the M5Atom board (or M5Stack-ATOM if you have that). Click on upload to flash the board.

@ -1,95 +0,0 @@
/* Press the button to record; release the button to play the recording back. */
#include <driver/i2s.h>
#include <M5Atom.h>
#define CONFIG_I2S_BCK_PIN 19
#define CONFIG_I2S_LRCK_PIN 33
#define CONFIG_I2S_DATA_PIN 22
#define CONFIG_I2S_DATA_IN_PIN 23
#define SPEAKER_I2S_NUMBER I2S_NUM_0
#define MODE_MIC 0
#define MODE_SPK 1
#define DATA_SIZE 1024
uint8_t microphonedata0[1024 * 70];
int data_offset = 0;
// Reconfigure the I2S peripheral SPEAKER_I2S_NUMBER for microphone capture
// (mode == MODE_MIC) or for speaker output (any other mode value).
// The existing driver is uninstalled first, so the function can be called
// repeatedly to switch direction. Both modes are set to 16000 Hz, 16 bits per
// sample, mono.
// NOTE(review): the ESP-IDF status codes are summed into `err`, but `err` is
// never checked or returned, so a failed install/pin/clock call goes unnoticed.
void InitI2SSpeakerOrMic(int mode) {
esp_err_t err = ESP_OK;
// Uninstall any existing driver before installing the new configuration.
i2s_driver_uninstall(SPEAKER_I2S_NUMBER);
// Settings shared by both modes; .mode is overwritten just below.
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER),
.sample_rate = 16000,
.bits_per_sample =
I2S_BITS_PER_SAMPLE_16BIT, // is fixed at 12bit, stereo, MSB
.channel_format = I2S_CHANNEL_FMT_ALL_RIGHT,
#if ESP_IDF_VERSION > ESP_IDF_VERSION_VAL(4, 1, 0)
.communication_format =
I2S_COMM_FORMAT_STAND_I2S, // Set the format of the communication.
#else // Set the communication format.
.communication_format = I2S_COMM_FORMAT_I2S,
#endif
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 6,
.dma_buf_len = 60,
};
if (mode == MODE_MIC) {
// Microphone: master + receive + PDM.
i2s_config.mode =
(i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX | I2S_MODE_PDM);
} else {
// Speaker: master + transmit, APLL off, TX descriptors auto-cleared.
i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX);
i2s_config.use_apll = false;
i2s_config.tx_desc_auto_clear = true;
}
err += i2s_driver_install(SPEAKER_I2S_NUMBER, &i2s_config, 0, NULL);
// The same pin assignment is applied in both modes.
i2s_pin_config_t tx_pin_config;
#if (ESP_IDF_VERSION > ESP_IDF_VERSION_VAL(4, 3, 0))
tx_pin_config.mck_io_num = I2S_PIN_NO_CHANGE;
#endif
tx_pin_config.bck_io_num = CONFIG_I2S_BCK_PIN;
tx_pin_config.ws_io_num = CONFIG_I2S_LRCK_PIN;
tx_pin_config.data_out_num = CONFIG_I2S_DATA_PIN;
tx_pin_config.data_in_num = CONFIG_I2S_DATA_IN_PIN;
// Serial.println("Init i2s_set_pin");
err += i2s_set_pin(SPEAKER_I2S_NUMBER, &tx_pin_config);
// Serial.println("Init i2s_set_clk");
err += i2s_set_clk(SPEAKER_I2S_NUMBER, 16000, I2S_BITS_PER_SAMPLE_16BIT,
I2S_CHANNEL_MONO);
}
void setup() {
M5.begin(true, false, true);
M5.dis.drawpix(0, CRGB(128, 128, 0));
delay(2000);
}
// Arduino main loop: while the button is held, record microphone audio into
// microphonedata0; when the button is released (or the buffer is full), switch
// the I2S peripheral to speaker mode and play the recording back.
void loop() {
    if (M5.Btn.isPressed()) {
        data_offset = 0;
        InitI2SSpeakerOrMic(MODE_MIC);
        M5.dis.drawpix(0, CRGB(128, 128, 0));
        while (1) {
            // Zero-initialised so a failed read contributes no bytes.
            size_t byte_read = 0;
            i2s_read(SPEAKER_I2S_NUMBER,
                     (char *)(microphonedata0 + data_offset), DATA_SIZE,
                     &byte_read, (100 / portTICK_RATE_MS));
            // Advance by what was actually captured. Adding a fixed 1024 after
            // a short or timed-out read left uninitialised bytes in the
            // recording that were later played back.
            data_offset += byte_read;
            M5.update();
            // Stop on release, or when another full DATA_SIZE read would no
            // longer fit in the buffer.
            if (M5.Btn.isReleased() ||
                data_offset + DATA_SIZE > (int)sizeof(microphonedata0)) break;
            // delay(60);
        }
        size_t bytes_written;
        InitI2SSpeakerOrMic(MODE_SPK);
        i2s_write(SPEAKER_I2S_NUMBER, microphonedata0, data_offset,
                  &bytes_written, portMAX_DELAY);
    }
    M5.update();
}

@ -0,0 +1,243 @@
/* Hold the button to record and stream microphone audio to the server over a WebSocket; audio received from the server is played on the speaker. */
#include <driver/i2s.h>
#include <M5Atom.h>
#include <Arduino.h>
#include <WiFi.h>
#include <WiFiMulti.h>
#include <WiFiClientSecure.h>
#include <WebSocketsClient.h>
// IP address of the computer running the WebSocket server (on macOS: ipconfig getifaddr en0)
#define COMPUTER_IP "192.168.68.63"
#define CONFIG_I2S_BCK_PIN 19
#define CONFIG_I2S_LRCK_PIN 33
#define CONFIG_I2S_DATA_PIN 22
#define CONFIG_I2S_DATA_IN_PIN 23
#define SPEAKER_I2S_NUMBER I2S_NUM_0
#define MODE_MIC 0
#define MODE_SPK 1
#define DATA_SIZE 1024
uint8_t microphonedata0[1024 * 10];
uint8_t speakerdata0[1024 * 1];
int speaker_offset = 0;
int data_offset = 0;
WebSocketsClient webSocket;
// Edge detector for the M5 button. Call loop() once per tick; afterwards
// justPressed() / justReleased() report whether the button state changed
// between the previous tick and this one.
class ButtonChecker {
  public:
    // Shift the current sample into the "previous" slot and sample again.
    void loop() {
        wasDown = isDown;
        isDown = (M5.Btn.isPressed() != 0);
    }

    // True only on the tick where the button went from up to down.
    bool justPressed() {
        if (wasDown) {
            return false;
        }
        return isDown;
    }

    // True only on the tick where the button went from down to up.
    bool justReleased() {
        if (isDown) {
            return false;
        }
        return wasDown;
    }

  private:
    bool wasDown = false;  // button state at the previous tick
    bool isDown = false;   // button state at the current tick
};
ButtonChecker button = ButtonChecker();
// Print `len` bytes starting at `mem` to Serial as hex, `cols` bytes per row.
// Each row is prefixed with the address of its first byte and its offset.
void hexdump(const void *mem, uint32_t len, uint8_t cols = 16) {
    const uint8_t* bytes = (const uint8_t*) mem;
    Serial.printf("\n[HEXDUMP] Address: 0x%08X len: 0x%X (%d)", (ptrdiff_t)bytes, len, len);

    for (uint32_t offset = 0; offset < len; ++offset) {
        const uint8_t* cursor = bytes + offset;
        // Start a new row every `cols` bytes.
        if (offset % cols == 0) {
            Serial.printf("\n[0x%08X] 0x%08X: ", (ptrdiff_t)cursor, offset);
        }
        Serial.printf("%02X ", *cursor);
    }

    Serial.printf("\n");
}
// Reconfigure the I2S peripheral SPEAKER_I2S_NUMBER for microphone capture
// (mode == MODE_MIC) or for speaker output (any other mode value).
// The existing driver is uninstalled first, so the function can be called
// repeatedly to switch direction. Both modes are set to 16000 Hz, 16 bits per
// sample, mono.
// NOTE(review): the ESP-IDF status codes are summed into `err`, but `err` is
// never checked or returned, so a failed install/pin/clock call goes unnoticed.
void InitI2SSpeakerOrMic(int mode) {
Serial.printf("InitI2sSpeakerOrMic %d\n", mode);
esp_err_t err = ESP_OK;
// Uninstall any existing driver before installing the new configuration.
i2s_driver_uninstall(SPEAKER_I2S_NUMBER);
// Settings shared by both modes; .mode is overwritten just below.
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER),
.sample_rate = 16000,
.bits_per_sample =
I2S_BITS_PER_SAMPLE_16BIT, // is fixed at 12bit, stereo, MSB
.channel_format = I2S_CHANNEL_FMT_ALL_RIGHT,
#if ESP_IDF_VERSION > ESP_IDF_VERSION_VAL(4, 1, 0)
.communication_format =
I2S_COMM_FORMAT_STAND_I2S, // Set the format of the communication.
#else // Set the communication format.
.communication_format = I2S_COMM_FORMAT_I2S,
#endif
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 6,
.dma_buf_len = 60,
};
if (mode == MODE_MIC) {
// Microphone: master + receive + PDM.
i2s_config.mode =
(i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX | I2S_MODE_PDM);
} else {
// Speaker: master + transmit, APLL off, TX descriptors auto-cleared.
i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX);
i2s_config.use_apll = false;
i2s_config.tx_desc_auto_clear = true;
}
err += i2s_driver_install(SPEAKER_I2S_NUMBER, &i2s_config, 0, NULL);
// The same pin assignment is applied in both modes.
i2s_pin_config_t tx_pin_config;
#if (ESP_IDF_VERSION > ESP_IDF_VERSION_VAL(4, 3, 0))
tx_pin_config.mck_io_num = I2S_PIN_NO_CHANGE;
#endif
tx_pin_config.bck_io_num = CONFIG_I2S_BCK_PIN;
tx_pin_config.ws_io_num = CONFIG_I2S_LRCK_PIN;
tx_pin_config.data_out_num = CONFIG_I2S_DATA_PIN;
tx_pin_config.data_in_num = CONFIG_I2S_DATA_IN_PIN;
// Serial.println("Init i2s_set_pin");
err += i2s_set_pin(SPEAKER_I2S_NUMBER, &tx_pin_config);
// Serial.println("Init i2s_set_clk");
err += i2s_set_clk(SPEAKER_I2S_NUMBER, 16000, I2S_BITS_PER_SAMPLE_16BIT,
I2S_CHANNEL_MONO);
}
// Switch the I2S peripheral to speaker mode and write `len` bytes from
// `payload` to it, blocking until the write call returns.
void speaker_play(uint8_t *payload, uint32_t len){
    Serial.printf("received %lu bytes", len);

    InitI2SSpeakerOrMic(MODE_SPK);

    size_t written;
    i2s_write(SPEAKER_I2S_NUMBER, payload, len, &written, portMAX_DELAY);
}
// WebSocket event callback (registered with webSocket.onEvent()).
//   type    - kind of event reported by the WebSockets library
//   payload - event data: the URL on connect, the frame contents for text/binary
//   length  - number of bytes in payload
// Text frames containing "audio" and "start" switch the I2S peripheral to
// speaker mode; binary frames are written to the speaker as they arrive.
// NOTE(review): binary payloads are assumed to already be in the format the
// I2S peripheral is configured for — confirm against the server.
void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) {
    switch (type) {
        case WStype_DISCONNECTED:
            Serial.printf("[WSc] Disconnected!\n");
            break;

        case WStype_CONNECTED:
            Serial.printf("[WSc] Connected to url: %s\n", payload);
            break;

        case WStype_TEXT: {
            Serial.printf("[WSc] get text: %s\n", payload);
            std::string str(payload, payload + length);
            // Plain substring matching on the JSON text; no JSON parser is used.
            bool isAudio = str.find("\"audio\"") != std::string::npos;
            if (isAudio && str.find("\"start\"") != std::string::npos) {
                Serial.println("start playback");
                speaker_offset = 0;
                InitI2SSpeakerOrMic(MODE_SPK);
            } else if (isAudio && str.find("\"end\"") != std::string::npos) {
                Serial.println("end playback");
            }
            break;
        }

        case WStype_BIN: {
            Serial.printf("[WSc] get binary length: %u\n", length);
            // Play directly from the received frame. The previous code first
            // memcpy'd the frame into speakerdata0 (1024 bytes) without a size
            // check, so any frame larger than 1 KiB overflowed that buffer.
            // speaker_offset was always 0 here, so the audio written is the same.
            size_t bytes_written = 0;
            i2s_write(SPEAKER_I2S_NUMBER, payload, length, &bytes_written, portMAX_DELAY);
            speaker_offset = 0;
            break;
        }

        case WStype_ERROR:
        case WStype_FRAGMENT_TEXT_START:
        case WStype_FRAGMENT_BIN_START:
        case WStype_FRAGMENT:
        case WStype_FRAGMENT_FIN:
            break;

        default:
            // Any other event type is ignored.
            break;
    }
}
// Open the serial port, block until WiFi is connected, then configure the
// WebSocket client to talk to COMPUTER_IP on port 8000 at path "/".
void websocket_setup() {
    Serial.begin(115200);

    WiFi.begin("Soundview_Guest", "");
    // Poll every 500 ms until the connection is up.
    for (;;) {
        if (WiFi.status() == WL_CONNECTED) {
            break;
        }
        delay(500);
        Serial.println("connecting to WiFi");
    }
    Serial.println("connected to WiFi");

    webSocket.begin(COMPUTER_IP, 8000, "/");
    webSocket.onEvent(webSocketEvent);
    // webSocket.setAuthorization("user", "Password");
    // Retry every 5000 ms if the connection attempt fails.
    webSocket.setReconnectInterval(5000);
}
void setup() {
M5.begin(true, false, true);
M5.dis.drawpix(0, CRGB(128, 128, 0));
websocket_setup();
InitI2SSpeakerOrMic(MODE_SPK);
delay(2000);
}
bool recording = false;
// Send whatever microphone data is buffered (the first data_offset bytes of
// microphonedata0) as one binary WebSocket frame and reset the buffer.
// Nothing is sent when the buffer is empty.
void flush_microphone() {
    Serial.printf("[microphone] flushing %d bytes of data\n", data_offset);

    if (data_offset != 0) {
        webSocket.sendBIN(microphonedata0, data_offset);
        data_offset = 0;
    }
}
// Arduino main loop. Tracks button edges and drives the recording state:
//   - on press:   announce the start of an audio message and switch I2S to the mic
//   - on release: send any audio still buffered, then announce the end
//   - while held: read one chunk from the mic and flush once the buffer is nearly full
// The button state and the WebSocket client are serviced on every pass.
void loop() {
    button.loop();

    if (button.justPressed()) {
        Serial.println("Recording...");
        webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"start\": true}");
        InitI2SSpeakerOrMic(MODE_MIC);
        recording = true;
        data_offset = 0;
        Serial.println("Recording ready.");
    } else if (button.justReleased()) {
        Serial.println("Stopped recording.");
        // Flush BEFORE sending the end marker. Previously the marker went out
        // first, so the final chunk of audio reached the server after it had
        // already been told that the message was complete.
        flush_microphone();
        webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"end\": true}");
        recording = false;
        data_offset = 0;
    } else if (recording) {
        Serial.printf("Reading chunk at %d...\n", data_offset);
        // Zero-initialised so a failed i2s_read cannot advance the offset by garbage.
        size_t bytes_read = 0;
        i2s_read(
            SPEAKER_I2S_NUMBER,
            (char *)(microphonedata0 + data_offset),
            DATA_SIZE, &bytes_read, (100 / portTICK_RATE_MS)
        );
        data_offset += bytes_read;
        Serial.printf("Read %d bytes in chunk.\n", (int)bytes_read);
        // Flush once more than 9 KiB of the 10 KiB buffer is used, so the next
        // DATA_SIZE read always fits.
        if (data_offset > 1024*9) {
            flush_microphone();
        }
    }

    M5.update();
    webSocket.loop();
}

@ -0,0 +1,47 @@
#!/usr/bin/env python
"""A basic echo server for testing the device."""
import asyncio
import uuid
import websockets
from websockets.server import serve
import traceback
def divide_chunks(l, n):
    """Yield consecutive slices of ``l``, each holding at most ``n`` items.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)
buffers: dict[uuid.UUID, bytearray] = {}
async def echo(websocket: websockets.WebSocketServerProtocol):
    """Handle one client connection of the echo server.

    Protocol, per connection:
      * text ``"s"``  - start a new, empty buffer for this connection
      * binary frame  - append the bytes to the connection's buffer
      * text ``"e"``  - send ``"s"``, the buffer in 1000-byte chunks, then ``"e"``

    An error while handling a single message is printed and the connection
    keeps being served. The connection's buffer is removed when the
    connection ends, so ``buffers`` does not grow with every client.
    """
    try:
        async for message in websocket:
            try:
                if message == "s":
                    print("starting stream for", websocket.id)
                    buffers[websocket.id] = bytearray()
                elif message == "e":
                    print("end, echoing stream for", websocket.id)
                    await websocket.send("s")
                    for chunk in divide_chunks(buffers[websocket.id], 1000):
                        await websocket.send(chunk)
                    await websocket.send("e")
                elif isinstance(message, bytes):
                    print("recvd", len(message), "bytes from", websocket.id)
                    buffers[websocket.id].extend(message)
                else:
                    print("ERR: recvd unknown message", message[:10], "from", websocket.id)
            except Exception:
                # Best-effort test server: report the failure and keep going.
                traceback.print_exc()
    finally:
        # Runs on clean close and on connection errors alike.
        buffers.pop(websocket.id, None)
async def main():
    """Serve ``echo`` on 0.0.0.0:9001 until the process is stopped."""
    server = serve(echo, "0.0.0.0", 9001)
    async with server:
        # A future that is never resolved keeps the server alive.
        never_done = asyncio.Future()
        await never_done
asyncio.run(main())

@ -6,71 +6,13 @@ import glob
import json
from pathlib import Path
from interpreter import OpenInterpreter
from .system_message import system_message
def configure_interpreter(interpreter: OpenInterpreter):
### SYSTEM MESSAGE
# The system message is where most of the 01's behavior is configured.
# You can put code into the system message {{ in brackets like this }} which will be rendered just before the interpreter starts writing a message.
system_message = """
You are an executive assistant AI that helps the user manage their tasks. You can run Python code.
Store the user's tasks in a Python list called `tasks`.
---
The user's current task is: {{ tasks[0] if tasks else "No current tasks." }}
{{
if len(tasks) > 1:
print("The next task is: ", tasks[1])
}}
---
When the user completes the current task, you should remove it from the list and read the next item by running `tasks = tasks[1:]\ntasks[0]`. Then, tell the user what the next task is.
When the user tells you about a set of tasks, you should intelligently order tasks, batch similar tasks, and break down large tasks into smaller tasks (for this, you should consult the user and get their permission to break it down). Your goal is to manage the task list as intelligently as possible, to make the user as efficient and non-overwhelmed as possible. They will require a lot of encouragement, support, and kindness. Don't say too much about what's ahead of them just try to focus them on each step at a time.
After starting a task, you should check in with the user around the estimated completion time to see if the task is completed. Use the `schedule(datetime, message)` function, which has already been imported.
To do this, schedule a reminder based on estimated completion time using the function `schedule(datetime_object, "Your message here.")`, WHICH HAS ALREADY BEEN IMPORTED. YOU DON'T NEED TO IMPORT THE `schedule` FUNCTION. IT IS AVALIABLE. You'll recieve the message at `datetime_object`.
You guide the user through the list one task at a time, convincing them to move forward, giving a pep talk if need be. Your job is essentially to answer "what should I (the user) be doing right now?" for every moment of the day.
Remember: You can run Python code. Be very concise. Ensure that you actually run code every time! THIS IS IMPORTANT. You NEED to write code. **Help the user by being very concise in your answers.** Do not break down tasks excessively, just into simple, few minute steps. Don't assume the user lives their life in a certain way— pick very general tasks if you're breaking a task down.
Use the following functions (assume they're imported) to complete your goals whenever possible:
{{
import sys
original_stdout = sys.stdout
sys.stdout = open(os.devnull, 'w')
original_stderr = sys.stderr
sys.stderr = open(os.devnull, 'w')
from interpreter import interpreter
from pathlib import Path
query = "all functions"
skills_path = Path().resolve() / 'skills'
paths_in_skills = [str(path) for path in skills_path.glob('**/*.py')]
skills = interpreter.computer.docs.search(query, paths=paths_in_skills)
lowercase_skills = [skill[0].lower() + skill[1:] for skill in skills]
output = "\\n".join(lowercase_skills)
sys.stdout = original_stdout
sys.stderr = original_stderr
print(output)
}}
""".strip()
interpreter.custom_instructions = system_message
### SYSTEM MESSAGE
interpreter.system_message = system_message
### LLM SETTINGS
@ -83,12 +25,12 @@ print(output)
# Hosted settings
interpreter.llm.api_key = os.getenv('OPENAI_API_KEY')
interpreter.llm.model = "gpt-4"
interpreter.auto_run = True
interpreter.force_task_completion = False
### MISC SETTINGS
interpreter.auto_run = True
interpreter.computer.languages = [l for l in interpreter.computer.languages if l.name.lower() in ["applescript", "shell", "zsh", "bash", "python"]]
interpreter.force_task_completion = False
interpreter.offline = True
interpreter.id = 206 # Used to identify itself to other interpreters. This should be changed programatically so it's unique.
@ -101,10 +43,15 @@ print(output)
### SKILLS
try:
interpreter.computer.skills.skills_dir = Path(__file__).parent / 'skills'
interpreter.computer.skills.path = Path(os.getenv('OI_SKILLS_PATH'))
interpreter.computer.skills.import_skills()
except:
print("Temporarily skipping skills (OI 0.2.1, which is unreleased) so we can push to `pip`.")
pass
interpreter.computer.run("python", "tasks=[]")
interpreter.computer.api_base = "https://oi-video-frame.vercel.app/"
interpreter.computer.run("python","print('test')")
return interpreter

@ -203,7 +203,19 @@ async def listener():
accumulated_text = ""
for chunk in interpreter.chat(messages, stream=True, display=False):
force_task_completion_message = """AUTOMATED MESSAGE: Proceed. You CAN run code on my machine. If you want to run code, start your message with "```"! If the entire task I asked for is done, say exactly 'The task is done.' If you need some specific information (like username or password) say EXACTLY 'Please provide more information.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going."""
interpreter.messages = [m for m in interpreter.messages if m["content"] != force_task_completion_message]
insert_force_task_completion_message = True
while insert_force_task_completion_message == True:
for chunk in interpreter.chat(messages, stream=True, display=True):
if chunk["type"] == "code":
insert_force_task_completion_message = False
if any([m["type"] == "image" for m in interpreter.messages]):
interpreter.llm.model = "gpt-4-vision-preview"
logger.debug("Got chunk:", chunk)
@ -214,7 +226,7 @@ async def listener():
if os.getenv('TTS_RUNNER') == "server":
# Speak full sentences out loud
if chunk["role"] == "assistant" and "content" in chunk:
if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
accumulated_text += chunk["content"]
sentences = split_into_sentences(accumulated_text)
@ -243,7 +255,7 @@ async def listener():
# Check if it's just an end flag. We ignore those.
temp_message = await from_user.get()
if temp_message == {'role': 'user', 'type': 'message', 'end': True}:
if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
# Yup. False alarm.
continue
else:
@ -253,8 +265,9 @@ async def listener():
with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)
logger.info("New user message recieved. Breaking.")
break
# TODO: is triggering seemingly randomly
#logger.info("New user message recieved. Breaking.")
#break
# Also check if there's any new computer messages
if not from_computer.empty():
@ -268,7 +281,43 @@ async def listener():
with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)
force_task_completion_responses = [
"the task is done.",
"the task is impossible.",
"let me know what you'd like to do next.",
"please provide more information.",
]
# Did the LLM respond with one of the key messages?
if (
interpreter.messages
and any(
task_status in interpreter.messages[-1].get("content", "").lower()
for task_status in force_task_completion_responses
)
):
insert_force_task_completion_message = False
break
if insert_force_task_completion_message:
interpreter.messages += [
{
"role": "user",
"type": "message",
"content": force_task_completion_message,
}
]
else:
break
async def stream_tts_to_device(sentence):
    """Convert ``sentence`` to speech and queue the chunks for the device.

    Sentences that are exactly one of the fixed status phrases below (ignoring
    case, surrounding whitespace and trailing ``.``/``!``/``?``) are not sent.
    """
    silent_phrases = [
        "the task is done",
        "the task is impossible",
        "let me know what you'd like to do next",
    ]

    normalized = sentence.lower().strip()
    normalized = normalized.strip(".!?").strip()

    if normalized not in silent_phrases:
        for audio_chunk in stream_tts(sentence):
            await to_device.put(audio_chunk)

@ -0,0 +1,25 @@
def openAbleton():
    """open ableton"""
    import os

    # Raw strings: "\ " is not a valid escape sequence in a normal literal
    # (it triggers an invalid-escape warning on current Python versions). The
    # command passed to the shell is byte-for-byte the same as before.
    os.system(r"open /Applications/Ableton\ Live\ 10\ Suite.app")

    # Search can be slow if there are many files
    # This will search in the Applications folder only
    applications = "/Applications/"

    # Look for any file that contains "Ableton" in its name. Start from None so
    # a machine without a match no longer fails with UnboundLocalError, and
    # stop walking the tree at the first match instead of scanning all of it.
    ableton_path = None
    for foldername, subfolders, filenames in os.walk(applications):
        for filename in filenames:
            if "Ableton" in filename:
                ableton_path = os.path.join(foldername, filename)
                break
        if ableton_path is not None:
            break

    os.system(r"open /Applications/Ableton\ Live\ 11\ Intro.app")

@ -25,6 +25,8 @@ def convert_mime_type_to_format(mime_type: str) -> str:
return "wav"
if mime_type == "audio/webm":
return "webm"
if mime_type == "audio/raw":
return "dat"
return mime_type
@ -43,6 +45,15 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
# Export to wav
output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
print(mime_type, input_path, output_path)
if mime_type == "audio/raw":
ffmpeg.input(
input_path,
f='s16le',
ar='16000',
ac=1,
).output(output_path).run()
else:
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
try:
@ -68,7 +79,6 @@ def get_transcription_file(wav_file_path: str):
'--file-path', wav_file_path
])
print("Transcription result:", output)
return output
def get_transcription_bytes(audio_bytes: bytearray, mime_type):
@ -93,7 +103,6 @@ def stt_wav(wav_file_path: str):
logger.info(f"openai.BadRequestError: {e}")
return None
logger.info(f"Transcription result: {transcript}")
return transcript
else:
temp_dir = tempfile.gettempdir()
@ -101,7 +110,6 @@ def stt_wav(wav_file_path: str):
ffmpeg.input(wav_file_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
try:
transcript = get_transcription_file(output_path)
print("Transcription result:", transcript)
finally:
os.remove(output_path)
return transcript

@ -0,0 +1,225 @@
# The dynamic system message is where most of the 01's behavior is configured.
# You can put code into the system message {{ in brackets like this }} which will be rendered just before the interpreter starts writing a message.
import os
system_message = r"""
You are the 01, a SCREENLESS executive assistant that can complete **any** task.
When you execute code, it will be executed **on the user's machine**. The user has given you **full and complete permission** to execute any code necessary to complete the task. Execute the code.
You can access the internet. Run **any code** to achieve the goal, and if at first you don't succeed, try again and again.
You can install new packages.
Be concise. Your messages are being read aloud to the user. DO NOT MAKE PLANS. RUN CODE QUICKLY.
Try to spread complex tasks over multiple code blocks. Don't try to complex tasks in one go.
Manually summarize text.
# TASKS
You should help the user manage their tasks.
Store the user's tasks in a Python list called `tasks`.
---
The user's current task is: {{ tasks[0] if tasks else "No current tasks." }}
{{
if len(tasks) > 1:
print("The next task is: ", tasks[1])
}}
---
When the user completes the current task, you should remove it from the list and read the next item by running `tasks = tasks[1:]\ntasks[0]`. Then, tell the user what the next task is.
When the user tells you about a set of tasks, you should intelligently order tasks, batch similar tasks, and break down large tasks into smaller tasks (for this, you should consult the user and get their permission to break it down). Your goal is to manage the task list as intelligently as possible, to make the user as efficient and non-overwhelmed as possible. They will require a lot of encouragement, support, and kindness. Don't say too much about what's ahead of them just try to focus them on each step at a time.
After starting a task, you should check in with the user around the estimated completion time to see if the task is completed.
To do this, schedule a reminder based on estimated completion time using the function `schedule(datetime_object, "Your message here.")`, WHICH HAS ALREADY BEEN IMPORTED. YOU DON'T NEED TO IMPORT THE `schedule` FUNCTION. IT IS AVALIABLE. You'll recieve the message at `datetime_object`.
You guide the user through the list one task at a time, convincing them to move forward, giving a pep talk if need be. Your job is essentially to answer "what should I (the user) be doing right now?" for every moment of the day.
# BROWSER
The Google search result will be returned from this function as a string: `computer.browser.search("query")`
# CRITICAL NOTES
Code output, despite being sent to you by the user, **cannot be seen by the user.** You NEED to tell the user about the output of some code, even if it's exact. >>The user does not have a screen.<<
ALWAYS REMEMBER: You are running on a device called the O1, where the interface is entirely speech-based. Make your responses to the user **VERY short.** DO NOT PLAN. BE CONCISE. WRITE CODE TO RUN IT.
"""
# OLD SYSTEM MESSAGE
old_system_message = r"""
You are the 01, an executive assistant that can complete **any** task.
When you execute code, it will be executed **on the user's machine**. The user has given you **full and complete permission** to execute any code necessary to complete the task. Execute the code.
You can access the internet. Run **any code** to achieve the goal, and if at first you don't succeed, try again and again.
You can install new packages.
Be concise. Your messages are being read aloud to the user. DO NOT MAKE PLANS. Immediatly run code.
Try to spread complex tasks over multiple code blocks.
Manually summarize text. You cannot use other libraries to do this. You MUST MANUALLY SUMMARIZE, WITHOUT CODING.
For the users request, first, choose if you want to use Python, Applescript, Shell, or computer control (below) via Python.
# USER'S TASKS
You should help the user manage their tasks.
Store the user's tasks in a Python list called `tasks`.
---
The user's current task is: {{ tasks[0] if tasks else "No current tasks." }}
{{
if len(tasks) > 1:
print("The next task is: ", tasks[1])
}}
---
When the user completes the current task, you should remove it from the list and read the next item by running `tasks = tasks[1:]\ntasks[0]`. Then, tell the user what the next task is.
When the user tells you about a set of tasks, you should intelligently order tasks, batch similar tasks, and break down large tasks into smaller tasks (for this, you should consult the user and get their permission to break it down). Your goal is to manage the task list as intelligently as possible, to make the user as efficient and non-overwhelmed as possible. They will require a lot of encouragement, support, and kindness. Don't say too much about what's ahead of them just try to focus them on each step at a time.
After starting a task, you should check in with the user around the estimated completion time to see if the task is completed. Use the `schedule(datetime, message)` function, which has already been imported.
To do this, schedule a reminder based on estimated completion time using the function `schedule(datetime_object, "Your message here.")`, WHICH HAS ALREADY BEEN IMPORTED. YOU DON'T NEED TO IMPORT THE `schedule` FUNCTION. IT IS AVALIABLE. You'll recieve the message at `datetime_object`.
You guide the user through the list one task at a time, convincing them to move forward, giving a pep talk if need be. Your job is essentially to answer "what should I (the user) be doing right now?" for every moment of the day.
# COMPUTER CONTROL (RARE)
You are a computer controlling language model. You can 100% control the user's GUI.
You may use the `computer` Python module (already imported) to control the user's keyboard and mouse, if the task **requires** it:
```python
computer.browser.search(query)
computer.display.view() # Shows you what's on the screen, returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
computer.keyboard.hotkey(" ", "command") # Opens spotlight
computer.keyboard.write("hello")
computer.mouse.click("text onscreen") # This clicks on the UI element with that text. Use this **frequently** and get creative! To click a video, you could pass the *timestamp* (which is usually written on the thumbnail) into this.
computer.mouse.move("open recent >") # This moves the mouse over the UI element with that text. Many dropdowns will disappear if you click them. You have to hover over items to reveal more.
computer.mouse.click(x=500, y=500) # Use this very, very rarely. It's highly inaccurate
computer.mouse.click(icon="gear icon") # Moves mouse to the icon with that description. Use this very often
computer.mouse.scroll(-10) # Scrolls down. If you don't find some text on screen that you expected to be there, you probably want to do this
x, y = computer.display.center() # Get your bearings
computer.clipboard.view() # Returns contents of clipboard
computer.os.get_selected_text() # Use frequently. If editing text, the user often wants this
```
You are an image-based AI, you can see images.
Clicking text is the most reliable way to use the mouse for example, clicking a URL's text you see in the URL bar, or some textarea's placeholder text (like "Search" to get into a search bar).
If you use `plt.show()`, the resulting image will be sent to you. However, if you use `PIL.Image.show()`, the resulting image will NOT be sent to you.
It is very important to make sure you are focused on the right application and window. Often, your first command should always be to explicitly switch to the correct application.
When searching the web, use query parameters. For example, https://www.amazon.com/s?k=monitor
Try multiple methods before saying the task is impossible. **You can do it!**
{{
# Add window information
import sys
import os
import json
original_stdout = sys.stdout
sys.stdout = open(os.devnull, 'w')
original_stderr = sys.stderr
sys.stderr = open(os.devnull, 'w')
try:
import pywinctl
active_window = pywinctl.getActiveWindow()
if active_window:
app_info = ""
if "_appName" in active_window.__dict__:
app_info += (
"Active Application: " + active_window.__dict__["_appName"]
)
if hasattr(active_window, "title"):
app_info += "\n" + "Active Window Title: " + active_window.title
elif "_winTitle" in active_window.__dict__:
app_info += (
"\n"
+ "Active Window Title:"
+ active_window.__dict__["_winTitle"]
)
if app_info != "":
print(app_info)
except:
# Non blocking
pass
finally:
sys.stdout = original_stdout
sys.stderr = original_stderr
}}
# SKILLS
Try to use the following functions (assume they're imported) to complete your goals whenever possible:
{{
import sys
import os
import json
from interpreter import interpreter
from pathlib import Path
interpreter.model = "gpt-3.5"
combined_messages = "\\n".join(json.dumps(x) for x in messages[-3:])
#query_msg = interpreter.chat(f"This is the conversation so far: {combined_messages}. What is a <10 words query that could be used to find functions that would help answer the user's question?")
#query = query_msg[0]['content']
query = combined_messages
interpreter.computer.skills.path = '''OI_SKILLS_DIR'''
skills = interpreter.computer.skills.search(query)
lowercase_skills = [skill[0].lower() + skill[1:] for skill in skills]
output = "\\n".join(lowercase_skills)
# VERY HACKY! We should fix this, we hard code it for noisy code^:
print("IGNORE_ALL_ABOVE_THIS_LINE")
print(output)
}}
Remember: You can run Python code outside a function only to run a Python function; all other code must go in a Python function if you first write a Python function. ALL imports must go inside the function.
# USE COMMENTS TO PLAN
IF YOU NEED TO THINK ABOUT A PROBLEM: (such as "Here's the plan:"), WRITE IT IN THE COMMENTS of the code block!
For example:
> User: What is 432/7?
> Assistant: Let me use Python to calculate that.
> Assistant Python function call:
> # Here's the plan:
> # 1. Divide the numbers
> # 2. Round it to 3 digits.
> print(round(432/7, 3))
> Assistant: 432 / 7 is 61.714.
# FINAL MESSAGES
ALWAYS REMEMBER: You are running on a device called the O1, where the interface is entirely speech-based. Make your responses to the user **VERY short.**
""".strip().replace("OI_SKILLS_DIR", os.getenv('OI_SKILLS_PATH'))

@ -1,22 +1,54 @@
from datetime import datetime
from .utils.logs import setup_logging, logger
import tkinter as tk
import tkinter.simpledialog
from interpreter import interpreter
from tkinter import messagebox
from ..utils.accumulator import Accumulator
from tkinter import messagebox, Button, simpledialog, Tk, Label, Frame, LEFT, ACTIVE
import time
import os
import textwrap
from .i import configure_interpreter
interpreter = configure_interpreter(interpreter)
setup_logging()
accumulator = Accumulator()
class Skill:
    """A skill being taught: its name, the accepted steps, and the recorded code."""

    def __init__(self, name: str):
        # Code is accumulated as one string; steps are collected in order.
        self.code = ""
        self.steps = []
        self.skill_name = name
class StepCheckDialog(simpledialog.Dialog):
    """Dialog asking whether the last step was performed correctly.

    The button handlers store "Yes", "No" or "Task Complete" in ``result``
    and then close the dialog.
    """

    def body(self, master):
        # Window title plus a single line of explanatory text.
        self.title("Step Check")
        prompt = "Did I do this step correctly?"
        Label(master, text=prompt).pack()

    def buttonbox(self):
        button_row = Frame(self)

        yes_button = Button(button_row, text="Yes", width=10, command=self.yes_action, default=ACTIVE)
        yes_button.pack(side=LEFT, padx=5, pady=5)

        no_button = Button(button_row, text="No", width=10, command=self.no_action)
        no_button.pack(side=LEFT, padx=5, pady=5)

        complete_button = Button(button_row, text="Task Complete", width=10, command=self.task_complete_action)
        complete_button.pack(side=LEFT, padx=5, pady=5)

        # Keyboard shortcuts: Return confirms, Escape rejects.
        self.bind("<Return>", self.yes_action)
        self.bind("<Escape>", self.no_action)

        button_row.pack()

    def yes_action(self, event=None):
        self.done("Yes")

    def no_action(self, event=None):
        self.done("No")

    def task_complete_action(self, event=None):
        self.done("Task Complete")

    def done(self, result):
        # Record the answer, then close the window.
        self.result = result
        self.destroy()
def to_camel_case(text):
words = text.split()
camel_case_string = words[0].lower() + ''.join(word.title() for word in words[1:])
@ -36,34 +68,41 @@ def generate_python_steps(function_name, steps):
return code_string
def teach():
root = tk.Tk()
root = Tk()
root.withdraw()
skill_name = tkinter.simpledialog.askstring("Skill Name", "Please enter the name for the skill:")
skill_name = simpledialog.askstring("Skill Name", "Please enter the name for the skill:", parent=root)
if skill_name:
skill = Skill(skill_name)
while True:
step = tkinter.simpledialog.askstring("Next Step", "Enter the next step (or 'end' to finish): ")
logger.info(f"Performing step: {step}")
if step == "end":
step = simpledialog.askstring("Next Step", "Enter the next step (or 'end' to finish): ", parent=root)
if step is None or step == "end":
break
elif step.strip() == "":
continue
logger.info(f"Performing step: {step}")
root.update()
chunk_code = ""
interpreter.computer.languages = [l for l in interpreter.computer.languages if l.name.lower() == "python"]
interpreter.force_task_completion = True
for chunk in interpreter.chat(step, stream=True, display=False):
if "format" in chunk and chunk["format"] == "execution":
content = chunk["content"]
language = content["format"]
code = content["content"]
chunk_code += code
interpreter.computer.run(code, language)
for chunk in interpreter.chat(step, stream=True, display=True):
if chunk["role"] == "computer" and "start" not in chunk and "end" not in chunk:
chunk_type = chunk["type"]
chunk_content = chunk["content"]
chunk_format = chunk["format"]
if chunk_type == "confirmation" and chunk_format == "execution" and chunk_content["type"] == "code" and chunk_content["format"] == "python":
chunk_code += chunk_content["content"]
elif chunk_type == "console" and chunk_format == "output" and ("Traceback" in chunk_content or "Error" in chunk_content or "Exception" in chunk_content):
# this was an error so we disregard chunk_code
chunk_code = ""
time.sleep(0.05)
accumulator.accumulate(chunk)
isCorrect = messagebox.askyesno("To Proceed?", "Did I do this step right?")
if isCorrect:
stepCheckDialog = StepCheckDialog(root)
stepCheckResult = stepCheckDialog.result
if stepCheckResult == "Yes" or stepCheckResult == "Task Complete":
skill.steps.append(step)
skill.code += chunk_code
if stepCheckResult == "Task Complete":
break
# Uncomment this incase you want steps instead of code
#python_code = generate_python_steps(skill.skill_name, skill.steps)
@ -71,5 +110,6 @@ def teach():
python_code = generate_python_code(skill.skill_name, skill.code)
SKILLS_DIR = os.path.dirname(__file__) + "/skills"
filename = os.path.join(SKILLS_DIR, f"{skill.skill_name.replace(' ', '_')}.py")
logger.info(f"Saving skill to: {filename}")
with open(filename, "w") as file:
file.write(python_code)

@ -6,6 +6,7 @@ from pydub import AudioSegment
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.
import ffmpeg
import tempfile
from openai import OpenAI
import os
@ -28,11 +29,17 @@ def stream_tts(text):
input=text,
response_format="opus"
)
with tempfile.NamedTemporaryFile(suffix=".opus") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".opus", delete=False) as temp_file:
response.stream_to_file(temp_file.name)
audio_bytes = temp_file.read()
file_type = "bytes.opus"
# TODO: hack to format audio correctly for device
outfile = tempfile.gettempdir() + "/" + "raw.dat"
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run()
with open(outfile, "rb") as f:
audio_bytes = f.read()
file_type = "bytes.raw"
print(outfile, len(audio_bytes))
os.remove(outfile)
else:
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
@ -44,8 +51,14 @@ def stream_tts(text):
'--output_file', output_file
], input=text, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
audio_bytes = temp_file.read()
file_type = "bytes.wav"
# TODO: hack to format audio correctly for device
outfile = tempfile.gettempdir() + "/" + "raw.dat"
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run()
with open(outfile, "rb") as f:
audio_bytes = f.read()
file_type = "bytes.raw"
print(outfile, len(audio_bytes))
os.remove(outfile)
# Stream the audio
yield {"role": "assistant", "type": "audio", "format": file_type, "start": True}

@ -18,9 +18,6 @@ class Accumulator:
if "content" in chunk:
# Display
print(chunk['content'], end="", flush=True)
if any(self.message[key] != chunk[key] for key in self.message if key != "content"):
self.message = chunk
if "content" not in self.message:

@ -1,38 +0,0 @@
import redis
import json
import time
# Set up Redis connection
r = redis.Redis(host='localhost', port=6379, db=0)
def _decode_message(raw):
    """Turn a raw Redis payload into a message dict.

    JSON objects are returned as dicts. Anything else (plain text, numbers)
    is wrapped in a message dict so the caller can always use ``.get``.
    """
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8", errors="replace")
    try:
        decoded = json.loads(raw)
    except (TypeError, ValueError):
        decoded = raw
    if isinstance(decoded, dict):
        return decoded
    # NOTE(review): the role for plain-text payloads is a guess - confirm with producers.
    return {"role": "computer", "type": "message", "content": str(decoded)}


def main(interpreter):
    """Run the core loop: pop messages from Redis and stream replies back.

    Blocks forever. Each message popped from the ``to_core`` list is appended
    to ``conversations/user.json`` and the whole history is passed to
    ``interpreter.chat``; every chunk is pushed to the ``to_interface`` list
    as JSON.

    Args:
        interpreter: object exposing ``chat(messages)`` and ``messages``.
    """
    while True:
        # Check 10x a second for new messages, without delaying one that is ready.
        raw = r.lpop('to_core')
        while raw is None:
            time.sleep(0.1)
            raw = r.lpop('to_core')
        message = _decode_message(raw)

        # Custom stop message will halt us
        content = message.get("content")
        if isinstance(content, str) and content.lower().strip(".,!") == "stop":
            continue

        # Load, append, and save conversation history
        with open("conversations/user.json", "r") as file:
            messages = json.load(file)
        messages.append(message)
        with open("conversations/user.json", "w") as file:
            json.dump(messages, file)

        for chunk in interpreter.chat(messages):
            # Redis only stores bytes/str/numbers, so serialise before pushing.
            r.rpush('to_interface', json.dumps(chunk))

            # If we have a new message, save our progress and go back to the top.
            # NOTE(review): this checks 'to_main' while the loop reads 'to_core' - confirm.
            if r.llen('to_main') > 0:
                with open("conversations/user.json", "w") as file:
                    json.dump(interpreter.messages, file)
                break

@ -1,30 +0,0 @@
from fastapi import FastAPI, Request
import uvicorn
import redis
app = FastAPI()
# Set up Redis connection
r = redis.Redis(host='localhost', port=6379, db=0)
@app.post("/i/")
async def i(request: Request):
    """Receive a message from another interpreter and queue it on ``to_main``.

    The JSON request body is wrapped in a short instruction that includes the
    sender's host, then pushed to Redis as a JSON-encoded message.
    """
    # Local import: this file's import block does not bring in json.
    import json

    message = await request.json()
    client_host = request.client.host  # Get the client's IP address

    message = (
        f"Another interpreter sent this message to you: {message}\n"
        f"To respond, send a POST request to {client_host}/i/."
    )

    # Redis cannot store a dict directly; serialise it first.
    r.lpush("to_main", json.dumps({
        "role": "computer",
        "type": "message",
        "content": message
    }))
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)

@ -1,48 +0,0 @@
import time
import redis
# Set up Redis connection
r = redis.Redis(host='localhost', port=6379, db=0)
def get_dmesg(after, path='/var/log/dmesg'):
    """Return the dmesg lines whose bracketed timestamp is greater than *after*.

    Each line is expected to start with ``[<seconds>]``; the number may be
    space-padded, as in ``[    0.000000]``. Lines with no parsable leading
    timestamp are skipped instead of raising.

    Args:
        after: only lines with a timestamp strictly greater than this are kept.
        path: log file to read; defaults to ``/var/log/dmesg``.

    Returns:
        The matching lines, unmodified (including their trailing newline).
    """
    messages = []
    with open(path, 'r') as file:
        for line in file:
            stamp, closing, _ = line.partition(']')
            if not closing or not stamp.startswith('['):
                continue
            try:
                # float() ignores the padding spaces inside the brackets.
                timestamp = float(stamp[1:])
            except ValueError:
                continue
            if timestamp > after:
                messages.append(line)
    return messages
def custom_filter(message):
    """Decide whether a kernel log line should be forwarded.

    Returns the text between ``{TO_INTERPRETER{`` and ``}TO_INTERPRETER}``
    when the line has a well-formed pair, the whole line when it mentions
    USB or a network-related keyword, and ``None`` otherwise.
    """
    opening = '{TO_INTERPRETER{'
    closing = '}TO_INTERPRETER}'

    # Check for {TO_INTERPRETER{ message here }TO_INTERPRETER} pattern
    start = message.find(opening)
    if start != -1:
        start += len(opening)
        end = message.find(closing, start)
        # Only accept a closing marker that follows the opening one; otherwise
        # find() returns -1 and slicing would yield garbage.
        if end != -1:
            return message[start:end]

    # Check for USB mention
    if 'USB' in message:
        return message

    # Check for network related keywords
    network_keywords = ['network', 'IP', 'internet', 'LAN', 'WAN', 'router', 'switch']
    if any(keyword in message for keyword in network_keywords):
        return message

    return None
# Poll the kernel log every 5 seconds and forward the interesting lines to core.
# NOTE(review): last_timestamp is wall-clock time, while get_dmesg compares it
# against the bracketed timestamps in the log - confirm both use the same clock.
last_timestamp = time.time()

while True:
    messages = get_dmesg(after=last_timestamp)
    last_timestamp = time.time()

    messages_for_core = []
    for message in messages:
        # Forward what the filter returns (e.g. the extracted TO_INTERPRETER
        # payload), not the raw line.
        filtered = custom_filter(message)
        if filtered:
            messages_for_core.append(filtered)

    if messages_for_core:
        r.rpush('to_core', "\n".join(messages_for_core))

    time.sleep(5)

@ -1,84 +0,0 @@
from core import main
from interpreter import interpreter
import os
import glob
import json
### SYSTEM MESSAGE
# The system message is where most of the 01's behavior is configured.
# You can put code into the system message {{ in brackets like this }} which will be rendered just before the interpreter starts writing a message.
system_message = """
You are an executive assistant AI that helps the user manage their tasks. You can run Python code.
Store the user's tasks in a Python list called `tasks`.
---
The user's current task is: {{ tasks[0] if tasks else "No current tasks." }}
{{
if len(tasks) > 1:
print("The next task is: ", tasks[1])
}}
---
When the user completes the current task, you should remove it from the list and read the next item by running `tasks = tasks[1:]\ntasks[0]`. Then, tell the user what the next task is.
When the user tells you about a set of tasks, you should intelligently order tasks, batch similar tasks, and break down large tasks into smaller tasks (for this, you should consult the user and get their permission to break it down). Your goal is to manage the task list as intelligently as possible, to make the user as efficient and non-overwhelmed as possible. They will require a lot of encouragement, support, and kindness. Don't say too much about what's ahead of them just try to focus them on each step at a time.
After starting a task, you should check in with the user around the estimated completion time to see if the task is completed. Use the `schedule(datetime, message)` function, which has already been imported.
To do this, schedule a reminder based on estimated completion time using the function `schedule(datetime_object, "Your message here.")`, WHICH HAS ALREADY BEEN IMPORTED. YOU DON'T NEED TO IMPORT THE `schedule` FUNCTION. IT IS AVAILABLE. You'll receive the message at `datetime_object`.
You guide the user through the list one task at a time, convincing them to move forward, giving a pep talk if need be. Your job is essentially to answer "what should I (the user) be doing right now?" for every moment of the day.
Remember: You can run Python code. Be very concise. Ensure that you actually run code every time! THIS IS IMPORTANT. You NEED to write code. **Help the user by being very concise in your answers.** Do not break down tasks excessively, just into simple, few minute steps. Don't assume the user lives their life in a certain way— pick very general tasks if you're breaking a task down.
""".strip()
interpreter.custom_instructions = system_message
### TOOLS
for file in glob.glob('interpreter/tools/*.py'):
with open(file, 'r') as f:
for chunk in interpreter.computer.run("python", f.read()):
print(chunk)
### LLM SETTINGS
# Local settings
# interpreter.llm.model = "local"
# interpreter.llm.api_base = "https://localhost:8080/v1" # Llamafile default
# interpreter.llm.max_tokens = 1000
# interpreter.llm.context_window = 3000
# Hosted settings
interpreter.llm.api_key = os.getenv('OPENAI_API_KEY')
interpreter.llm.model = "gpt-4-0125-preview"
interpreter.auto_run = True
# interpreter.force_task_completion = True
### MISC SETTINGS
interpreter.offline = True
interpreter.id = 206 # Used to identify itself to other interpreters. This should be changed programatically so it's unique.
### RESET conversations/user.json
script_dir = os.path.dirname(os.path.abspath(__file__))
user_json_path = os.path.join(script_dir, 'conversations', 'user.json')
with open(user_json_path, 'w') as file:
json.dump([], file)
### START CORE
main(interpreter)

@ -1,239 +0,0 @@
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.
import asyncio
import threading
import os
import pyaudio
from starlette.websockets import WebSocket
from queue import Queue
from pynput import keyboard
import json
import traceback
import websockets
import queue
import pydub
import ast
from pydub import AudioSegment
from pydub.playback import play
import io
import time
import wave
import tempfile
from datetime import datetime
from interpreter import interpreter # Just for code execution. Maybe we should let people do from interpreter.computer import run?
from utils.kernel import put_kernel_messages_into_queue
from utils.get_system_info import get_system_info
from stt import stt_wav
from utils.logs import setup_logging
from utils.logs import logger
setup_logging()
# Configuration for Audio Recording
CHUNK = 1024 # Record in chunks of 1024 samples
FORMAT = pyaudio.paInt16 # 16 bits per sample
CHANNELS = 1 # Mono
RATE = 44100 # Sample rate
RECORDING = False # Flag to control recording state
SPACEBAR_PRESSED = False # Flag to track spacebar press state
# Specify OS
current_platform = get_system_info()
# Initialize PyAudio
p = pyaudio.PyAudio()
def record_audio():
    """Record audio from the microphone and add it to the queue.

    Records into a temporary WAV file while the global ``RECORDING`` flag is
    set. Depending on ``STT_RUNNER`` it then queues either the raw audio
    ("server") or the transcribed text ("device") on ``send_queue``, framed
    by start/end flag messages. A recording shorter than 0.3 seconds is
    treated as a tap and sends a stop message instead.

    Raises:
        Exception: if ``STT_RUNNER`` is neither 'device' nor 'server'.
    """
    stt_runner = os.getenv('STT_RUNNER')
    if stt_runner == "server":
        # STT will happen on the server. we're sending audio.
        base = {"role": "user", "type": "audio", "format": "audio/wav"}
    elif stt_runner == "device":
        # STT will happen here, on the device. we're sending text.
        base = {"role": "user", "type": "message"}
    else:
        raise Exception("STT_RUNNER must be set to either 'device' or 'server'.")
    send_queue.put({**base, "start": True})

    # Create a temporary WAV file to store the audio data
    temp_dir = tempfile.gettempdir()
    wav_path = os.path.join(temp_dir, f"audio_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")

    try:
        stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
        try:
            logger.info("Recording started...")
            with wave.open(wav_path, 'wb') as wav_file:
                wav_file.setnchannels(CHANNELS)
                wav_file.setsampwidth(p.get_sample_size(FORMAT))
                wav_file.setframerate(RATE)

                while RECORDING:
                    data = stream.read(CHUNK, exception_on_overflow=False)
                    wav_file.writeframes(data)

                # Read the frame count while the writer is still open.
                frames_written = wav_file.getnframes()
        finally:
            # Always release the input stream, even if recording failed.
            stream.stop_stream()
            stream.close()
        logger.info("Recording stopped.")

        duration = frames_written / RATE
        if duration < 0.3:
            # Just pressed it. Send stop message
            stop_content = "stop" if stt_runner == "device" else ""
            send_queue.put({**base, "content": stop_content})
        elif stt_runner == "device":
            # Run stt then send text
            text = stt_wav(wav_path)
            send_queue.put({**base, "content": text})
        else:
            # Stream audio
            with open(wav_path, 'rb') as audio_file:
                byte_data = audio_file.read(CHUNK)
                while byte_data:
                    send_queue.put({**base, "content": str(byte_data)})
                    byte_data = audio_file.read(CHUNK)
        send_queue.put({**base, "end": True})
    finally:
        # Never leave the temporary recording behind.
        if os.path.exists(wav_path):
            os.remove(wav_path)
def toggle_recording(state):
    """Start or stop recording based on the button/spacebar state.

    A truthy *state* starts a recording thread unless the key is already
    held; a falsy *state* clears both flags if the key was held.
    """
    global RECORDING, SPACEBAR_PRESSED
    if state:
        if SPACEBAR_PRESSED:
            # Key repeat while held down: nothing to do.
            return
        SPACEBAR_PRESSED = True
        if not RECORDING:
            RECORDING = True
            threading.Thread(target=record_audio).start()
        return
    if SPACEBAR_PRESSED:
        SPACEBAR_PRESSED = False
        RECORDING = False
def on_press(key):
    """Start recording when the spacebar is pressed."""
    spacebar_down = key == keyboard.Key.space
    if spacebar_down:
        toggle_recording(True)
def on_release(key):
    """Stop recording on spacebar release; exit the process on ESC."""
    if key == keyboard.Key.space:
        toggle_recording(False)
        return
    if key == keyboard.Key.esc:
        logger.info("Exiting...")
        os._exit(0)
import asyncio
send_queue = queue.Queue()
async def message_sender(websocket):
    """Forward every message placed on ``send_queue`` to the websocket as JSON."""
    while True:
        loop = asyncio.get_event_loop()
        # The blocking queue read runs in the default executor.
        outgoing = await loop.run_in_executor(None, send_queue.get)
        payload = json.dumps(outgoing)
        await websocket.send(payload)
        send_queue.task_done()
# Connect to the server at WS_URL and process incoming messages forever.
# On any exception the bare `except` logs a "Connecting to ..." line, sleeps
# 2 seconds and retries the connection.
# NOTE(review): indentation was lost in this view; the nesting of the branches
# below is not recoverable from the text alone - verify against the real file.
async def websocket_communication(WS_URL):
while True:
try:
async with websockets.connect(WS_URL) as websocket:
logger.info("Press the spacebar to start/stop recording. Press ESC to exit.")
# Outgoing traffic is handled by a separate task draining send_queue.
# NOTE(review): a new sender task is created on every (re)connect and is never
# cancelled here - confirm this is intended.
asyncio.create_task(message_sender(websocket))
initial_message = {"role": None, "type": None, "format": None, "content": None}
# NOTE(review): message_so_far is bound to the same dict object as
# initial_message, not a copy.
message_so_far = initial_message
while True:
message = await websocket.recv()
logger.debug(f"Got this message from the server: {type(message)} {message}")
# Text frames carry JSON.
if type(message) == str:
message = json.loads(message)
# An "end" flag resets the accumulated message.
if message.get("end"):
logger.debug(f"Complete message from the server: {message_so_far}")
logger.info("\n")
message_so_far = initial_message
if "content" in message:
print(message['content'], end="", flush=True)
# A change in any non-content field starts a new accumulated message;
# otherwise the content is appended to the current one.
if any(message_so_far[key] != message[key] for key in message_so_far if key != "content"):
message_so_far = message
else:
message_so_far["content"] += message["content"]
if message["type"] == "audio" and "content" in message:
# Audio content is parsed with ast.literal_eval and converted to bytes.
audio_bytes = bytes(ast.literal_eval(message["content"]))
# Convert bytes to audio file
audio_file = io.BytesIO(audio_bytes)
audio = AudioSegment.from_mp3(audio_file)
# Play the audio
play(audio)
await asyncio.sleep(1)
# Run the code if that's the device's job
if os.getenv('CODE_RUNNER') == "device":
if message["type"] == "code" and "end" in message:
language = message_so_far["format"]
code = message_so_far["content"]
result = interpreter.computer.run(language, code)
# NOTE(review): message_sender passes queue items to json.dumps - confirm
# that `result` is JSON-serialisable.
send_queue.put(result)
except:
# traceback.print_exc()
logger.info(f"Connecting to `{WS_URL}`...")
await asyncio.sleep(2)
# Script entry point: read the server URL from the environment, start the
# websocket task, then wire up the push-to-talk input (GPIO button on a
# Raspberry Pi, spacebar elsewhere).
# NOTE(review): indentation was lost in this view; nesting is not recoverable
# from the text alone - verify against the real file.
if __name__ == "__main__":
async def main():
# Configuration for WebSocket
WS_URL = os.getenv('SERVER_CONNECTION_URL')
if not WS_URL:
raise ValueError("The environment variable SERVER_CONNECTION_URL is not set. Please set it to proceed.")
# Start the WebSocket communication
asyncio.create_task(websocket_communication(WS_URL))
# Start watching the kernel if it's your job to do that
if os.getenv('CODE_RUNNER') == "device":
asyncio.create_task(put_kernel_messages_into_queue(send_queue))
#If Raspberry Pi, add the button listener, otherwise use the spacebar
if current_platform.startswith("raspberry-pi"):
logger.info("Raspberry Pi detected, using button on GPIO pin 15")
# Use GPIO pin 15
pindef = ["gpiochip4", "15"] # gpiofind PIN15
print("PINDEF", pindef)
# HACK: needs passwordless sudo
process = await asyncio.create_subprocess_exec("sudo", "gpiomon", "-brf", *pindef, stdout=asyncio.subprocess.PIPE)
# Read gpiomon's output line by line; an empty read ends the loop.
while True:
line = await process.stdout.readline()
if line:
line = line.decode().strip()
# FALLING is treated as release, RISING as press.
if "FALLING" in line:
toggle_recording(False)
elif "RISING" in line:
toggle_recording(True)
else:
break
else:
# Keyboard listener for spacebar press/release
# NOTE(review): nothing visibly awaits after listener.start() on this branch;
# confirm main() does not return (and end asyncio.run) immediately.
listener = keyboard.Listener(on_press=on_press, on_release=on_release)
listener.start()
asyncio.run(main())
p.terminate()

@ -1,103 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Rotating Glowing Circle</title>
<style>
body,
html {
height: 100%;
margin: 0;
display: flex;
justify-content: center;
align-items: center;
background-color: black;
}
.circles {
margin: 0;
display: flex;
justify-content: center;
align-items: center;
width: 200px;
height: 200px;
border-radius: 50%;
animation: rotator 48s linear infinite;
}
.center-circle {
position: absolute;
width: 200px;
height: 200px;
border: 1px solid white;
border-radius: 50%;
background-color: transparent;
}
.center-circle-2 {
position: absolute;
width: 190px;
height: 190px;
opacity: 0.2;
border: 1px solid white;
border-radius: 50%;
background-color: transparent;
}
.glow-circle {
position: absolute;
width: 250px;
height: 250px;
border-radius: 50%;
background-color: transparent;
box-shadow: 0 0 60px 30px black;
/* Initial position of the glow circle, offset from the center */
top: 50%;
left: 50%;
margin-top: -125px;
/* Half the height of the circle */
margin-left: -125px;
/* Half the width of the circle */
/* Animation properties */
animation: rotateAround 6s linear infinite;
}
@keyframes rotateAround {
0% {
transform: translateX(240px) translateY(240px);
}
50% {
transform: translateX(0px) translateY(0px);
}
100% {
transform: translateX(-240px) translateY(-240px);
}
}
@keyframes rotator {
0% {
transform: rotate(0deg);
}
100% {
transform: rotate(360deg);
}
}
</style>
</head>
<body>
<div class="circles">
<div class="center-circle"></div>
<div class="glow-circle"></div>
<div class="center-circle-2"></div>
</div>
</body>
</html>

@ -1,16 +0,0 @@
<div class="centered-circle"></div>
<script>
ws = new WebSocket("ws://localhost/server")
ws.onmessage = event => {
if (event.data == "user_start_message") {
document.body.style.backgroundColor = "white"
document.querySelector('.centered-circle')
.style.backgroundColor = "black"
} else if (event.data == "user_end_message") {
document.body.style.backgroundColor = "black"
document.querySelector('.centered-circle')
.style.backgroundColor = "white"
}
}
</script>

@ -1,103 +0,0 @@
import redis
import RPi.GPIO as GPIO
import asyncio
import websockets
import sounddevice as sd
import numpy as np
import time
import re
# Placeholder: not implemented yet, so it currently returns None.
# main() passes it the list of recorded audio chunks.
def transcribe(audio_chunks):
pass # (todo)
# Placeholder for speech output: not implemented yet.
def say(text):
# This should immediately stop if button is pressed (if GPIO.input(18))
pass # (todo)
# Connect to button
GPIO.setmode(GPIO.BCM)
GPIO.setup(18, GPIO.IN, pull_up_down=GPIO.PUD_UP)
# Set the duration and sample rate for the mic
chunk_duration = 0.5 # seconds
sample_rate = 44100 # Hz
# Set up Redis connection
r = redis.Redis(host='localhost', port=6379, db=0)
# Set up websocket connection
websocket = websockets.connect('ws://localhost:8765')
# This is so we only say() full sentences
accumulated_text = ""
def is_full_sentence(text):
    """Return True when *text* ends with '.', '!' or '?'."""
    terminators = ('.', '!', '?')
    return text.endswith(terminators)
def split_into_sentences(text):
    """Split *text* at whitespace that follows '.', '!' or '?'."""
    boundary = re.compile(r'(?<=[.!?])\s+')
    return boundary.split(text)
async def send_to_websocket(message):
    """Enter the module-level websocket connection and send *message* on it."""
    async with websocket as connection:
        await connection.send(message)
async def check_websocket():
    """Enter the module-level websocket connection and return one received message."""
    async with websocket as connection:
        incoming = await connection.recv()
    return incoming
# Main device loop: while the button on GPIO 18 is held, record audio,
# transcribe it and send the message to core and the websocket; otherwise
# relay chunks from the 'to_interface' queue and speak them sentence by sentence.
# NOTE(review): indentation was lost in this view; nesting (especially of the
# trailing `else:` branches) is not recoverable from the text - verify.
# NOTE(review): send_to_websocket and check_websocket are declared `async def`
# but are called here without `await` from a plain function - confirm they run.
# NOTE(review): accumulated_text is assigned in this function without a
# `global` declaration although it is defined at module level - confirm.
# NOTE(review): dicts are passed to r.rpush and chunk["content"] is read from
# an r.lpop result - confirm how payloads are serialised.
def main():
while True:
# If the button is pushed down
if not GPIO.input(18):
# Tell websocket and core that the user is speaking
send_to_websocket({"role": "user", "type": "message", "start": True}) # Standard start flag, required per streaming LMC protocol (https://docs.openinterpreter.com/guides/streaming-response)
r.rpush('to_core', {"role": "user", "type": "message", "content": "stop"}) # Custom stop message. Core is not streaming LMC (it's static LMC) so doesn't require that ^ flag
# Record audio from the microphone in chunks
audio_chunks = []
# Continue recording until the button is released
while not GPIO.input(18):
chunk = sd.rec(int(chunk_duration * sample_rate), samplerate=sample_rate, channels=2)
sd.wait() # Wait until recording is finished
audio_chunks.append(chunk)
# Transcribe
text = transcribe(audio_chunks)
message = {"role": "user", "type": "message", "content": text, "time": time.time()}
# Send message to core and websocket
r.rpush('to_core', message)
send_to_websocket(message)
# Send user message end flag to websocket, required per streaming LMC protocol
send_to_websocket({"role": "user", "type": "message", "end": True})
# Send out anything in the to_interface queue
chunk = r.lpop('to_interface')
if chunk:
send_to_websocket(chunk)
accumulated_text += chunk["content"]
# Speak full sentences out loud
sentences = split_into_sentences(accumulated_text)
# If the last piece is complete, speak everything; otherwise keep the
# unfinished tail for the next chunk.
if is_full_sentence(sentences[-1]):
for sentence in sentences:
say(sentence)
accumulated_text = ""
else:
for sentence in sentences[:-1]:
say(sentence)
accumulated_text = sentences[-1]
else:
say(accumulated_text)
accumulated_text = ""
# Forward anything received on the websocket to core.
message = check_websocket()
if message:
r.rpush('to_core', message)
if __name__ == "__main__":
main()

@ -1,57 +0,0 @@
"""
Listens to chunks of audio recorded by user.
Run `python listen.py` to start the server, then `cd user` and run `python record.py` to record audio.
"""
from fastapi import FastAPI, WebSocket
import uvicorn
import json
from stt import stt
import tempfile
app = FastAPI()
# Websocket endpoint at /user: collects binary frames into one audio buffer and,
# when an "end" control message arrives, transcribes the buffer with stt() and
# sends the transcript back as JSON.
# NOTE(review): indentation was lost in this view; nesting of the branches is
# not recoverable from the text alone - verify against the real file.
@app.websocket("/user")
async def user(ws: WebSocket):
await ws.accept()
# Bytes received for the audio file currently being uploaded.
audio_file = bytearray()
mime_type = None
try:
while True:
message = await ws.receive()
if message['type'] == 'websocket.disconnect':
break
if message['type'] == 'websocket.receive':
# Text frames are JSON control messages.
if 'text' in message:
control_message = json.loads(message['text'])
if control_message.get('action') == 'command' and control_message.get('state') == 'start' and 'mimeType' in control_message:
# This indicates the start of a new audio file
mime_type = control_message.get('mimeType')
elif control_message.get('action') == 'command' and control_message.get('state') == 'end':
# This indicates the end of the audio file
# Process the complete audio file here
transcription = stt(audio_file, mime_type)
await ws.send_json({"transcript": transcription})
print("SENT TRANSCRIPTION!")
# Reset the bytearray for the next audio file
audio_file = bytearray()
mime_type = None
elif 'bytes' in message:
# If it's not a control message, it's part of the audio file
audio_file.extend(message['bytes'])
except Exception as e:
print(f"WebSocket connection closed with exception: {e}")
finally:
# NOTE(review): close() also runs after a client disconnect - confirm it
# cannot raise on an already-closed socket.
await ws.close()
print("WebSocket connection closed")
if __name__ == "__main__":
with tempfile.TemporaryDirectory():
uvicorn.run(app, host="0.0.0.0", port=8000)

@ -1,146 +0,0 @@
"""
Handles everything the user interacts through.
Connects to a websocket at /user. Sends shit to it, and displays/plays the shit it sends back.
For now, just handles a spacebar being pressed for the duration it's pressed,
it should record audio.
"""
import os
import pyaudio
import threading
import asyncio
import websocket
import time
import json
from pynput import keyboard
import wave
import tempfile
from datetime import datetime
# Configuration
chunk = 1024 # Record in chunks of 1024 samples
sample_format = pyaudio.paInt16 # 16 bits per sample
channels = 1 # Mono
fs = 48000 # Sample rate
p = pyaudio.PyAudio() # Create an interface to PortAudio
frames = [] # Initialize array to store frames
recording = False # Flag to control recording state
ws_chunk_size = 4096 # Websocket stream chunk size
port = os.getenv('ASSISTANT_PORT', 8000)
ws_url = f"ws://localhost:{port}/user"
while True:
try:
ws = websocket.create_connection(ws_url)
break
except ConnectionRefusedError:
time.sleep(1)
# Record from the microphone until the global `recording` flag is cleared, then
# save the frames to a WAV file and stream it over a websocket in
# ws_chunk_size pieces, framed by start/end JSON messages.
# NOTE(review): indentation was lost in this view; nesting is not recoverable
# from the text alone - verify against the real file.
# NOTE(review): this uses `websockets.connect`, but the imports visible for
# this file only bring in `websocket` - confirm the name resolves.
# NOTE(review): the URL is hard-coded here instead of using `ws_url` - confirm.
async def start_recording():
global recording
if recording:
return # Avoid multiple starts
recording = True
frames.clear() # Clear existing frames
stream = p.open(format=sample_format,
channels=channels,
rate=fs,
frames_per_buffer=chunk,
input=True)
print("Recording started...")
async with websockets.connect("ws://localhost:8000/user") as websocket:
# Send the start command with mime type
await websocket.send(json.dumps({"role": "user", "type": "audio", "format": "audio/wav", "start": True}))
# Collect audio until stop_recording() clears the flag.
while recording:
data = stream.read(chunk)
frames.append(data)
stream.stop_stream()
stream.close()
try:
file_path = save_recording(frames)
with open(file_path, 'rb') as audio_file:
byte_chunk = audio_file.read(ws_chunk_size)
while byte_chunk:
# Each chunk is sent as the str() of the bytes object inside a JSON message.
await websocket.send(json.dumps({"role": "user", "type": "audio", "format": "audio/wav", "content": str(byte_chunk)}))
byte_chunk = audio_file.read(ws_chunk_size)
finally:
# NOTE(review): file_path is unbound here if save_recording() raised - confirm.
os.remove(file_path)
# Send the end command
await websocket.send(json.dumps({"role": "user", "type": "audio", "format": "audio/wav", "end": True}))
# Receive a json message and then close the connection
message = await websocket.recv()
print("Received message:", json.loads(message))
print("Recording stopped.")
def save_recording(frames) -> str:
    """Write *frames* to a timestamped WAV file in the temp directory.

    Returns:
        The path of the file that was written.
    """
    stamp = datetime.now().strftime('%Y%m%d%H%M%S%f')
    output_path = os.path.join(tempfile.gettempdir(), f"input_{stamp}.wav")

    with wave.open(output_path, 'wb') as wav_out:
        # Audio format comes from the module-level recording configuration.
        wav_out.setnchannels(channels)
        wav_out.setsampwidth(p.get_sample_size(sample_format))
        wav_out.setframerate(fs)
        wav_out.writeframes(b''.join(frames))

    return output_path
def start_recording_sync():
    """Run ``start_recording()`` to completion on a fresh event loop.

    Intended as a thread target: it creates a new event loop, installs it as
    the current loop, and always closes it, even if the recording coroutine
    raises.
    """
    # Create a new event loop for the thread
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # Run the asyncio event loop
        loop.run_until_complete(start_recording())
    finally:
        loop.close()
def stop_recording():
    """Clear the global ``recording`` flag and report that recording stopped."""
    global recording
    recording = False
    status = "Stopped recording"
    print(status)
def toggle_recording():
    """Stop the current recording, or start a new one in a background thread."""
    global recording
    if recording:
        stop_recording()
        return
    # Start recording in a new thread to avoid blocking
    print("Starting recording")
    worker = threading.Thread(target=start_recording_sync)
    worker.start()
is_space_pressed = False # Flag to track the state of the spacebar
def on_press(key):
    """Toggle recording on the first spacebar press; ignore repeats while held."""
    global is_space_pressed
    if key != keyboard.Key.space or is_space_pressed:
        return
    is_space_pressed = True
    toggle_recording()
def on_release(key):
    """Stop recording when the spacebar is released; return False on ESC."""
    global is_space_pressed
    spacebar_released = key == keyboard.Key.space and is_space_pressed
    if spacebar_released:
        is_space_pressed = False
        stop_recording()
    if key == keyboard.Key.esc:
        # Stop listener
        return False
# Collect events until released
with keyboard.Listener(on_press=on_press, on_release=on_release) as listener:
with tempfile.TemporaryDirectory():
print("Press the spacebar to start/stop recording. Press ESC to exit.")
listener.join()
p.terminate()

@ -1,32 +0,0 @@
"""
Exposes an SSE streaming server endpoint at /run, which receives language and code,
and streams the output.
"""
from dotenv import load_dotenv
load_dotenv() # take environment variables from .env.
import os
import json
from interpreter import interpreter
import uvicorn
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
# Request body for POST /run: run_code passes `language` and `code` to
# interpreter.computer.run.
class Code(BaseModel):
language: str
code: str
app = FastAPI()
@app.post("/run")
async def run_code(code: Code):
    """Run the submitted code and stream each output chunk back as JSON."""

    def serialized_chunks():
        # The interpreter is only invoked once the response starts iterating.
        chunks = interpreter.computer.run(code.language, code.code)
        for item in chunks:
            yield json.dumps(item)

    response = StreamingResponse(serialized_chunks())
    return response
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=int(os.getenv('COMPUTER_PORT', 9000)))

5006
01OS/poetry.lock generated

File diff suppressed because one or more lines are too long

@ -22,9 +22,10 @@ python-dotenv = "^1.0.1"
ffmpeg-python = "^0.2.0"
textual = "^0.50.1"
pydub = "^0.25.1"
open-interpreter = "^0.2.0"
ngrok = "^1.0.0"
simpleaudio = "^1.0.4"
opencv-python = "^4.9.0.80"
open-interpreter = {version = "0.2.1-pre-r", extras = ["os"]}
[build-system]
requires = ["poetry-core"]

@ -74,6 +74,10 @@ if [[ "$@" == *"--clear-local"* ]]; then
exit 0
fi
### SKILLS PATH
OI_SKILLS_PATH="$SCRIPT_DIR/01OS/server/skills"
### SETUP
if [[ "$ALL_LOCAL" == "True" ]]; then

@ -8,7 +8,7 @@ echo "Starting up localtunnel service for port $SERVER_LOCAL_PORT on localhost..
npx localtunnel --port $SERVER_LOCAL_PORT | while IFS= read -r line; do
if [[ "$line" == "your url is: https://"* ]]; then
echo "Tunnel is up!"
echo "Please set your client env variable for SERVER_CONNECTION_URL=wss://${line:21}"
echo "Please set your client env variable for SERVER_URL=wss://${line:21}"
break
fi
done

Loading…
Cancel
Save