Merge pull request #48 from vgel/theia/websockets

hacky device support
pull/47/head^2
killian 11 months ago committed by GitHub
commit 387bc00a20
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -70,6 +70,7 @@ void hexdump(const void *mem, uint32_t len, uint8_t cols = 16) {
} }
void InitI2SSpeakerOrMic(int mode) { void InitI2SSpeakerOrMic(int mode) {
Serial.printf("InitI2sSpeakerOrMic %d\n", mode);
esp_err_t err = ESP_OK; esp_err_t err = ESP_OK;
i2s_driver_uninstall(SPEAKER_I2S_NUMBER); i2s_driver_uninstall(SPEAKER_I2S_NUMBER);
@ -136,17 +137,19 @@ void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) {
break; break;
case WStype_TEXT: case WStype_TEXT:
Serial.printf("[WSc] get text: %s\n", payload); Serial.printf("[WSc] get text: %s\n", payload);
if ((char)payload[0] == 's'){ {
Serial.println("start"); std::string str(payload, payload + length);
bool isAudio = str.find("\"audio\"") != std::string::npos;
if (isAudio && str.find("\"start\"") != std::string::npos) {
Serial.println("start playback");
speaker_offset = 0; speaker_offset = 0;
InitI2SSpeakerOrMic(MODE_SPK); InitI2SSpeakerOrMic(MODE_SPK);
} } else if (isAudio && str.find("\"end\"") != std::string::npos) {
if ((char)payload[0] == 'e'){ Serial.println("end playback");
Serial.println("end");
// speaker_play(speakerdata0, speaker_offset); // speaker_play(speakerdata0, speaker_offset);
// speaker_offset = 0; // speaker_offset = 0;
} }
}
// send message to server // send message to server
// webSocket.sendTXT("message here"); // webSocket.sendTXT("message here");
break; break;
@ -180,12 +183,12 @@ void websocket_setup() {
Serial.println("connecting to WiFi"); Serial.println("connecting to WiFi");
} }
Serial.println("connected to WiFi"); Serial.println("connected to WiFi");
webSocket.begin(COMPUTER_IP, 9001, "/"); webSocket.begin(COMPUTER_IP, 8000, "/");
webSocket.onEvent(webSocketEvent); webSocket.onEvent(webSocketEvent);
// webSocket.setAuthorization("user", "Password"); // webSocket.setAuthorization("user", "Password");
webSocket.setReconnectInterval(5000); webSocket.setReconnectInterval(5000);
} }
void setup() { void setup() {
M5.begin(true, false, true); M5.begin(true, false, true);
M5.dis.drawpix(0, CRGB(128, 128, 0)); M5.dis.drawpix(0, CRGB(128, 128, 0));
@ -208,17 +211,19 @@ void loop() {
button.loop(); button.loop();
if (button.justPressed()) { if (button.justPressed()) {
Serial.println("Recording..."); Serial.println("Recording...");
webSocket.sendTXT("s"); webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"start\": true}");
InitI2SSpeakerOrMic(MODE_MIC); InitI2SSpeakerOrMic(MODE_MIC);
recording = true; recording = true;
data_offset = 0;
Serial.println("Recording ready.");
} else if (button.justReleased()) { } else if (button.justReleased()) {
Serial.println("Stopped recording."); Serial.println("Stopped recording.");
webSocket.sendTXT("e"); webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"end\": true}");
flush_microphone(); flush_microphone();
recording = false; recording = false;
} data_offset = 0;
} else if (recording) {
if (recording) { Serial.printf("Reading chunk at %d...\n", data_offset);
size_t bytes_read; size_t bytes_read;
i2s_read( i2s_read(
SPEAKER_I2S_NUMBER, SPEAKER_I2S_NUMBER,
@ -226,13 +231,13 @@ void loop() {
DATA_SIZE, &bytes_read, (100 / portTICK_RATE_MS) DATA_SIZE, &bytes_read, (100 / portTICK_RATE_MS)
); );
data_offset += bytes_read; data_offset += bytes_read;
Serial.printf("Read %d bytes in chunk.\n", bytes_read);
if (data_offset > 1024*10) { if (data_offset > 1024*9) {
flush_microphone(); flush_microphone();
} }
} }
M5.update(); M5.update();
webSocket.loop(); webSocket.loop();
} }

@ -43,6 +43,18 @@ def configure_interpreter(interpreter: OpenInterpreter):
Remember: You can run Python code. Be very concise. Ensure that you actually run code every time! THIS IS IMPORTANT. You NEED to write code. **Help the user by being very concise in your answers.** Do not break down tasks excessively, just into simple, few minute steps. Don't assume the user lives their life in a certain way— pick very general tasks if you're breaking a task down. Remember: You can run Python code. Be very concise. Ensure that you actually run code every time! THIS IS IMPORTANT. You NEED to write code. **Help the user by being very concise in your answers.** Do not break down tasks excessively, just into simple, few minute steps. Don't assume the user lives their life in a certain way— pick very general tasks if you're breaking a task down.
ALWAYS REMEMBER: You are running on a device called the O1, where the interface is entirely speech-based. Keep your responses succint in light of this!
IF YOU NEED TO THINK ABOUT A PROBLEM: (such as "Here's the plan:"), WRITE IT IN THE COMMENTS of the code block!
For example:
> User: What is 432/7?
> Assistant: Let me use Python to calculate that.
> Assistant Python function call:
> # Here's the plan:
> # 1. Divide the numbers
> # 2. Round it to 3 digits.
> print(round(432/7, 3))
> Assistant: 432 / 7 is 61.714.
Use the following functions (assume they're imported) to complete your goals whenever possible: Use the following functions (assume they're imported) to complete your goals whenever possible:
{{ {{
import sys import sys
@ -70,7 +82,9 @@ print(output)
""".strip() """.strip()
interpreter.custom_instructions = system_message # interpreter.custom_instructions = system_message
interpreter.system_message = system_message
interpreter.llm.supports_functions = True
### LLM SETTINGS ### LLM SETTINGS

@ -201,7 +201,7 @@ async def listener():
accumulated_text = "" accumulated_text = ""
for chunk in interpreter.chat(messages, stream=True, display=False): for chunk in interpreter.chat(messages, stream=True, display=True):
logger.debug("Got chunk:", chunk) logger.debug("Got chunk:", chunk)
@ -212,7 +212,7 @@ async def listener():
if os.getenv('TTS_RUNNER') == "server": if os.getenv('TTS_RUNNER') == "server":
# Speak full sentences out loud # Speak full sentences out loud
if chunk["role"] == "assistant" and "content" in chunk: if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
accumulated_text += chunk["content"] accumulated_text += chunk["content"]
sentences = split_into_sentences(accumulated_text) sentences = split_into_sentences(accumulated_text)
@ -241,7 +241,7 @@ async def listener():
# Check if it's just an end flag. We ignore those. # Check if it's just an end flag. We ignore those.
temp_message = await from_user.get() temp_message = await from_user.get()
if temp_message == {'role': 'user', 'type': 'message', 'end': True}: if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
# Yup. False alarm. # Yup. False alarm.
continue continue
else: else:
@ -251,8 +251,9 @@ async def listener():
with open(conversation_history_path, 'w') as file: with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4) json.dump(interpreter.messages, file, indent=4)
logger.info("New user message recieved. Breaking.") # TODO: is triggering seemingly randomly
break #logger.info("New user message recieved. Breaking.")
#break
# Also check if there's any new computer messages # Also check if there's any new computer messages
if not from_computer.empty(): if not from_computer.empty():

@ -25,6 +25,8 @@ def convert_mime_type_to_format(mime_type: str) -> str:
return "wav" return "wav"
if mime_type == "audio/webm": if mime_type == "audio/webm":
return "webm" return "webm"
if mime_type == "audio/raw":
return "dat"
return mime_type return mime_type
@ -43,6 +45,15 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
# Export to wav # Export to wav
output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav") output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
print(mime_type, input_path, output_path)
if mime_type == "audio/raw":
ffmpeg.input(
input_path,
f='s16le',
ar='16000',
ac=1,
).output(output_path).run()
else:
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run() ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
try: try:

@ -6,6 +6,7 @@ from pydub import AudioSegment
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv() # take environment variables from .env. load_dotenv() # take environment variables from .env.
import ffmpeg
import tempfile import tempfile
from openai import OpenAI from openai import OpenAI
import os import os
@ -28,11 +29,17 @@ def stream_tts(text):
input=text, input=text,
response_format="opus" response_format="opus"
) )
with tempfile.NamedTemporaryFile(suffix=".opus") as temp_file: with tempfile.NamedTemporaryFile(suffix=".opus", delete=False) as temp_file:
response.stream_to_file(temp_file.name) response.stream_to_file(temp_file.name)
audio_bytes = temp_file.read() # TODO: hack to format audio correctly for device
file_type = "bytes.opus" outfile = tempfile.gettempdir() + "/" + "raw.dat"
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run()
with open(outfile, "rb") as f:
audio_bytes = f.read()
file_type = "bytes.raw"
print(outfile, len(audio_bytes))
os.remove(outfile)
else: else:
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file: with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:

Loading…
Cancel
Save