diff --git a/01OS/01OS/clients/esp32/playback/playback.ino b/01OS/01OS/clients/esp32/playback/playback.ino index 7f2e2a3..d4cc29a 100644 --- a/01OS/01OS/clients/esp32/playback/playback.ino +++ b/01OS/01OS/clients/esp32/playback/playback.ino @@ -70,6 +70,7 @@ void hexdump(const void *mem, uint32_t len, uint8_t cols = 16) { } void InitI2SSpeakerOrMic(int mode) { + Serial.printf("InitI2sSpeakerOrMic %d\n", mode); esp_err_t err = ESP_OK; i2s_driver_uninstall(SPEAKER_I2S_NUMBER); @@ -136,17 +137,19 @@ void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) { break; case WStype_TEXT: Serial.printf("[WSc] get text: %s\n", payload); - if ((char)payload[0] == 's'){ - Serial.println("start"); - speaker_offset = 0; - InitI2SSpeakerOrMic(MODE_SPK); + { + std::string str(payload, payload + length); + bool isAudio = str.find("\"audio\"") != std::string::npos; + if (isAudio && str.find("\"start\"") != std::string::npos) { + Serial.println("start playback"); + speaker_offset = 0; + InitI2SSpeakerOrMic(MODE_SPK); + } else if (isAudio && str.find("\"end\"") != std::string::npos) { + Serial.println("end playback"); + // speaker_play(speakerdata0, speaker_offset); + // speaker_offset = 0; + } } - if ((char)payload[0] == 'e'){ - Serial.println("end"); - // speaker_play(speakerdata0, speaker_offset); - // speaker_offset = 0; - } - // send message to server // webSocket.sendTXT("message here"); break; @@ -180,12 +183,12 @@ void websocket_setup() { Serial.println("connecting to WiFi"); } Serial.println("connected to WiFi"); - webSocket.begin(COMPUTER_IP, 9001, "/"); + webSocket.begin(COMPUTER_IP, 8000, "/"); webSocket.onEvent(webSocketEvent); // webSocket.setAuthorization("user", "Password"); webSocket.setReconnectInterval(5000); - } + void setup() { M5.begin(true, false, true); M5.dis.drawpix(0, CRGB(128, 128, 0)); @@ -208,17 +211,19 @@ void loop() { button.loop(); if (button.justPressed()) { Serial.println("Recording..."); - webSocket.sendTXT("s"); + webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"start\": true}"); InitI2SSpeakerOrMic(MODE_MIC); recording = true; + data_offset = 0; + Serial.println("Recording ready."); } else if (button.justReleased()) { Serial.println("Stopped recording."); - webSocket.sendTXT("e"); + webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"end\": true}"); flush_microphone(); recording = false; - } - - if (recording) { + data_offset = 0; + } else if (recording) { + Serial.printf("Reading chunk at %d...\n", data_offset); size_t bytes_read; i2s_read( SPEAKER_I2S_NUMBER, @@ -226,13 +231,13 @@ void loop() { DATA_SIZE, &bytes_read, (100 / portTICK_RATE_MS) ); data_offset += bytes_read; + Serial.printf("Read %d bytes in chunk.\n", bytes_read); - if (data_offset > 1024*10) { + if (data_offset > 1024*9) { flush_microphone(); } } M5.update(); webSocket.loop(); - } diff --git a/01OS/01OS/server/i.py b/01OS/01OS/server/i.py index 3a68101..2ab2bb2 100644 --- a/01OS/01OS/server/i.py +++ b/01OS/01OS/server/i.py @@ -43,6 +43,18 @@ def configure_interpreter(interpreter: OpenInterpreter): Remember: You can run Python code. Be very concise. Ensure that you actually run code every time! THIS IS IMPORTANT. You NEED to write code. **Help the user by being very concise in your answers.** Do not break down tasks excessively, just into simple, few minute steps. Don't assume the user lives their life in a certain way— pick very general tasks if you're breaking a task down. + ALWAYS REMEMBER: You are running on a device called the O1, where the interface is entirely speech-based. Keep your responses succint in light of this! + IF YOU NEED TO THINK ABOUT A PROBLEM: (such as "Here's the plan:"), WRITE IT IN THE COMMENTS of the code block! + For example: + > User: What is 432/7? + > Assistant: Let me use Python to calculate that. + > Assistant Python function call: + > # Here's the plan: + > # 1. Divide the numbers + > # 2. Round it to 3 digits. + > print(round(432/7, 3)) + > Assistant: 432 / 7 is 61.714. + Use the following functions (assume they're imported) to complete your goals whenever possible: {{ import sys @@ -70,7 +82,9 @@ print(output) """.strip() - interpreter.custom_instructions = system_message + # interpreter.custom_instructions = system_message + interpreter.system_message = system_message + interpreter.llm.supports_functions = True ### LLM SETTINGS diff --git a/01OS/01OS/server/server.py b/01OS/01OS/server/server.py index 553ee27..d05ee6c 100644 --- a/01OS/01OS/server/server.py +++ b/01OS/01OS/server/server.py @@ -201,7 +201,7 @@ async def listener(): accumulated_text = "" - for chunk in interpreter.chat(messages, stream=True, display=False): + for chunk in interpreter.chat(messages, stream=True, display=True): logger.debug("Got chunk:", chunk) @@ -212,7 +212,7 @@ async def listener(): if os.getenv('TTS_RUNNER') == "server": # Speak full sentences out loud - if chunk["role"] == "assistant" and "content" in chunk: + if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message": accumulated_text += chunk["content"] sentences = split_into_sentences(accumulated_text) @@ -241,7 +241,7 @@ async def listener(): # Check if it's just an end flag. We ignore those. temp_message = await from_user.get() - if temp_message == {'role': 'user', 'type': 'message', 'end': True}: + if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"): # Yup. False alarm. continue else: @@ -251,8 +251,9 @@ async def listener(): with open(conversation_history_path, 'w') as file: json.dump(interpreter.messages, file, indent=4) - logger.info("New user message recieved. Breaking.") - break + # TODO: is triggering seemingly randomly + #logger.info("New user message recieved. Breaking.") + #break # Also check if there's any new computer messages if not from_computer.empty(): diff --git a/01OS/01OS/server/stt/stt.py b/01OS/01OS/server/stt/stt.py index 2a57c9b..588e9e4 100644 --- a/01OS/01OS/server/stt/stt.py +++ b/01OS/01OS/server/stt/stt.py @@ -25,6 +25,8 @@ def convert_mime_type_to_format(mime_type: str) -> str: return "wav" if mime_type == "audio/webm": return "webm" + if mime_type == "audio/raw": + return "dat" return mime_type @@ -43,7 +45,16 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str: # Export to wav output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav") - ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run() + print(mime_type, input_path, output_path) + if mime_type == "audio/raw": + ffmpeg.input( + input_path, + f='s16le', + ar='16000', + ac=1, + ).output(output_path).run() + else: + ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run() try: yield output_path diff --git a/01OS/01OS/server/tts/tts.py b/01OS/01OS/server/tts/tts.py index 6106966..ec16cc0 100644 --- a/01OS/01OS/server/tts/tts.py +++ b/01OS/01OS/server/tts/tts.py @@ -6,6 +6,7 @@ from pydub import AudioSegment from dotenv import load_dotenv load_dotenv() # take environment variables from .env. +import ffmpeg import tempfile from openai import OpenAI import os @@ -28,11 +29,17 @@ def stream_tts(text): input=text, response_format="opus" ) - with tempfile.NamedTemporaryFile(suffix=".opus") as temp_file: + with tempfile.NamedTemporaryFile(suffix=".opus", delete=False) as temp_file: response.stream_to_file(temp_file.name) - audio_bytes = temp_file.read() - file_type = "bytes.opus" + # TODO: hack to format audio correctly for device + outfile = tempfile.gettempdir() + "/" + "raw.dat" + ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run() + with open(outfile, "rb") as f: + audio_bytes = f.read() + file_type = "bytes.raw" + print(outfile, len(audio_bytes)) + os.remove(outfile) else: with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file: