From 968aa854c0ceffe995ed8755384dd2c61a138b23 Mon Sep 17 00:00:00 2001
From: Theia Vogel <theia@vgel.me>
Date: Sun, 18 Feb 2024 01:30:30 -0800
Subject: [PATCH 1/5] conform device protocol to LMC (w/ hacks)

---
 01OS/01OS/clients/esp32/playback/playback.ino | 43 +++++++++++--------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/01OS/01OS/clients/esp32/playback/playback.ino b/01OS/01OS/clients/esp32/playback/playback.ino
index 7f2e2a3..d4cc29a 100644
--- a/01OS/01OS/clients/esp32/playback/playback.ino
+++ b/01OS/01OS/clients/esp32/playback/playback.ino
@@ -70,6 +70,7 @@ void hexdump(const void *mem, uint32_t len, uint8_t cols = 16) {
 }
 
 void InitI2SSpeakerOrMic(int mode) {
+  Serial.printf("InitI2sSpeakerOrMic %d\n", mode);
   esp_err_t err = ESP_OK;
 
   i2s_driver_uninstall(SPEAKER_I2S_NUMBER);
@@ -136,17 +137,19 @@ void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) {
       break;
     case WStype_TEXT:
       Serial.printf("[WSc] get text: %s\n", payload);
-      if ((char)payload[0] == 's'){
-        Serial.println("start");
-        speaker_offset = 0;
-         InitI2SSpeakerOrMic(MODE_SPK);
+      {
+        std::string str(payload, payload + length);
+        bool isAudio = str.find("\"audio\"") != std::string::npos;
+        if (isAudio && str.find("\"start\"") != std::string::npos) {
+          Serial.println("start playback");
+          speaker_offset = 0;
+          InitI2SSpeakerOrMic(MODE_SPK);
+        } else if (isAudio && str.find("\"end\"") != std::string::npos) {
+          Serial.println("end playback");
+          // speaker_play(speakerdata0, speaker_offset);
+          // speaker_offset = 0;
+        }
       }
-      if ((char)payload[0] == 'e'){
-        Serial.println("end");
-        // speaker_play(speakerdata0, speaker_offset);
-        // speaker_offset = 0;
-      }
-
       // send message to server
       // webSocket.sendTXT("message here");
       break;
@@ -180,12 +183,12 @@ void websocket_setup() {
     Serial.println("connecting to WiFi");
   }
   Serial.println("connected to WiFi");
-  webSocket.begin(COMPUTER_IP, 9001, "/");
+  webSocket.begin(COMPUTER_IP, 8000, "/");
   webSocket.onEvent(webSocketEvent);
   //    webSocket.setAuthorization("user", "Password");
   webSocket.setReconnectInterval(5000);
-
 }
+
 void setup() {
   M5.begin(true, false, true);
   M5.dis.drawpix(0, CRGB(128, 128, 0));
@@ -208,17 +211,19 @@ void loop() {
   button.loop();
   if (button.justPressed()) {
     Serial.println("Recording...");
-    webSocket.sendTXT("s");
+    webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"start\": true}");
     InitI2SSpeakerOrMic(MODE_MIC);
     recording = true;
+    data_offset = 0;
+    Serial.println("Recording ready.");
   } else if (button.justReleased()) {
     Serial.println("Stopped recording.");
-    webSocket.sendTXT("e");
+    webSocket.sendTXT("{\"role\": \"user\", \"type\": \"audio\", \"format\": \"bytes.raw\", \"end\": true}");
     flush_microphone();
     recording = false;
-  }
-
-  if (recording) {
+    data_offset = 0;
+  } else if (recording) {
+    Serial.printf("Reading chunk at %d...\n", data_offset);
     size_t bytes_read;
     i2s_read(
       SPEAKER_I2S_NUMBER,
@@ -226,13 +231,13 @@ void loop() {
       DATA_SIZE, &bytes_read, (100 / portTICK_RATE_MS)
     );
     data_offset += bytes_read;
+    Serial.printf("Read %d bytes in chunk.\n", bytes_read);
 
-    if (data_offset > 1024*10) {
+    if (data_offset > 1024*9) {
       flush_microphone();
     }
   }
 
   M5.update();
   webSocket.loop();
-
 }

From c031637fff230135dfaf1f088c721fa9ae7cd064 Mon Sep 17 00:00:00 2001
From: Theia Vogel <theia@vgel.me>
Date: Sun, 18 Feb 2024 01:30:46 -0800
Subject: [PATCH 2/5] system msg / function calling tweaks

---
 01OS/01OS/server/i.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/01OS/01OS/server/i.py b/01OS/01OS/server/i.py
index 3a68101..2ab2bb2 100644
--- a/01OS/01OS/server/i.py
+++ b/01OS/01OS/server/i.py
@@ -43,6 +43,18 @@ def configure_interpreter(interpreter: OpenInterpreter):
 
     Remember: You can run Python code. Be very concise. Ensure that you actually run code every time! THIS IS IMPORTANT. You NEED to write code. **Help the user by being very concise in your answers.** Do not break down tasks excessively, just into simple, few minute steps. Don't assume the user lives their life in a certain way— pick very general tasks if you're breaking a task down.
 
+    ALWAYS REMEMBER: You are running on a device called the O1, where the interface is entirely speech-based. Keep your responses succinct in light of this!
+    IF YOU NEED TO THINK ABOUT A PROBLEM: (such as "Here's the plan:"), WRITE IT IN THE COMMENTS of the code block!
+    For example:
+    > User: What is 432/7?
+    > Assistant: Let me use Python to calculate that.
+    > Assistant Python function call:
+    >   # Here's the plan:
+    >   # 1. Divide the numbers
+    >   # 2. Round it to 3 digits.
+    >   print(round(432/7, 3))
+    > Assistant: 432 / 7 is 61.714.
+
     Use the following functions (assume they're imported) to complete your goals whenever possible:
     {{
 import sys
@@ -70,7 +82,9 @@ print(output)
 
     """.strip()
 
-    interpreter.custom_instructions = system_message
+    # interpreter.custom_instructions = system_message
+    interpreter.system_message = system_message
+    interpreter.llm.supports_functions = True
 
     ### LLM SETTINGS
 

From c4fa3db52bb649278380691a6d6aa4dd1b7e3c2e Mon Sep 17 00:00:00 2001
From: Theia Vogel <theia@vgel.me>
Date: Sun, 18 Feb 2024 01:30:56 -0800
Subject: [PATCH 3/5] bug whacking

---
 01OS/01OS/server/server.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/01OS/01OS/server/server.py b/01OS/01OS/server/server.py
index 553ee27..d05ee6c 100644
--- a/01OS/01OS/server/server.py
+++ b/01OS/01OS/server/server.py
@@ -201,7 +201,7 @@ async def listener():
 
         accumulated_text = ""
         
-        for chunk in interpreter.chat(messages, stream=True, display=False):
+        for chunk in interpreter.chat(messages, stream=True, display=True):
 
             logger.debug("Got chunk:", chunk)
 
@@ -212,7 +212,7 @@ async def listener():
             
             if os.getenv('TTS_RUNNER') == "server":
                 # Speak full sentences out loud
-                if chunk["role"] == "assistant" and "content" in chunk:
+                if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
                     accumulated_text += chunk["content"]
                     sentences = split_into_sentences(accumulated_text)
                     
@@ -241,7 +241,7 @@ async def listener():
                 # Check if it's just an end flag. We ignore those.
                 temp_message = await from_user.get()
                 
-                if temp_message == {'role': 'user', 'type': 'message', 'end': True}:
+                if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
                     # Yup. False alarm.
                     continue
                 else:
@@ -251,8 +251,9 @@ async def listener():
                 with open(conversation_history_path, 'w') as file:
                     json.dump(interpreter.messages, file, indent=4)
 
-                logger.info("New user message recieved. Breaking.")
-                break
+                # TODO: this new-user-message break is triggering seemingly at random, so it is disabled for now.
+                #logger.info("New user message recieved. Breaking.")
+                #break
 
             # Also check if there's any new computer messages
             if not from_computer.empty():

From 801a049b77bdb2d983acf9a8b2c56be866168d09 Mon Sep 17 00:00:00 2001
From: Theia Vogel <theia@vgel.me>
Date: Sun, 18 Feb 2024 01:31:07 -0800
Subject: [PATCH 4/5] support audio/raw in stt

---
 01OS/01OS/server/stt/stt.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/01OS/01OS/server/stt/stt.py b/01OS/01OS/server/stt/stt.py
index 2a57c9b..588e9e4 100644
--- a/01OS/01OS/server/stt/stt.py
+++ b/01OS/01OS/server/stt/stt.py
@@ -25,6 +25,8 @@ def convert_mime_type_to_format(mime_type: str) -> str:
         return "wav"
     if mime_type == "audio/webm":
         return "webm"
+    if mime_type == "audio/raw":
+        return "dat"
 
     return mime_type
 
@@ -43,7 +45,16 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
 
     # Export to wav
     output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
-    ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
+    print(mime_type, input_path, output_path)
+    if mime_type == "audio/raw":
+        ffmpeg.input(
+            input_path,
+            f='s16le',
+            ar='16000',
+            ac=1,
+        ).output(output_path).run()
+    else:
+        ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
 
     try:
         yield output_path

From 5348f569b0ca18d4bfc790d1030d6d82caee7ca9 Mon Sep 17 00:00:00 2001
From: Theia Vogel <theia@vgel.me>
Date: Sun, 18 Feb 2024 01:31:16 -0800
Subject: [PATCH 5/5] quick hack to support audio/raw in tts

---
 01OS/01OS/server/tts/tts.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/01OS/01OS/server/tts/tts.py b/01OS/01OS/server/tts/tts.py
index 6106966..ec16cc0 100644
--- a/01OS/01OS/server/tts/tts.py
+++ b/01OS/01OS/server/tts/tts.py
@@ -6,6 +6,7 @@ from pydub import AudioSegment
 from dotenv import load_dotenv
 load_dotenv()  # take environment variables from .env.
 
+import ffmpeg
 import tempfile
 from openai import OpenAI
 import os
@@ -28,11 +29,17 @@ def stream_tts(text):
             input=text,
             response_format="opus"
         )
-        with tempfile.NamedTemporaryFile(suffix=".opus") as temp_file:
+        with tempfile.NamedTemporaryFile(suffix=".opus", delete=False) as temp_file:
             response.stream_to_file(temp_file.name)
 
-            audio_bytes = temp_file.read()
-            file_type = "bytes.opus"
+            # TODO: hack to format audio correctly for device
+            outfile = tempfile.gettempdir() + "/" + "raw.dat"
+            ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run()
+            with open(outfile, "rb") as f:
+                audio_bytes = f.read()
+            file_type = "bytes.raw"
+            print(outfile, len(audio_bytes))
+            os.remove(outfile)
 
     else:
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file: