diff --git a/.gitignore b/.gitignore index 5ce1b58..ff21f10 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,12 @@ ggml-*.bin -OS/01/local_tts/* # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class 01OS/01OS/server/conversations/user.json +01OS/01OS/server/tts/local_service/* +01OS/01OS/server/stt/local_service/* # C extensions *.so diff --git a/01OS/01OS/clients/base_device.py b/01OS/01OS/clients/base_device.py index dede881..f2197a4 100644 --- a/01OS/01OS/clients/base_device.py +++ b/01OS/01OS/clients/base_device.py @@ -133,7 +133,7 @@ class Device: """Detect spacebar release and ESC key press.""" if key == keyboard.Key.space: self.toggle_recording(False) - elif key == keyboard.Key.esc or key == keyboard.Key.ctrl_c: + elif key == keyboard.Key.esc or (key == keyboard.Key.ctrl and keyboard.Key.c): logger.info("Exiting...") os._exit(0) diff --git a/01OS/README.md b/01OS/README.md index ac79cfa..9b773c8 100644 --- a/01OS/README.md +++ b/01OS/README.md @@ -1,81 +1,33 @@ -# New: The 8th Architecture +The open-source language model computer. +```bash +pip install 01OS ``` -/01 - start.sh # entrypoint, runs server, device, llm - server.py # uses tts and stt if it must, exposes "/" - device.py # also uses tts and stt, hits "/" - llm.py # starts an openai-compatible server - model.llamafile - i.py # creates an interpreter which server just imports - tts.py - stt.py - /conversations - user.json - /skills # files in here will run in the 01's interpreter - schedule.py - ... -``` - -This is flatter and simpler. - -**Device** handles the device — i.e. everything the user interacts + watching the kernel + running code (which produces `computer` LMC messages) if `DEVICE_EXECUTE_CODE` is true. Runs TTS and STT, sends LMC messages to "/". - -**Server** serves "/", a websocket that accepts `user` LMC messages and sends back `assistant` LMC messages. Runs code (which produces `computer` LMC messages) if `SERVER_EXECUTE_CODE` is true. 
- -**Llm** starts an OpenAI-compatible server with `model.llamafile`. Downloads a heavily quantized Phi-2 if `model.llamafile` doesn't exist. - -**I** creates an `interpreter` object. This is where you configure the 01's behavior. - -# What is this? - -This is the operating system that powers the 01. - -# No, I mean what's this folder? - -It's the `diff` between 01OS and Ubuntu. - -01OS should be a customized version of Linux. Ubuntu is popular, stable, runs on lots of different hardware. **(open question: Should this be Xubuntu, which is lighter? or something else?)** - -We want to _build on_ Ubuntu by customizing the stable branch programatically, not by forking it — which would mean we'd have to maintain the underlying OS, merge in security patches, etc. Yuck. -This folder contains everything we want to change from the base Ubuntu. A folder here represents a folder added/modified at the `root`. You can think of it like the `diff` between 01OS and Ubuntu. - -I imagine we'll use something like Cubic to then press this + Ubuntu into an ISO image. - -# Setup & Usage - -Clone this repo, then run `OS/01/start.sh`. - -# Structure - -### `start.sh` - -The start script's job is to start the `core` and the `app` (in full-screen mode). - -### `/core` - -The `core`'s job is to: +```bash +01 # This will run a server + attempt to determine and run a client. +# (Behavior can be modified by changing the contents of `.env`) +``` -1. Set up the language model -2. Set up the interpreter -3. Serve the interpreter at "/" +**Expose an 01 server publicly:** -### `/app` +```bash +01 --server --expose # This will print a URL that a client can point to. +``` -The `app`'s job is to be the interface between the user and the interpreter (text in). This could be text only, audio, video, who knows, but it becomes LMC messages or plain text. +**Run a specific client:** -For the first version, I think we should just handle audio in/out. 
So the `app`'s job here is to: +```bash +01 --client macos # Options: macos, rpi +``` -1. Be a fullscreen app for the user to use 01 -2. Turn the user's speech into text and send it to "/" -3. Turn the interpreter's text into speech and play it for the user +**Run locally:** -### Changes to Linux +The current default uses OpenAI's services. -We need to make the following changes: +The `--local` flag will install and run the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) STT and [Piper](https://github.com/rhasspy/piper) TTS models. -1. Modify the bootloader to just show white circle on black -2. Auto start the start script, `start.sh` -3. Put detectors everywhere, which will put [LMC Messages](https://docs.openinterpreter.com/protocols/lmc-messages) from the computer into `/01/core/queue`. Michael suggested we simply watch and filter the `dmesg` stream (I think that's what it's called?), so I suppose we could have a script like `/01/core/kernel_watcher.py` that puts things into the queue? Honestly knowing we could get it all from one place like that— maybe this should be simpler. Is the queue necessary? How about we just expect the computer to send computer messages to the websocket at `/`? Then yeah, maybe we do have redis there, then instead of looking at that folder, we check the redis queue... -4. (open question: should we do this? do we want the first 01 to be ready for GUI control?) Make the display that's shown to the user (and filled with the `app`) the _secondary_ display. The primary display will be a normal Ubuntu desktop, invisible to the user. Why? So the interpreter can control the primary display "under the hood". 
+```bash +01 --local # Local client and server +01 --local --server --expose # Expose a local server +``` diff --git a/01OS/output_audio.wav b/01OS/output_audio.wav new file mode 100644 index 0000000..44ec5e6 Binary files /dev/null and b/01OS/output_audio.wav differ diff --git a/01OS/pyproject.toml b/01OS/pyproject.toml index 2f3e1d0..c36dcc3 100644 --- a/01OS/pyproject.toml +++ b/01OS/pyproject.toml @@ -4,7 +4,7 @@ packages = [ {include = "01OS"}, ] include = [".env.example", "start.py", "start.sh"] -version = "0.0.1" +version = "0.0.2" description = "The open-source language model computer" authors = ["Killian "] license = "AGPL"