Merge pull request #34 from tomchapin/feature/camera-snapshots

Feature/camera snapshots (WIP)
1 year ago · f2e51dd14f
parent d4629c017c 8fb4ed6b5e
commit f2e51dd14f
7 changed files with 135 additions and 11 deletions
--- a/.gitignore
+++ b/.gitignore
@ -167,3 +167,4 @@ cython_debug/

 # ignore the aifs index files
 _.aifs
+01OS/output_audio.wav
--- a/01OS/.env.example
+++ b/01OS/.env.example
@ -35,6 +35,17 @@ STT_RUNNER=client # If server, audio will be sent over websocket.
 # Will expose the server publicly and display that URL.
 SERVER_EXPOSE_PUBLICALLY=False

+# Image capture settings
+CAMERA_ENABLED=True
+
+# Camera device selection (Typically 0 for built-in, 1 for USB)
+CAMERA_DEVICE_INDEX=0
+
+# Camera warmup time
+# This is a workaround for some cameras that don't immediately
+# return a properly exposed picture when they are first turned on
+CAMERA_WARMUP_SECONDS=0.4
+
 # Debug level
 # LOG_LEVEL=DEBUG
 LOG_LEVEL="INFO"
--- a/01OS/01OS/clients/base_device.py
+++ b/01OS/01OS/clients/base_device.py
@ -1,6 +1,7 @@
 from dotenv import load_dotenv
 load_dotenv()  # take environment variables from .env.

+import os
 import asyncio
 import threading
 import os
@ -21,6 +22,8 @@ import time
 import wave
 import tempfile
 from datetime import datetime
+import cv2
+import base64
 from interpreter import interpreter # Just for code execution. Maybe we should let people do from interpreter.computer import run?
 # In the future, I guess kernel watching code should be elsewhere? Somewhere server / client agnostic?
 from ..server.utils.kernel import put_kernel_messages_into_queue
@ -44,6 +47,11 @@ RATE = 44100  # Sample rate
 RECORDING = False  # Flag to control recording state
 SPACEBAR_PRESSED = False  # Flag to track spacebar press state

+# Camera configuration
+# os.getenv() returns a string and bool("False") is True, so the flag is
+# compared against an explicit list of truthy spellings instead of using bool().
+CAMERA_ENABLED = str(os.getenv('CAMERA_ENABLED', 'False')).strip().lower() in ('1', 'true', 'yes', 'on')
+CAMERA_DEVICE_INDEX = int(os.getenv('CAMERA_DEVICE_INDEX', 0))
+CAMERA_WARMUP_SECONDS = float(os.getenv('CAMERA_WARMUP_SECONDS', 0))
+
 # Specify OS
 current_platform = get_system_info()

@ -54,8 +62,63 @@ send_queue = queue.Queue()

 class Device:
    def __init__(self):
+        # Keys currently held down; on_press adds to this set and on_release removes from it.
+        self.pressed_keys = set()
+        # Paths of captured camera images that have not been queued for sending yet.
+        self.captured_images = []
        self.audiosegments = []
-        pass
+
+    def fetch_image_from_camera(self, camera_index=CAMERA_DEVICE_INDEX):
+        """Capture one frame from a camera and save it as a PNG in the temp directory.
+
+        The path is added to ``self.captured_images`` only after the file has
+        actually been written, so the list never holds a path with no file behind it.
+
+        Args:
+            camera_index: OpenCV device index to open. Defaults to CAMERA_DEVICE_INDEX.
+
+        Returns:
+            The path of the saved image, or None if no frame could be captured
+            or the image could not be written.
+        """
+        cap = cv2.VideoCapture(camera_index)
+        try:
+            ret, frame = cap.read()  # Capture a single frame to initialize the camera
+
+            if CAMERA_WARMUP_SECONDS > 0:
+                # Allow camera to warm up, then snap a picture again
+                # This is a workaround for some cameras that don't return a properly exposed
+                # picture immediately when they are first turned on
+                time.sleep(CAMERA_WARMUP_SECONDS)
+                ret, frame = cap.read()
+
+            if not ret:
+                logger.error(f"Error: Couldn't capture an image from camera ({camera_index})")
+                return None
+
+            temp_dir = tempfile.gettempdir()
+            image_path = os.path.join(temp_dir, f"01_photo_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.png")
+
+            # cv2.imwrite reports failure through its return value; only record the
+            # path when the file really exists, otherwise queueing it later would fail.
+            if not cv2.imwrite(image_path, frame):
+                logger.error(f"Error: Couldn't save camera image to {image_path}")
+                return None
+
+            self.captured_images.append(image_path)
+            logger.info(f"Camera image captured to {image_path}")
+            logger.info(f"You now have {len(self.captured_images)} images which will be sent along with your next audio message.")
+            return image_path
+        finally:
+            # Always hand the device back, even if reading or writing raised
+            cap.release()
+    
+
+    def encode_image_to_base64(self, image_path):
+        """Read the file at image_path and return its bytes as a base64 text string."""
+        with open(image_path, "rb") as image_file:
+            raw_bytes = image_file.read()
+        encoded_bytes = base64.b64encode(raw_bytes)
+        return encoded_bytes.decode('utf-8')
+
+    def add_image_to_send_queue(self, image_path):
+        """Put an LMC image message for image_path on the send queue, then delete the file."""
+        encoded_content = self.encode_image_to_base64(image_path)
+        send_queue.put({
+            "role": "user",
+            "type": "image",
+            "format": "base64.png",
+            "content": encoded_content,
+        })
+        # The encoded copy is now in the queue, so the file on disk is no longer needed
+        os.remove(image_path)
+
+    def queue_all_captured_images(self):
+        """Queue every captured image for sending and empty the captured list.
+
+        An image that cannot be read or deleted is logged and skipped, so one
+        bad file neither blocks the remaining images nor leaves stale paths
+        behind for the next call.
+        """
+        # Snapshot and clear up front: the list is reset even if an image fails below
+        pending_images = list(self.captured_images)
+        self.captured_images.clear()
+
+        for image_path in pending_images:
+            try:
+                self.add_image_to_send_queue(image_path)
+            except OSError as e:
+                logger.error(f"Error: Couldn't queue image {image_path}: {e}")
+
        
    async def play_audiosegments(self):
        """Plays them sequentially."""
@ -112,6 +175,8 @@ class Device:
                send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "content": ""})
                send_queue.put({"role": "user", "type": "audio", "format": "bytes.wav", "end": True})
        else:
+            self.queue_all_captured_images()
+
            if os.getenv('STT_RUNNER') == "client":
                # Run stt then send text
                text = stt_wav(wav_path)
@ -142,17 +207,27 @@ class Device:
            RECORDING = False

    def on_press(self, key):
-        """Detect spacebar press."""
-        if key == keyboard.Key.space:
+        """Detect spacebar press, ESC key press, and Ctrl+C combination."""
+        self.pressed_keys.add(key)  # Add the pressed key to the set
+
+        # The checks below look at every key currently held, not only the one just pressed.
+        if keyboard.Key.esc in self.pressed_keys:
+            logger.info("Exiting...")
+            os._exit(0)
+        elif keyboard.Key.space in self.pressed_keys:
            self.toggle_recording(True)
+        # Because of the elif chain, this is only reached when neither ESC nor space is held.
+        elif {keyboard.Key.ctrl, keyboard.KeyCode.from_char('c')} <= self.pressed_keys:
+            logger.info("Ctrl+C pressed. Exiting...")
+            os._exit(0)

    def on_release(self, key):
-        """Detect spacebar release and ESC key press."""
+        """Detect spacebar release and 'c' key press for camera, and handle key release."""
+        self.pressed_keys.discard(key)  # Remove the released key from the key press tracking set
+
        if key == keyboard.Key.space:
            self.toggle_recording(False)
-        elif key == keyboard.Key.esc or (key == keyboard.Key.ctrl and keyboard.Key.c):
-            logger.info("Exiting...")
-            os._exit(0)
+        # The capture runs when 'c' is released, and only if CAMERA_ENABLED is set.
+        elif CAMERA_ENABLED and key == keyboard.KeyCode.from_char('c'):
+            self.fetch_image_from_camera()
+
    
    async def message_sender(self, websocket):
        while True:
@ -168,7 +243,11 @@ class Device:
        while True:
            try:
                async with websockets.connect(WS_URL) as websocket:
-                    logger.info("Press the spacebar to start/stop recording. Press ESC to exit.")
+                    if CAMERA_ENABLED:
+                        logger.info("Press the spacebar to start/stop recording. Press 'c' to capture an image from the camera. Press ESC to exit.")
+                    else:
+                        logger.info("Press the spacebar to start/stop recording. Press ESC to exit.")
+                        
                    asyncio.create_task(self.message_sender(websocket))

                    while True:
--- a/01OS/01OS/clients/start.sh
+++ b/01OS/01OS/clients/start.sh
@ -1,8 +1,8 @@
 DEVICE=$(uname -n)
 if [[ "$DEVICE" == "rpi" ]]; then
    cd 01OS
-    python -m 01OS.clients.rpi.device &
+    # No trailing & any more: the client runs in the foreground and this script waits for it.
+    python -m 01OS.clients.rpi.device
 else
    cd 01OS
-    python -m 01OS.clients.macos.device &
+    python -m 01OS.clients.macos.device
 fi
--- a/01OS/poetry.lock
+++ b/01OS/poetry.lock
@ -1890,6 +1890,31 @@ typing-extensions = ">=4.7,<5"
 [package.extras]
 datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]

+[[package]]
+name = "opencv-python"
+version = "4.9.0.80"
+description = "Wrapper package for OpenCV python bindings."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "opencv-python-4.9.0.80.tar.gz", hash = "sha256:1a9f0e6267de3a1a1db0c54213d022c7c8b5b9ca4b580e80bdc58516c922c9e1"},
+    {file = "opencv_python-4.9.0.80-cp37-abi3-macosx_10_16_x86_64.whl", hash = "sha256:7e5f7aa4486651a6ebfa8ed4b594b65bd2d2f41beeb4241a3e4b1b85acbbbadb"},
+    {file = "opencv_python-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:71dfb9555ccccdd77305fc3dcca5897fbf0cf28b297c51ee55e079c065d812a3"},
+    {file = "opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b34a52e9da36dda8c151c6394aed602e4b17fa041df0b9f5b93ae10b0fcca2a"},
+    {file = "opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4088cab82b66a3b37ffc452976b14a3c599269c247895ae9ceb4066d8188a57"},
+    {file = "opencv_python-4.9.0.80-cp37-abi3-win32.whl", hash = "sha256:dcf000c36dd1651118a2462257e3a9e76db789a78432e1f303c7bac54f63ef6c"},
+    {file = "opencv_python-4.9.0.80-cp37-abi3-win_amd64.whl", hash = "sha256:3f16f08e02b2a2da44259c7cc712e779eff1dd8b55fdb0323e8cab09548086c0"},
+]
+
+[package.dependencies]
+numpy = [
+    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
+    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
+    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
+]
+
 [[package]]
 name = "packaging"
 version = "23.2"
@ -3514,4 +3539,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.12"
-content-hash = "12ccff8a2521e7eb88eee82cfd3de409fea8e1658406d6148a42f9347ca7b2a7"
+content-hash = "5c8d587b405e97c0dca454078950157106f9aea687cbecce5b7ae7effd2aeece"
--- a/01OS/pyproject.toml
+++ b/01OS/pyproject.toml
@ -25,6 +25,7 @@ pydub = "^0.25.1"
 ngrok = "^1.0.0"
 open-interpreter = "^0.2.0"
 simpleaudio = "^1.0.4"
+opencv-python = "^4.9.0.80"

 [build-system]
 requires = ["poetry-core"]
--- a/01OS/start.sh
+++ b/01OS/start.sh
@ -1,5 +1,12 @@
 #!/usr/bin/env bash

+# Set python to prioritize the module files from the current directory
+# If we don't do this, then the python interpreter will not be able to find the modules,
+# and will throw an error like "ModuleNotFoundError: No module named '01OS'".
+# If we solve the problem by pip installing the official 01OS package, then those
+# modules will run instead of the local ones that we are trying to develop with.
+export PYTHONPATH="$(pwd):$PYTHONPATH"
+
 ### Import Environment Variables from .env
 SCRIPT_DIR="$(dirname "$0")"
 if [ ! -f "$SCRIPT_DIR/.env" ]; then