parent 33e7f69450
commit 3a1a614d7b
@@ -0,0 +1,74 @@

# Build stage
FROM python:3.11-slim AS builder

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements from api folder
COPY api/requirements.txt .
RUN pip install --no-cache-dir wheel && \
    pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt

# Final stage
FROM python:3.11-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PATH="/app/venv/bin:$PATH" \
    PYTHONPATH=/app \
    PORT=8080

# Create app user
RUN useradd -m -s /bin/bash app && \
    mkdir -p /app/logs && \
    chown -R app:app /app

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy wheels from builder
COPY --from=builder /app/wheels /app/wheels

# Create and activate virtual environment
RUN python -m venv /app/venv && \
    /app/venv/bin/pip install --no-cache-dir /app/wheels/*

# Copy application code
COPY --chown=app:app ./api ./api

# Switch to app user
USER app

# Create directories for logs
RUN mkdir -p /app/logs

# Required environment variables
ENV SUPABASE_URL="" \
    SUPABASE_SERVICE_KEY="" \
    ENVIRONMENT="production" \
    LOG_LEVEL="info" \
    WORKERS=4 \
    MAX_REQUESTS_PER_MINUTE=60 \
    API_KEY_LENGTH=32

# Expose port
EXPOSE $PORT

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:$PORT/health || exit 1

# Start command
CMD ["sh", "-c", "uvicorn api.api:app --host 0.0.0.0 --port $PORT --workers $WORKERS --log-level $LOG_LEVEL"]
@@ -0,0 +1,353 @@

from __future__ import annotations

import asyncio
import base64
import io
import threading
from os import getenv
from typing import Any, Awaitable, Callable, cast

import numpy as np

try:
    import pyaudio
except ImportError:
    import subprocess

    subprocess.check_call(["pip", "install", "pyaudio"])
    import pyaudio

try:
    import sounddevice as sd
except ImportError:
    import subprocess

    subprocess.check_call(["pip", "install", "sounddevice"])
    import sounddevice as sd

from loguru import logger
from openai import AsyncOpenAI
from openai.resources.beta.realtime.realtime import (
    AsyncRealtimeConnection,
)
from openai.types.beta.realtime.session import Session

try:
    from pydub import AudioSegment
except ImportError:
    import subprocess

    subprocess.check_call(["pip", "install", "pydub"])
    from pydub import AudioSegment

from dotenv import load_dotenv

load_dotenv()


CHUNK_LENGTH_S = 0.05  # 50 ms per playback block at 24 kHz
SAMPLE_RATE = 24000
FORMAT = pyaudio.paInt16
CHANNELS = 1

# pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false


def audio_to_pcm16_base64(audio_bytes: bytes) -> bytes:
    # load the audio file from the byte stream
    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
    print(f"Loaded audio: {audio.frame_rate=} {audio.channels=} {audio.sample_width=} {audio.frame_width=}")
    # resample to 24kHz mono pcm16
    pcm_audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(CHANNELS).set_sample_width(2).raw_data
    return pcm_audio


class AudioPlayerAsync:
    def __init__(self):
        self.queue = []
        self.lock = threading.Lock()
        self.stream = sd.OutputStream(
            callback=self.callback,
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype=np.int16,
            blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE),
        )
        self.playing = False
        self._frame_count = 0

    def callback(self, outdata, frames, time, status):  # noqa
        with self.lock:
            data = np.empty(0, dtype=np.int16)

            # get next item from queue if there is still space in the buffer
            while len(data) < frames and len(self.queue) > 0:
                item = self.queue.pop(0)
                frames_needed = frames - len(data)
                data = np.concatenate((data, item[:frames_needed]))
                if len(item) > frames_needed:
                    self.queue.insert(0, item[frames_needed:])

            self._frame_count += len(data)

            # fill the rest of the frames with zeros if there is no more data
            if len(data) < frames:
                data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16)))

        outdata[:] = data.reshape(-1, 1)

    def reset_frame_count(self):
        self._frame_count = 0

    def get_frame_count(self):
        return self._frame_count

    def add_data(self, data: bytes):
        with self.lock:
            # bytes is pcm16 single channel audio data, convert to numpy array
            np_data = np.frombuffer(data, dtype=np.int16)
            self.queue.append(np_data)
            if not self.playing:
                self.start()

    def start(self):
        self.playing = True
        self.stream.start()

    def stop(self):
        self.playing = False
        self.stream.stop()
        with self.lock:
            self.queue = []

    def terminate(self):
        self.stream.close()


async def send_audio_worker_sounddevice(
    connection: AsyncRealtimeConnection,
    should_send: Callable[[], bool] | None = None,
    start_send: Callable[[], Awaitable[None]] | None = None,
):
    sent_audio = False

    device_info = sd.query_devices()
    print(device_info)

    read_size = int(SAMPLE_RATE * 0.02)

    stream = sd.InputStream(
        channels=CHANNELS,
        samplerate=SAMPLE_RATE,
        dtype="int16",
    )
    stream.start()

    try:
        while True:
            if stream.read_available < read_size:
                await asyncio.sleep(0)
                continue

            data, _ = stream.read(read_size)

            if should_send() if should_send else True:
                if not sent_audio and start_send:
                    await start_send()
                await connection.send(
                    {"type": "input_audio_buffer.append", "audio": base64.b64encode(data).decode("utf-8")}
                )
                sent_audio = True

            elif sent_audio:
                print("Done, triggering inference")
                await connection.send({"type": "input_audio_buffer.commit"})
                await connection.send({"type": "response.create", "response": {}})
                sent_audio = False

            await asyncio.sleep(0)

    except KeyboardInterrupt:
        pass
    finally:
        stream.stop()
        stream.close()


class RealtimeApp:
    """
    A console-based application to handle real-time audio recording and streaming,
    connecting to OpenAI's GPT-4 Realtime API.

    Features:
    - Streams microphone input to the GPT-4 Realtime API.
    - Logs transcription results.
    - Sends text prompts to the GPT-4 Realtime API.
    """

    def __init__(self, system_prompt: str | None = None) -> None:
        self.connection: AsyncRealtimeConnection | None = None
        self.session: Session | None = None
        self.client = AsyncOpenAI(api_key=getenv("OPENAI_API_KEY"))
        self.audio_player = AudioPlayerAsync()
        self.last_audio_item_id: str | None = None
        self.should_send_audio = asyncio.Event()
        self.connected = asyncio.Event()
        self.system_prompt = system_prompt

    async def initialize_text_prompt(self, text: str) -> None:
        """Initialize and send a text prompt to the OpenAI Realtime API."""
        try:
            async with self.client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as conn:
                self.connection = conn
                await conn.session.update(session={"modalities": ["text"]})

                await conn.conversation.item.create(
                    item={
                        "type": "message",
                        "role": "system",
                        "content": [{"type": "input_text", "text": text}],
                    }
                )
                await conn.response.create()

                async for event in conn:
                    if event.type == "response.text.delta":
                        print(event.delta, flush=True, end="")

                    elif event.type == "response.text.done":
                        print()

                    elif event.type == "response.done":
                        break
        except Exception as e:
            logger.exception(f"Error initializing text prompt: {e}")

    async def handle_realtime_connection(self) -> None:
        """Handle the connection to the OpenAI Realtime API."""
        try:
            async with self.client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as conn:
                self.connection = conn
                self.connected.set()
                logger.info("Connected to OpenAI Realtime API.")

                await conn.session.update(session={"turn_detection": {"type": "server_vad"}})

                acc_items: dict[str, Any] = {}

                async for event in conn:
                    if event.type == "session.created":
                        self.session = event.session
                        assert event.session.id is not None
                        logger.info(f"Session created with ID: {event.session.id}")
                        continue

                    if event.type == "session.updated":
                        self.session = event.session
                        logger.info("Session updated.")
                        continue

                    if event.type == "response.audio.delta":
                        if event.item_id != self.last_audio_item_id:
                            self.audio_player.reset_frame_count()
                            self.last_audio_item_id = event.item_id

                        bytes_data = base64.b64decode(event.delta)
                        self.audio_player.add_data(bytes_data)
                        continue

                    if event.type == "response.audio_transcript.delta":
                        try:
                            text = acc_items[event.item_id]
                        except KeyError:
                            acc_items[event.item_id] = event.delta
                        else:
                            acc_items[event.item_id] = text + event.delta

                        logger.debug(f"Transcription updated: {acc_items[event.item_id]}")
                        continue

                    if event.type == "response.text.delta":
                        print(event.delta, flush=True, end="")
                        continue

                    if event.type == "response.text.done":
                        print()
                        continue

                    if event.type == "response.done":
                        break
        except Exception as e:
            logger.exception(f"Error in realtime connection handler: {e}")

    async def _get_connection(self) -> AsyncRealtimeConnection:
        """Wait for and return the realtime connection."""
        await self.connected.wait()
        assert self.connection is not None
        return self.connection

    async def send_text_prompt(self, text: str) -> None:
        """Send a text prompt to the OpenAI Realtime API."""
        try:
            connection = await self._get_connection()
            if not self.session:
                logger.error("Session is not initialized. Cannot send prompt.")
                return

            logger.info(f"Sending prompt to the model: {text}")
            await connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [{"type": "input_text", "text": text}],
                }
            )
            await connection.response.create()
        except Exception as e:
            logger.exception(f"Error sending text prompt: {e}")

    async def send_mic_audio(self) -> None:
        """Stream microphone audio to the OpenAI Realtime API."""
        import sounddevice as sd  # type: ignore

        sent_audio = False

        # Open the input stream before the try block so the finally clause
        # never references an unbound name if stream setup fails.
        read_size = int(SAMPLE_RATE * 0.02)
        stream = sd.InputStream(channels=CHANNELS, samplerate=SAMPLE_RATE, dtype="int16")
        stream.start()

        try:
            while True:
                if stream.read_available < read_size:
                    await asyncio.sleep(0)
                    continue

                await self.should_send_audio.wait()

                data, _ = stream.read(read_size)

                connection = await self._get_connection()
                if not sent_audio:
                    asyncio.create_task(connection.send({"type": "response.cancel"}))
                    sent_audio = True

                await connection.input_audio_buffer.append(audio=base64.b64encode(cast(Any, data)).decode("utf-8"))
                await asyncio.sleep(0)
        except Exception as e:
            logger.exception(f"Error in microphone audio streaming: {e}")
        finally:
            stream.stop()
            stream.close()

    async def run(self) -> None:
        """Start the application tasks."""
        logger.info("Starting application tasks.")

        await asyncio.gather(
            # self.initialize_text_prompt(self.system_prompt),
            self.handle_realtime_connection(),
            self.send_mic_audio(),
        )


if __name__ == "__main__":
    logger.add("realtime_app.log", rotation="10 MB", retention="10 days", level="DEBUG")
    logger.info("Starting RealtimeApp.")
    app = RealtimeApp()
    asyncio.run(app.run())
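
Usage note: RealtimeApp accepts a system_prompt, but the matching initialize_text_prompt(...) task is commented out in run(), so the value is stored and never sent to a session. A minimal sketch of how a caller could pass one in, assuming the file above is importable as realtime_app (hypothetical module name) and OPENAI_API_KEY is set in the environment:

import asyncio

# Hypothetical module name for the file shown in the diff above.
from realtime_app import RealtimeApp

if __name__ == "__main__":
    # The prompt is stored on the instance; streaming it to a session still
    # requires enabling initialize_text_prompt() inside RealtimeApp.run().
    app = RealtimeApp(system_prompt="You are a concise voice assistant.")
    asyncio.run(app.run())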