from __future__ import annotations import asyncio import base64 import io import threading from os import getenv from typing import Any, Awaitable, Callable, cast import numpy as np try: import pyaudio except ImportError: import subprocess subprocess.check_call(["pip", "install", "pyaudio"]) import pyaudio try: import sounddevice as sd except ImportError: import subprocess subprocess.check_call(["pip", "install", "sounddevice"]) import sounddevice as sd from loguru import logger from openai import AsyncOpenAI from openai.resources.beta.realtime.realtime import ( AsyncRealtimeConnection, ) from openai.types.beta.realtime.session import Session try: from pydub import AudioSegment except ImportError: import subprocess subprocess.check_call(["pip", "install", "pydub"]) from pydub import AudioSegment from dotenv import load_dotenv load_dotenv() CHUNK_LENGTH_S = 0.05 # 100ms SAMPLE_RATE = 24000 FORMAT = pyaudio.paInt16 CHANNELS = 1 # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false def audio_to_pcm16_base64(audio_bytes: bytes) -> bytes: # load the audio file from the byte stream audio = AudioSegment.from_file(io.BytesIO(audio_bytes)) print(f"Loaded audio: {audio.frame_rate=} {audio.channels=} {audio.sample_width=} {audio.frame_width=}") # resample to 24kHz mono pcm16 pcm_audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(CHANNELS).set_sample_width(2).raw_data return pcm_audio class AudioPlayerAsync: def __init__(self): self.queue = [] self.lock = threading.Lock() self.stream = sd.OutputStream( callback=self.callback, samplerate=SAMPLE_RATE, channels=CHANNELS, dtype=np.int16, blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE), ) self.playing = False self._frame_count = 0 def callback(self, outdata, frames, time, status): # noqa with self.lock: data = np.empty(0, dtype=np.int16) # get next item from queue if there is still space in the buffer while len(data) < frames and len(self.queue) > 0: item = self.queue.pop(0) frames_needed = frames - len(data) data = np.concatenate((data, item[:frames_needed])) if len(item) > frames_needed: self.queue.insert(0, item[frames_needed:]) self._frame_count += len(data) # fill the rest of the frames with zeros if there is no more data if len(data) < frames: data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16))) outdata[:] = data.reshape(-1, 1) def reset_frame_count(self): self._frame_count = 0 def get_frame_count(self): return self._frame_count def add_data(self, data: bytes): with self.lock: # bytes is pcm16 single channel audio data, convert to numpy array np_data = np.frombuffer(data, dtype=np.int16) self.queue.append(np_data) if not self.playing: self.start() def start(self): self.playing = True self.stream.start() def stop(self): self.playing = False self.stream.stop() with self.lock: self.queue = [] def terminate(self): self.stream.close() async def send_audio_worker_sounddevice( connection: AsyncRealtimeConnection, should_send: Callable[[], bool] | None = None, start_send: Callable[[], Awaitable[None]] | None = None, ): sent_audio = False device_info = sd.query_devices() print(device_info) read_size = int(SAMPLE_RATE * 0.02) stream = sd.InputStream( channels=CHANNELS, samplerate=SAMPLE_RATE, dtype="int16", ) stream.start() try: while True: if stream.read_available < read_size: await asyncio.sleep(0) continue data, _ = stream.read(read_size) if should_send() if should_send else True: if not sent_audio and start_send: await start_send() await connection.send( {"type": "input_audio_buffer.append", "audio": base64.b64encode(data).decode("utf-8")} ) sent_audio = True elif sent_audio: print("Done, triggering inference") await connection.send({"type": "input_audio_buffer.commit"}) await connection.send({"type": "response.create", "response": {}}) sent_audio = False await asyncio.sleep(0) except KeyboardInterrupt: pass finally: stream.stop() stream.close() class RealtimeApp: """ A console-based application to handle real-time audio recording and streaming, connecting to OpenAI's GPT-4 Realtime API. Features: - Streams microphone input to the GPT-4 Realtime API. - Logs transcription results. - Sends text prompts to the GPT-4 Realtime API. """ def __init__(self, system_prompt: str = None) -> None: self.connection: AsyncRealtimeConnection | None = None self.session: Session | None = None self.client = AsyncOpenAI(api_key=getenv("OPENAI_API_KEY")) self.audio_player = AudioPlayerAsync() self.last_audio_item_id: str | None = None self.should_send_audio = asyncio.Event() self.connected = asyncio.Event() self.system_prompt = system_prompt async def initialize_text_prompt(self, text: str) -> None: """Initialize and send a text prompt to the OpenAI Realtime API.""" try: async with self.client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as conn: self.connection = conn await conn.session.update(session={"modalities": ["text"]}) await conn.conversation.item.create( item={ "type": "message", "role": "system", "content": [{"type": "input_text", "text": text}], } ) await conn.response.create() async for event in conn: if event.type == "response.text.delta": print(event.delta, flush=True, end="") elif event.type == "response.text.done": print() elif event.type == "response.done": break except Exception as e: logger.exception(f"Error initializing text prompt: {e}") async def handle_realtime_connection(self) -> None: """Handle the connection to the OpenAI Realtime API.""" try: async with self.client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as conn: self.connection = conn self.connected.set() logger.info("Connected to OpenAI Realtime API.") await conn.session.update(session={"turn_detection": {"type": "server_vad"}}) acc_items: dict[str, Any] = {} async for event in conn: if event.type == "session.created": self.session = event.session assert event.session.id is not None logger.info(f"Session created with ID: {event.session.id}") continue if event.type == "session.updated": self.session = event.session logger.info("Session updated.") continue if event.type == "response.audio.delta": if event.item_id != self.last_audio_item_id: self.audio_player.reset_frame_count() self.last_audio_item_id = event.item_id bytes_data = base64.b64decode(event.delta) self.audio_player.add_data(bytes_data) continue if event.type == "response.audio_transcript.delta": try: text = acc_items[event.item_id] except KeyError: acc_items[event.item_id] = event.delta else: acc_items[event.item_id] = text + event.delta logger.debug(f"Transcription updated: {acc_items[event.item_id]}") continue if event.type == "response.text.delta": print(event.delta, flush=True, end="") continue if event.type == "response.text.done": print() continue if event.type == "response.done": break except Exception as e: logger.exception(f"Error in realtime connection handler: {e}") async def _get_connection(self) -> AsyncRealtimeConnection: """Wait for and return the realtime connection.""" await self.connected.wait() assert self.connection is not None return self.connection async def send_text_prompt(self, text: str) -> None: """Send a text prompt to the OpenAI Realtime API.""" try: connection = await self._get_connection() if not self.session: logger.error("Session is not initialized. Cannot send prompt.") return logger.info(f"Sending prompt to the model: {text}") await connection.conversation.item.create( item={ "type": "message", "role": "user", "content": [{"type": "input_text", "text": text}], } ) await connection.response.create() except Exception as e: logger.exception(f"Error sending text prompt: {e}") async def send_mic_audio(self) -> None: """Stream microphone audio to the OpenAI Realtime API.""" import sounddevice as sd # type: ignore sent_audio = False try: read_size = int(SAMPLE_RATE * 0.02) stream = sd.InputStream( channels=CHANNELS, samplerate=SAMPLE_RATE, dtype="int16" ) stream.start() while True: if stream.read_available < read_size: await asyncio.sleep(0) continue await self.should_send_audio.wait() data, _ = stream.read(read_size) connection = await self._get_connection() if not sent_audio: asyncio.create_task(connection.send({"type": "response.cancel"})) sent_audio = True await connection.input_audio_buffer.append(audio=base64.b64encode(cast(Any, data)).decode("utf-8")) await asyncio.sleep(0) except Exception as e: logger.exception(f"Error in microphone audio streaming: {e}") finally: stream.stop() stream.close() async def run(self) -> None: """Start the application tasks.""" logger.info("Starting application tasks.") await asyncio.gather( # self.initialize_text_prompt(self.system_prompt), self.handle_realtime_connection(), self.send_mic_audio() ) if __name__ == "__main__": logger.add("realtime_app.log", rotation="10 MB", retention="10 days", level="DEBUG") logger.info("Starting RealtimeApp.") app = RealtimeApp() asyncio.run(app.run())