commit b0fe4b51cd326f8d00c68d499071ace84cd45565 Author: Artem Darius Weber Date: Sat Nov 23 15:51:12 2024 +0300 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..184325c --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/chat +/models +/ollama diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fc9f18f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,79 @@ +FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu24.04 + +ENV DEBIAN_FRONTEND=noninteractive + +## +## User. +## + +RUN apt update && apt install -y sudo + +RUN groupadd -r user +RUN useradd -r -g user -m -s /bin/bash user +RUN usermod -aG sudo user + +RUN echo "user ALL = (ALL) NOPASSWD: ALL" >> /etc/sudoers + +USER user + +WORKDIR /home/user + +ENV USER=user + +## +## Time zone. +## + +ENV TZ=Europe/Moscow + +RUN sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +RUN echo $TZ | sudo tee /etc/timezone + +## +## RealTimeSTT. +## + +RUN sudo apt update && sudo apt install -y python3 +RUN sudo apt update && sudo apt install -y python3-pip +RUN sudo apt update && sudo apt install -y python3-venv +RUN sudo apt update && sudo apt install -y portaudio19-dev +RUN sudo apt update && sudo apt install -y ffmpeg + +RUN python3 -m venv venv + +RUN bash -c "source venv/bin/activate && pip install RealtimeSTT==0.3.7" +RUN bash -c "source venv/bin/activate && pip install torch==2.3.1+cu121 --index-url https://download.pytorch.org/whl/cu121" +RUN bash -c "source venv/bin/activate && pip install torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121" + +# Replace `localhost` with `0.0.0.0` in STT server. +RUN bash -c "source venv/bin/activate && \ + cd ~/venv/lib/python3.12/site-packages/RealtimeSTT_server && \ + find . -type f -exec sed -i.backup "s/localhost/0\.0\.0\.0/g" {} \;" + +## +## LLM. 
# ---- Preserved dump content: tail of Dockerfile + diff header for client.py ----
_DUMP_DOCKERFILE_TAIL = r"""
##

RUN bash -c "source venv/bin/activate && pip install llama-index==0.11.23"
RUN bash -c "source venv/bin/activate && pip install llama-index-llms-ollama==0.3.6"

##
## RealTimeTTS.
##

RUN sudo apt update && sudo apt install -y espeak # System TTS for TTS server.
RUN sudo apt update && sudo apt install -y git

RUN bash -c "source venv/bin/activate && pip install 'RealTimeTTS[all]==0.4.10'"
RUN bash -c "source venv/bin/activate && pip install fastapi==0.115.5" # For TTS server.
RUN bash -c "source venv/bin/activate && pip install uvicorn==0.32.0" # For TTS server.

RUN git clone https://github.com/KoljaB/RealtimeTTS && \
    cd RealtimeTTS && \
    git reset --hard b2fab8b57717d2a14501923e9cf2b5589944b9ca

# Replace.
RUN bash -c "source venv/bin/activate && \
    cd RealtimeTTS/example_fast_api && \
    sed -i.backup \"s/START_ENGINE = SUPPORTED_ENGINES\[0\]/START_ENGINE = 'coqui'/g\" server.py"

diff --git a/client.py b/client.py
new file mode 100644
index 0000000..97ca752
--- /dev/null
+++ b/client.py
@@ -0,0 +1,87 @@
"""

"""Thin client: streams microphone audio to the STT data server and plays
back TTS audio received from the audio server."""
import threading
import socket
import pyaudio
import time

# Server settings
SERVER_IP = '81.94.159.212'  # Replace with your server's IP address
DATA_SERVER_PORT = 8012
AUDIO_SERVER_PORT = 65432

# Audio settings
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000  # Should match the server's expected sample rate
CHUNK = 1024

audio = pyaudio.PyAudio()


def record_and_send_audio():
    """Capture microphone audio and stream it to the STT data server.

    Loops forever: on any error the socket and stream are torn down, we
    wait one second and reconnect.
    """
    while True:
        # BUG FIX: the original tested `'stream' in locals()` in `finally`,
        # which stays true on later iterations even after the stream was
        # closed, so a connect failure after the first cycle re-closed an
        # already-closed stream. Reset both handles to None each iteration.
        client_socket = None
        stream = None
        try:
            # Connect to the server to send audio data
            client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            client_socket.connect((SERVER_IP, DATA_SERVER_PORT))
            print("Connected to data server")

            # Initialize PyAudio for recording
            stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                                input=True, frames_per_buffer=CHUNK)

            while True:
                # Read audio data from the microphone and forward it raw.
                data = stream.read(CHUNK)
                client_socket.sendall(data)
        except Exception as e:
            print(f"Error sending audio: {e}")
            time.sleep(1)  # Wait before retrying
        finally:
            # Clean up resources before the next reconnect attempt.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            if client_socket is not None:
                client_socket.close()


def receive_and_play_audio():
    """Receive TTS audio chunks from the server and play them locally.

    Loops forever with the same teardown/reconnect policy as
    record_and_send_audio().
    """
    while True:
        client_socket = None
        stream = None
        try:
            # Connect to the server to receive audio data
            client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            client_socket.connect((SERVER_IP, AUDIO_SERVER_PORT))
            print("Connected to audio server")

            # Initialize PyAudio for playback
            TTS_SAMPLE_RATE = 24000  # Should match the TTS sample rate used on the server
            stream = audio.open(format=FORMAT, channels=CHANNELS,
                                rate=TTS_SAMPLE_RATE, output=True)

            while True:
                # Receive audio data from the server
                data = client_socket.recv(CHUNK)
                if not data:
                    raise ConnectionError("Audio server disconnected")
                # Play the audio data
                stream.write(data)
        except Exception as e:
            print(f"Error receiving audio: {e}")
            time.sleep(1)  # Wait before retrying
        finally:
            # Clean up resources before the next reconnect attempt.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            if client_socket is not None:
                client_socket.close()


def main():
    """Run playback in a background thread; the sender owns the main thread."""
    # Start the thread to receive and play audio.
    audio_thread = threading.Thread(target=receive_and_play_audio, daemon=True)
    audio_thread.start()

    # BUG FIX: record_and_send_audio() loops and reconnects internally and
    # never returns, so the original outer `while True` with its
    # "Reconnecting..." print was unreachable dead code. Call it once.
    record_and_send_audio()


if __name__ == '__main__':
    main()

# ---- Preserved dump content: diff header + head of docker-compose.yml ----
_DUMP_COMPOSE_HEAD = r"""
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..f81302d
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,62 @@
services:
  ai:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "8012:8012" # STT server data.
      - "65432:65432" # TTS client server.
"""
+ volumes: + - .:/app + - ./models:/home/user/models + - ./chat:/home/user/chat + depends_on: + - ollama + command: ["bash", "-c", " + sudo chown user:user -R /home/user/models && \ + sudo chown user:user -R /home/user/chat && \ + source venv/bin/activate && \ + python /app/server.py \ + "] + stdin_open: true + tty: true + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ollama: + volumes: + - ./ollama/ollama:/root/.ollama + image: ollama/ollama:latest + ports: + - 7869:11434 + environment: + - OLLAMA_KEEP_ALIVE=24h + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ollama-webui: + image: ghcr.io/open-webui/open-webui:main + volumes: + - ./ollama/ollama-webui:/app/backend/data + depends_on: + - ollama + ports: + - 8080:8080 + environment: + - OLLAMA_BASE_URLS=http://host.docker.internal:7869 + - ENV=dev + - WEBUI_AUTH=False + - WEBUI_NAME=WebUI + - WEBUI_URL=http://localhost:8080 + - WEBUI_SECRET_KEY=t0p-s3cr3t + extra_hosts: + - host.docker.internal:host-gateway + diff --git a/old/LLM.dockerfile b/old/LLM.dockerfile new file mode 100644 index 0000000..4467be3 --- /dev/null +++ b/old/LLM.dockerfile @@ -0,0 +1,47 @@ +FROM ubuntu:24.04 + +ENV DEBIAN_FRONTEND=noninteractive + +## +## User. +## + +RUN apt update && apt install -y sudo + +RUN groupadd -r user +RUN useradd -r -g user -m -s /bin/bash user +RUN usermod -aG sudo user + +RUN echo "user ALL = (ALL) NOPASSWD: ALL" >> /etc/sudoers + +USER user + +WORKDIR /home/user + +ENV USER=user + +## +## Time zone. +## + +ENV TZ=Europe/Moscow + +RUN sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +RUN echo $TZ | sudo tee /etc/timezone + +## +## ... 
+## + +RUN sudo apt update && sudo apt install -y python3 +RUN sudo apt update && sudo apt install -y python3-pip +RUN sudo apt update && sudo apt install -y python3-venv +RUN sudo apt update && sudo apt install -y portaudio19-dev + +RUN python3 -m venv venv + +RUN bash -c "source venv/bin/activate && pip install llama-index==0.11.23" +RUN bash -c "source venv/bin/activate && pip install llama-index-llms-ollama==0.3.6" +RUN bash -c "source venv/bin/activate && pip install websocket-client==1.8.0" +RUN bash -c "source venv/bin/activate && pip install websockets==14.1" + diff --git a/old/STT-LLM-TTS.dockerfile b/old/STT-LLM-TTS.dockerfile new file mode 100644 index 0000000..7f78e94 --- /dev/null +++ b/old/STT-LLM-TTS.dockerfile @@ -0,0 +1,79 @@ +FROM ubuntu:24.04 + +ENV DEBIAN_FRONTEND=noninteractive + +## +## User. +## + +RUN apt update && apt install -y sudo + +RUN groupadd -r user +RUN useradd -r -g user -m -s /bin/bash user +RUN usermod -aG sudo user + +RUN echo "user ALL = (ALL) NOPASSWD: ALL" >> /etc/sudoers + +USER user + +WORKDIR /home/user + +ENV USER=user + +## +## Time zone. +## + +ENV TZ=Europe/Moscow + +RUN sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +RUN echo $TZ | sudo tee /etc/timezone + +## +## RealTimeSTT. +## + +RUN sudo apt update && sudo apt install -y python3 +RUN sudo apt update && sudo apt install -y python3-pip +RUN sudo apt update && sudo apt install -y python3-venv +RUN sudo apt update && sudo apt install -y portaudio19-dev +RUN sudo apt update && sudo apt install -y ffmpeg + +RUN python3 -m venv venv + +RUN bash -c "source venv/bin/activate && pip install RealtimeSTT==0.3.7" +RUN bash -c "source venv/bin/activate && pip install torch==2.3.1+cu121 --index-url https://download.pytorch.org/whl/cu121" +RUN bash -c "source venv/bin/activate && pip install torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121" + +# Replace `localhost` with `0.0.0.0` in STT server. 
+RUN bash -c "source venv/bin/activate && \ + cd ~/venv/lib/python3.12/site-packages/RealtimeSTT_server && \ + find . -type f -exec sed -i.backup "s/localhost/0\.0\.0\.0/g" {} \;" + +## +## LLM. +## + +RUN bash -c "source venv/bin/activate && pip install llama-index==0.11.23" +RUN bash -c "source venv/bin/activate && pip install llama-index-llms-ollama==0.3.6" + +## +## RealTimeTTS. +## + +RUN sudo apt update && sudo apt install -y espeak # System TTS for TTS server. +RUN sudo apt update && sudo apt install -y git + +RUN bash -c "source venv/bin/activate && pip install 'RealTimeTTS[all]==0.4.10'" +RUN bash -c "source venv/bin/activate && pip install fastapi==0.115.5" # For TTS server. +RUN bash -c "source venv/bin/activate && pip install uvicorn==0.32.0" # For TTS server. + +RUN git clone https://github.com/KoljaB/RealtimeTTS && \ + cd RealtimeTTS && \ + git reset --hard b2fab8b57717d2a14501923e9cf2b5589944b9ca + +# Replace. +RUN bash -c "source venv/bin/activate && \ + cd RealtimeTTS/example_fast_api && \ + sed -i.backup \"s/START_ENGINE = SUPPORTED_ENGINES\[0\]/START_ENGINE = 'coqui'/g\" server.py" + diff --git a/old/STT.dockerfile b/old/STT.dockerfile new file mode 100644 index 0000000..44db8d7 --- /dev/null +++ b/old/STT.dockerfile @@ -0,0 +1,56 @@ +FROM ubuntu:24.04 + +ENV DEBIAN_FRONTEND=noninteractive + +## +## User. +## + +RUN apt update && apt install -y sudo + +RUN groupadd -r user +RUN useradd -r -g user -m -s /bin/bash user +RUN usermod -aG sudo user + +RUN echo "user ALL = (ALL) NOPASSWD: ALL" >> /etc/sudoers + +USER user + +WORKDIR /home/user + +ENV USER=user + +## +## Time zone. +## + +ENV TZ=Europe/Moscow + +RUN sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +RUN echo $TZ | sudo tee /etc/timezone + +## +## RealTimeSTT. 
+## + +RUN sudo apt update && sudo apt install -y python3 +RUN sudo apt update && sudo apt install -y python3-pip +RUN sudo apt update && sudo apt install -y python3-venv +RUN sudo apt update && sudo apt install -y portaudio19-dev +RUN sudo apt update && sudo apt install -y ffmpeg + +RUN python3 -m venv venv + +RUN bash -c "source venv/bin/activate && pip install RealtimeSTT==0.3.7" +RUN bash -c "source venv/bin/activate && pip install torch==2.3.1+cu121 --index-url https://download.pytorch.org/whl/cu121" +RUN bash -c "source venv/bin/activate && pip install torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121" + +## +## Replace `localhost` with `0.0.0.0` in STT server. +## + +RUN bash -c "source venv/bin/activate && \ + cd ~/venv/lib/python3.12/site-packages/RealtimeSTT_server && \ + find . -type f -exec sed -i.backup "s/localhost/0\.0\.0\.0/g" {} \;" + + diff --git a/old/TTS.dockerfile b/old/TTS.dockerfile new file mode 100644 index 0000000..9f23a20 --- /dev/null +++ b/old/TTS.dockerfile @@ -0,0 +1,63 @@ +FROM ubuntu:24.04 + +ENV DEBIAN_FRONTEND=noninteractive + +## +## User. +## + +RUN apt update && apt install -y sudo + +RUN groupadd -r user +RUN useradd -r -g user -m -s /bin/bash user +RUN usermod -aG sudo user + +RUN echo "user ALL = (ALL) NOPASSWD: ALL" >> /etc/sudoers + +USER user + +WORKDIR /home/user + +ENV USER=user + +## +## Time zone. +## + +ENV TZ=Europe/Moscow + +RUN sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +RUN echo $TZ | sudo tee /etc/timezone + +## +## RealTimeTTS. +## + +RUN sudo apt update && sudo apt install -y python3 +RUN sudo apt update && sudo apt install -y python3-pip +RUN sudo apt update && sudo apt install -y python3-venv +RUN sudo apt update && sudo apt install -y portaudio19-dev +RUN sudo apt update && sudo apt install -y ffmpeg +RUN sudo apt update && sudo apt install -y espeak # System TTS for TTS server. 
+RUN sudo apt update && sudo apt install -y git + +RUN python3 -m venv venv + +RUN bash -c "source venv/bin/activate && pip install 'RealTimeTTS[all]==0.4.10'" +RUN bash -c "source venv/bin/activate && pip install torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121" +RUN bash -c "source venv/bin/activate && pip install torch==2.3.1+cu121 --index-url https://download.pytorch.org/whl/cu121" +RUN bash -c "source venv/bin/activate && pip install fastapi==0.115.5" # For TTS server. +RUN bash -c "source venv/bin/activate && pip install uvicorn==0.32.0" # For TTS server. + +RUN git clone --depth 1 https://github.com/KoljaB/RealtimeTTS && \ + cd RealtimeTTS && \ + git reset --hard b2fab8b57717d2a14501923e9cf2b5589944b9ca + +## +## Replaces. +## + +RUN bash -c "source venv/bin/activate && \ + cd RealtimeTTS/example_fast_api && \ + sed -i.backup \"s/START_ENGINE = SUPPORTED_ENGINES\[0\]/START_ENGINE = 'coqui'/g\" server.py" + diff --git a/old/docker-compose.yml b/old/docker-compose.yml new file mode 100644 index 0000000..a5ed898 --- /dev/null +++ b/old/docker-compose.yml @@ -0,0 +1,85 @@ +services: + tts: + build: + context: . + dockerfile: TTS.dockerfile + ports: + - "8000:8000" # TTS server. + command: ["bash", "-c", "source venv/bin/activate && cd RealtimeTTS/example_fast_api && python server.py"] + env_file: .env + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + stt: + build: + context: . + dockerfile: STT.dockerfile + ports: + - "8011:8011" # STT server control. + - "8012:8012" # STT server data. + command: ["bash", "-c", "source venv/bin/activate && stt-server --silero_deactivity_detection"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + llm: + build: + context: . + dockerfile: LLM.dockerfile + ports: + - "8013:8012" # STT server data. + - "65432:65432" # TTS client server. 
+ volumes: + - .:/app + command: ["bash", "-c", "source venv/bin/activate && python /app/main.py"] + depends_on: + - tts + - stt + - ollama + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ollama: + volumes: + - ./ollama/ollama:/root/.ollama + image: ollama/ollama:latest + ports: + - 7869:11434 + environment: + - OLLAMA_KEEP_ALIVE=24h + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ollama-webui: + image: ghcr.io/open-webui/open-webui:main + volumes: + - ./ollama/ollama-webui:/app/backend/data + depends_on: + - ollama + ports: + - 8080:8080 + environment: + - OLLAMA_BASE_URLS=http://host.docker.internal:7869 + - ENV=dev + - WEBUI_AUTH=False + - WEBUI_NAME=WebUI + - WEBUI_URL=http://localhost:8080 + - WEBUI_SECRET_KEY=t0p-s3cr3t + extra_hosts: + - host.docker.internal:host-gateway + diff --git a/server.py b/server.py new file mode 100644 index 0000000..177f2de --- /dev/null +++ b/server.py @@ -0,0 +1,302 @@ +import asyncio +import threading +import time +import socket +import queue # Import the standard threading Queue +from llama_index.core.chat_engine import SimpleChatEngine +from llama_index.core.storage.chat_store import SimpleChatStore +from llama_index.core.memory import ChatMemoryBuffer +import TTS.tts.utils.text.cleaners as cleaners +import re + +# Import necessary modules for STT, LLM, and TTS +from RealtimeSTT import AudioToTextRecorder +from RealtimeTTS import TextToAudioStream, CoquiEngine # You can use another TTS engine if preferred +from llama_index.llms.ollama import Ollama + +# Settings for audio socket +AUDIO_SERVER_IP = '0.0.0.0' +AUDIO_SERVER_PORT = 65432 +DATA_SERVER_PORT = 8012 + +# Global variables +recorder = None +prev_text = "" +last_text_change_time = time.time() +text_stable_duration = 1 # Time duration without text changes to trigger LLM +audio_clients = [] # List of connected audio clients +audio_clients_lock = threading.Lock() 
# NOTE(review): `os` is imported here, mid-module, because this span does not
# include server.py's import block at the top of the file.
import os

# BUG FIX: the original passed the literal string "~/chat/chat_store.json" to
# the persistence layer. Python does not expand "~" in open(), so the server
# wrote to a directory literally named "~" instead of /home/user/chat, which
# docker-compose.yml bind-mounts for persistence. Expand it once, up front.
CHAT_STORE_PATH = os.path.expanduser("~/chat/chat_store.json")

is_llm_processing = False  # True while an LLM/TTS round-trip is in flight
is_interrupted = False     # Set when new user speech should abort the current reply
llm_tts_task = None        # asyncio.Task for the current LLM+TTS pipeline
loop = None                # Main asyncio event loop (created in main())


def text_detected(text):
    """RealtimeSTT callback: track the latest partial transcript.

    Updates `prev_text`/`last_text_change_time` (polled by
    monitor_text_stability) and, if the assistant is currently speaking,
    interrupts the in-flight LLM/TTS task so the user can barge in.
    """
    global prev_text, last_text_change_time, is_llm_processing, is_interrupted, llm_tts_task
    text = text.strip()
    if text != prev_text:
        prev_text = text
        last_text_change_time = time.time()
        print(f"Realtime text: {text}")
        if is_llm_processing:
            # Barge-in: the user started talking over the assistant.
            is_interrupted = True
            if llm_tts_task and not llm_tts_task.done():
                llm_tts_task.cancel()
            tts_stream.stop()
            print("LLM and TTS have been interrupted due to new user input.")


async def handle_llm_and_tts(prompt):
    """Stream an LLM reply for `prompt` and speak it through the TTS engine.

    The LLM runs in a daemon thread and feeds sanitized text chunks through
    a queue into the TTS stream; audio chunks are fanned out to connected
    clients by play_and_send_audio().
    """
    global is_llm_processing, is_interrupted, llm_tts_task
    is_llm_processing = True
    is_interrupted = False
    print(f"Sending to LLM: {prompt}")

    q = queue.Queue()

    def llm_streaming():
        # Stream completions from the chat engine, clean them up for TTS,
        # and hand them to the queue.
        response = chat.stream_chat(prompt)
        for completion in response.response_gen:
            if is_interrupted:
                print("\nLLM generation interrupted.")
                break
            completion = cleaners.replace_symbols(completion, lang=None)
            completion = cleaners.remove_aux_symbols(completion)
            completion = re.sub(r"[\*]+", "", completion)
            # Keep only characters the TTS engine handles well
            # (Latin, Cyrillic, digits and basic punctuation).
            completion = re.sub(r'[^a-zA-Zа-яА-ЯёЁ0-9\s.,!?;:\'\"\*-]', '', completion)
            completion = re.sub(r'\s+', ' ', completion)

            q.put(completion)
            print(completion, end='', flush=True)
        # BUG FIX: persist to the expanded path (see CHAT_STORE_PATH above).
        chat_store.persist(persist_path=CHAT_STORE_PATH)
        # Signal that LLM streaming is done.
        q.put(None)

    threading.Thread(target=llm_streaming, daemon=True).start()

    def text_stream():
        # Generator consumed by the TTS stream; polls the queue with a short
        # timeout so it can notice an interruption quickly.
        while True:
            if is_interrupted:
                break
            try:
                delta = q.get(timeout=0.1)
                if delta is None:
                    break
                yield delta
            except queue.Empty:
                continue

    tts_stream.feed(text_stream())
    try:
        await play_and_send_audio()
    except asyncio.CancelledError:
        print("LLM and TTS task was cancelled.")
    finally:
        # BUG FIX: always clear the busy flag, even if playback raised an
        # unexpected exception; otherwise text_detected() would keep trying
        # to interrupt a task that no longer exists.
        is_llm_processing = False


async def play_and_send_audio():
    """Play the TTS stream (muted locally) while broadcasting raw audio
    chunks to every connected audio client."""
    global is_interrupted

    def on_audio_chunk(chunk):
        if is_interrupted:
            return
        with audio_clients_lock:
            # BUG FIX: the original called audio_clients.remove() while
            # iterating audio_clients, which skips the element following each
            # removal. Collect dead sockets first, then drop them.
            dead = []
            for client_socket in audio_clients:
                try:
                    client_socket.sendall(chunk)
                except Exception as e:
                    print(f"Error sending audio to client: {e}")
                    dead.append(client_socket)
            for client_socket in dead:
                audio_clients.remove(client_socket)

    tts_stream.play(on_audio_chunk=on_audio_chunk, muted=True)


def start_audio_server():
    """Accept TCP connections on AUDIO_SERVER_PORT and register each client
    for audio broadcast. Runs forever in its own thread."""
    server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)  # Reuse address
    server_socket.bind((AUDIO_SERVER_IP, AUDIO_SERVER_PORT))
    server_socket.listen()
    print(f"Audio server started on {AUDIO_SERVER_IP}:{AUDIO_SERVER_PORT}")

    while True:
        client_socket, addr = server_socket.accept()
        print(f"Audio client connected from {addr}")
        # Add client socket to list with thread-safe lock.
        with audio_clients_lock:
            audio_clients.append(client_socket)
        # Watch for disconnection in a dedicated thread.
        threading.Thread(target=handle_client_disconnection,
                         args=(client_socket,), daemon=True).start()


def handle_client_disconnection(client_socket):
    """Block on the client socket until it disconnects, then unregister it."""
    try:
        # Keep the connection open; clients never send payload here.
        while True:
            data = client_socket.recv(1024)
            if not data:
                break
    except Exception as e:
        print(f"Client disconnected: {e}")
    finally:
        with audio_clients_lock:
            if client_socket in audio_clients:
                audio_clients.remove(client_socket)
        client_socket.close()
        print("Client socket closed")


def start_data_server():
    """Accept TCP connections carrying raw microphone audio from clients."""
    data_server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    data_server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    data_server_socket.bind((AUDIO_SERVER_IP, DATA_SERVER_PORT))
    data_server_socket.listen()
    print(f"Data server started on {AUDIO_SERVER_IP}:{DATA_SERVER_PORT}")

    while True:
        client_socket, addr = data_server_socket.accept()
        print(f"Data client connected from {addr}")
        threading.Thread(target=handle_data_client,
                         args=(client_socket,), daemon=True).start()


def handle_data_client(client_socket):
    """Forward raw PCM received from one client into the STT recorder."""
    global recorder
    try:
        while True:
            data = client_socket.recv(4096)
            if not data:
                break
            # Feed data to the recorder.
            recorder.feed_audio(data)
    except Exception as e:
        print(f"Data client error: {e}")
    finally:
        client_socket.close()
        print("Data client socket closed")


def recorder_loop():
    """Continuously pump the STT recorder; realtime transcription updates
    arrive via the on_realtime_transcription_update callback instead."""
    global recorder

    def process_text(text):
        pass  # Final transcripts unused; realtime updates drive the flow.

    try:
        while True:
            recorder.text(process_text)
    except Exception as e:
        print(e)


async def monitor_text_stability():
    """Fire the LLM/TTS pipeline once the transcript has been stable for
    `text_stable_duration` seconds."""
    global prev_text, last_text_change_time, llm_tts_task, is_interrupted
    while True:
        await asyncio.sleep(0.1)
        if prev_text != "" and time.time() - last_text_change_time >= text_stable_duration:
            text_to_send = prev_text
            prev_text = ""
            # Cancel any ongoing LLM and TTS task before starting a new one.
            if llm_tts_task and not llm_tts_task.done():
                is_interrupted = True
                llm_tts_task.cancel()
                tts_stream.stop()
                is_interrupted = False
            llm_tts_task = asyncio.create_task(handle_llm_and_tts(text_to_send))


def main():
    """Wire up the recorder, the two socket-server threads and the event loop."""
    global recorder, loop
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    # Audio arrives from network clients, not a local microphone
    # (use_microphone=False); realtime updates are routed to text_detected.
    recorder = AudioToTextRecorder(
        model='large-v2',
        realtime_model_type='large-v3',
        language='ru',
        input_device_index=1,
        silero_sensitivity=0.05,
        silero_use_onnx=False,
        webrtc_sensitivity=3,
        post_speech_silence_duration=0.7,
        min_length_of_recording=1.1,
        min_gap_between_recordings=0,
        enable_realtime_transcription=True,
        realtime_processing_pause=0.02,
        silero_deactivity_detection=True,
        early_transcription_on_silence=0.2,
        beam_size=5,
        beam_size_realtime=3,
        initial_prompt="",
        wake_words="",
        wake_words_sensitivity=0.5,
        wake_word_timeout=5.0,
        wake_word_activation_delay=20,
        wakeword_backend='none',
        openwakeword_model_paths=None,
        openwakeword_inference_framework='tensorflow',
        wake_word_buffer_duration=1.0,
        use_main_model_for_realtime=False,
        spinner=False,
        use_microphone=False,  # Important: audio comes from clients, not a mic.
        on_realtime_transcription_update=text_detected,
        use_extended_logging=False,
    )

    # Socket servers and the recorder pump each run in a daemon thread.
    threading.Thread(target=start_audio_server, daemon=True).start()
    threading.Thread(target=start_data_server, daemon=True).start()
    threading.Thread(target=recorder_loop, daemon=True).start()

    # Schedule the text stability monitoring task.
    loop.create_task(monitor_text_stability())

    try:
        loop.run_forever()
    except KeyboardInterrupt:
        print("Server is shutting down...")
    finally:
        # Stop recorder and close resources.
        recorder.stop()
        recorder.shutdown()
        loop.stop()
        loop.close()


if __name__ == '__main__':

    # BUG FIX: use the expanded persist path, and tolerate a missing store on
    # the very first run instead of failing before the server starts.
    if os.path.exists(CHAT_STORE_PATH):
        chat_store = SimpleChatStore.from_persist_path(persist_path=CHAT_STORE_PATH)
    else:
        chat_store = SimpleChatStore()

    chat_memory = ChatMemoryBuffer.from_defaults(
        token_limit=8192,
        chat_store=chat_store,
        chat_store_key="User",
    )

    # Initialize Ollama LLM (served by the `ollama` compose service).
    LLM = Ollama(model="gemma2:9b", base_url="http://ollama:11434")

    prompt1 = """
You are a friendly and helpful female voice assistant. You are aware that you are communicating through voice, so your responses should be clear, concise, and conversational, as if you are having a natural spoken conversation. Use a warm and approachable tone. Do not use any special symbols or formatting, such as lists. Just speak as if it's a regular dialogue. Always be ready to assist with follow-up questions or actions. Here are examples of how you might respond:
Remember to keep your responses short and engaging, and always be ready to assist further if needed. Avoid using any special symbols or formatting to ensure smooth text-to-speech conversion.
"""

    chat = SimpleChatEngine.from_defaults(llm=LLM, memory=chat_memory, system_prompt=prompt1)

    # Initialize TTS engine.
    TTS_ENGINE = CoquiEngine(voice="Chandra MacFarland")
    tts_stream = TextToAudioStream(TTS_ENGINE, muted=True)

    main()

# ---- Preserved dump content: setup-NVIDIA.sh (separate file in the commit),
# kept verbatim as data so this span stays syntactically valid Python. ----
_SETUP_NVIDIA_SH = r"""
diff --git a/setup-NVIDIA.sh b/setup-NVIDIA.sh
new file mode 100755
index 0000000..52aadd4
--- /dev/null
+++ b/setup-NVIDIA.sh
@@ -0,0 +1,15 @@
#!/bin/bash

curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
    && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

sudo apt update

sudo apt install -y nvidia-container-toolkit

sudo nvidia-ctk runtime configure --runtime=docker

sudo systemctl restart docker
"""