Merge pull request #19 from shivenmian/u/shivenmian/local

feat: added local TTS using Piper
2 years ago · 9384f68c66
parent fe8b3c1499 b5dcb40dde
commit 9384f68c66
3 changed files with 76 additions and 14 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,5 @@
 ggml-*.bin
-
+OS/01/local_tts/*
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
--- a/OS/01/start.sh
+++ b/OS/01/start.sh
@ -6,6 +6,11 @@ export ALL_LOCAL=False
 # export WHISPER_MODEL_PATH=...
 # export OPENAI_API_KEY=sk-...

+# For TTS, we use the en_US-lessac-medium Piper voice model by default.
+# To use another voice, change both values: the URL must end with "/" and the name is the model file name (e.g. "<voice>.onnx"), since the two are concatenated.
+export PIPER_VOICE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/"
+export PIPER_VOICE_NAME="en_US-lessac-medium.onnx"
+
 # If SERVER_START, this is where we'll serve the server.
 # If DEVICE_START, this is where the device expects the server to be.
 export SERVER_URL=ws://localhost:8000/
@ -26,6 +31,46 @@ export LOG_LEVEL="INFO"

 ### SETUP

+# If ALL_LOCAL is "True", download the Piper TTS executable (plus, on macOS, piper-phonemize and the voice model) into ./local_tts
+if [[ "$ALL_LOCAL" == "True" ]]; then
+    OS=$(uname -s)
+    ARCH=$(uname -m)
+    if [ "$OS" = "Darwin" ]; then
+        OS="macos"
+        if [ "$ARCH" = "arm64" ]; then
+            ARCH="aarch64"
+        elif [ "$ARCH" = "x86_64" ]; then
+            ARCH="x64"
+        else
+            echo "Piper: unsupported architecture"
+        fi
+    fi
+    PIPER_ASSETNAME="piper_${OS}_${ARCH}.tar.gz"
+    PIPER_URL="https://github.com/rhasspy/piper/releases/latest/download/"
+    mkdir local_tts
+    cd local_tts
+    curl -OL "${PIPER_URL}${PIPER_ASSETNAME}"
+    tar -xvzf $PIPER_ASSETNAME
+    cd piper
+    if [ "$OS" = "macos" ]; then
+        if [ "$ARCH" = "x64" ]; then
+            softwareupdate --install-rosetta --agree-to-license
+        fi
+        PIPER_PHONEMIZE_ASSETNAME="piper-phonemize_${OS}_${ARCH}.tar.gz"
+        PIPER_PHONEMIZE_URL="https://github.com/rhasspy/piper-phonemize/releases/latest/download/"
+
+        curl -OL "${PIPER_PHONEMIZE_URL}${PIPER_PHONEMIZE_ASSETNAME}"
+        tar -xvzf $PIPER_PHONEMIZE_ASSETNAME
+        curl -OL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}"
+        curl -OL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}.json"
+        PIPER_DIR=`pwd`
+        install_name_tool -change @rpath/libespeak-ng.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libespeak-ng.1.dylib" "${PIPER_DIR}/piper"
+        install_name_tool -change @rpath/libonnxruntime.1.14.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libonnxruntime.1.14.1.dylib" "${PIPER_DIR}/piper"
+        install_name_tool -change @rpath/libpiper_phonemize.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libpiper_phonemize.1.dylib" "${PIPER_DIR}/piper"
+    fi
+    cd ../..
+fi
+
 # (for dev, reset the ports we were using)

 SERVER_PORT=$(echo $SERVER_URL | grep -oE "[0-9]+")
--- a/OS/01/tts.py
+++ b/OS/01/tts.py
@ -7,20 +7,37 @@ from openai import OpenAI
 from pydub import AudioSegment
 from pydub.playback import play
 from playsound import playsound
+import os
+import subprocess
+import tempfile

 client = OpenAI()

 def tts(text, play_audio):
-    response = client.audio.speech.create(
-        model="tts-1",
-        voice="alloy",
-        input=text,
-        response_format="mp3"
-    )
-    with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
-        response.stream_to_file(temp_file.name)
-        
-        if play_audio:
-            playsound(temp_file.name)
-        
-        return temp_file.read()
+    if os.getenv('ALL_LOCAL') == 'False':
+        response = client.audio.speech.create(
+            model="tts-1",
+            voice="alloy",
+            input=text,
+            response_format="mp3"
+        )
+        with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
+            response.stream_to_file(temp_file.name)
+            
+            if play_audio:
+                playsound(temp_file.name)
+            
+            return temp_file.read()
+    else:
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            output_file = temp_file.name
+            piper_dir = os.path.join(os.path.dirname(__file__), 'local_tts', 'piper')
+            subprocess.run([
+                os.path.join(piper_dir, 'piper'),
+                '--model', os.path.join(piper_dir, os.getenv('PIPER_VOICE_NAME')),
+                '--output_file', output_file
+            ], input=text, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+
+            if play_audio:
+                playsound(temp_file.name)
+            return temp_file.read()