From 7582c8ad028b019ca19aa5cc3536c59accc88d9f Mon Sep 17 00:00:00 2001
From: Shiven Mian
Date: Sat, 10 Feb 2024 19:29:11 -0800
Subject: [PATCH] feat: added local piper TTS

---
 .gitignore                    |  2 +-
 OS/01/conversations/user.json | 13 ++++++-
 OS/01/start.sh                | 50 ++++++++++++++++++++++++++
 OS/01/tts.py                  | 65 +++++++++++++++++++++++++++------
 4 files changed, 115 insertions(+), 15 deletions(-)

diff --git a/.gitignore b/.gitignore
index 807399e..0a681f8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 ggml-*.bin
-
+OS/01/local_tts/*
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/OS/01/conversations/user.json b/OS/01/conversations/user.json
index 92e81a2..13088ff 100644
--- a/OS/01/conversations/user.json
+++ b/OS/01/conversations/user.json
@@ -1 +1,12 @@
-[{"role": "user", "type": "message", "content": " Hey, how you doing?\n"}]
\ No newline at end of file
+[
+    {
+        "role": "user",
+        "type": "message",
+        "content": " Hello, how are you doing?\n"
+    },
+    {
+        "role": "assistant",
+        "type": "message",
+        "content": "I'm an artificial intelligence, so I don't have feelings, but thank you for asking. How may I assist you today?"
+    }
+]
\ No newline at end of file
diff --git a/OS/01/start.sh b/OS/01/start.sh
index 2940140..76cfcb3 100755
--- a/OS/01/start.sh
+++ b/OS/01/start.sh
@@ -6,6 +6,11 @@ export ALL_LOCAL=False
 # export WHISPER_MODEL_PATH=...
 # export OPENAI_API_KEY=sk-...
 
+# For TTS, we use the en_US-lessac-medium voice model by default
+# Please change the voice URL and voice name if you wish to use another voice
+export PIPER_VOICE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/"
+export PIPER_VOICE_NAME="en_US-lessac-medium.onnx"
+
 # If SERVER_START, this is where we'll serve the server.
 # If DEVICE_START, this is where the device expects the server to be.
 export SERVER_URL=ws://localhost:8000/
@@ -22,6 +27,51 @@ export SERVER_EXPOSE_PUBLICALLY=False
 
 ### SETUP
 
+# if using local models, install the models / executables
+if [[ "$ALL_LOCAL" == "True" ]]; then
+    OS=$(uname -s)
+    ARCH=$(uname -m)
+    if [ "$OS" = "Darwin" ]; then
+        OS="macos"
+        if [ "$ARCH" = "arm64" ]; then
+            ARCH="aarch64"
+        elif [ "$ARCH" = "x86_64" ]; then
+            ARCH="x64"
+        else
+            echo "Piper: unsupported architecture"
+        fi
+    elif [ "$OS" = "Linux" ]; then
+        # Release assets are named piper_linux_<arch>.tar.gz (lowercase OS)
+        OS="linux"
+    fi
+    PIPER_ASSETNAME="piper_${OS}_${ARCH}.tar.gz"
+    PIPER_URL="https://github.com/rhasspy/piper/releases/latest/download/"
+    # -p: do not fail when start.sh is re-run after a previous install
+    mkdir -p local_tts
+    cd local_tts
+    curl -OL "${PIPER_URL}${PIPER_ASSETNAME}"
+    tar -xvzf "$PIPER_ASSETNAME"
+    cd piper
+    # tts.py needs the voice model on every platform, so fetch it outside the macOS-only block
+    curl -OL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}"
+    curl -OL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}.json"
+    if [ "$OS" = "macos" ]; then
+        if [ "$ARCH" = "x64" ]; then
+            softwareupdate --install-rosetta --agree-to-license
+        fi
+        PIPER_PHONEMIZE_ASSETNAME="piper-phonemize_${OS}_${ARCH}.tar.gz"
+        PIPER_PHONEMIZE_URL="https://github.com/rhasspy/piper-phonemize/releases/latest/download/"
+
+        curl -OL "${PIPER_PHONEMIZE_URL}${PIPER_PHONEMIZE_ASSETNAME}"
+        tar -xvzf "$PIPER_PHONEMIZE_ASSETNAME"
+        PIPER_DIR=`pwd`
+        install_name_tool -change @rpath/libespeak-ng.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libespeak-ng.1.dylib" "${PIPER_DIR}/piper"
+        install_name_tool -change @rpath/libonnxruntime.1.14.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libonnxruntime.1.14.1.dylib" "${PIPER_DIR}/piper"
+        install_name_tool -change @rpath/libpiper_phonemize.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libpiper_phonemize.1.dylib" "${PIPER_DIR}/piper"
+    fi
+    cd ../..
+fi
+
 # (for dev, reset the ports we were using)
 
 SERVER_PORT=$(echo $SERVER_URL | grep -oE "[0-9]+")
diff --git a/OS/01/tts.py b/OS/01/tts.py
index 024da65..366d380 100644
--- a/OS/01/tts.py
+++ b/OS/01/tts.py
@@ -7,20 +7,59 @@ from openai import OpenAI
 from pydub import AudioSegment
 from pydub.playback import play
 from playsound import playsound
+import os
+import subprocess
+import tempfile
 
 client = OpenAI()
 
+def run_command(command):
+    """Print *command*, run it, and return the CompletedProcess with captured text output."""
+    print(command)
+    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    return result
+
 def tts(text, play_audio):
-    response = client.audio.speech.create(
-        model="tts-1",
-        voice="alloy",
-        input=text,
-        response_format="mp3"
-    )
-    with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
-        response.stream_to_file(temp_file.name)
-
-        if play_audio:
-            playsound(temp_file.name)
-
-        return temp_file.read()
+    """Synthesize *text* to speech and return the encoded audio bytes.
+
+    Uses the OpenAI speech API (mp3) when ALL_LOCAL is 'False' or unset;
+    otherwise runs the piper binary found in local_tts/piper next to this
+    file (wav). When play_audio is truthy the audio is also played aloud.
+    """
+    # Default to 'False' so an unset variable matches start.sh's default.
+    if os.getenv('ALL_LOCAL', 'False') == 'False':
+        response = client.audio.speech.create(
+            model="tts-1",
+            voice="alloy",
+            input=text,
+            response_format="mp3"
+        )
+        with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
+            response.stream_to_file(temp_file.name)
+
+            if play_audio:
+                playsound(temp_file.name)
+
+            return temp_file.read()
+    else:
+        # Only reserve a path here and close our handle; piper writes the wav itself.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            output_file = temp_file.name
+        try:
+            piper_dir = os.path.join(os.path.dirname(__file__), 'local_tts', 'piper')
+            # Fall back to the voice start.sh downloads by default instead of joining None.
+            voice_name = os.getenv('PIPER_VOICE_NAME', 'en_US-lessac-medium.onnx')
+            subprocess.run([
+                os.path.join(piper_dir, 'piper'),
+                '--model', os.path.join(piper_dir, voice_name),
+                '--output_file', output_file
+            ], input=text, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True)
+
+            if play_audio:
+                playsound(output_file)
+            # Re-open by path so we read what piper wrote rather than a stale handle.
+            with open(output_file, 'rb') as audio_file:
+                return audio_file.read()
+        finally:
+            # delete=False means nothing else removes this file.
+            os.remove(output_file)