diff --git a/.gitignore b/.gitignore
index 91c9774..ffe6b33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 ggml-*.bin
-
+OS/01/local_tts/*
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/OS/01/start.sh b/OS/01/start.sh
index 4884d9e..8a4dcc6 100755
--- a/OS/01/start.sh
+++ b/OS/01/start.sh
@@ -6,6 +6,11 @@ export ALL_LOCAL=False
 # export WHISPER_MODEL_PATH=...
 # export OPENAI_API_KEY=sk-...
 
+# For TTS, we use the en_US-lessac-medium voice model by default
+# Please change the voice URL and voice name if you wish to use another voice
+export PIPER_VOICE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/"
+export PIPER_VOICE_NAME="en_US-lessac-medium.onnx"
+
 # If SERVER_START, this is where we'll serve the server.
 # If DEVICE_START, this is where the device expects the server to be.
 export SERVER_URL=ws://localhost:8000/
@@ -26,6 +31,57 @@ export LOG_LEVEL="INFO"
 
 ### SETUP
 
+# If using local models, install the Piper TTS executable and voice model.
+if [[ "$ALL_LOCAL" == "True" ]]; then
+    OS=$(uname -s)
+    ARCH=$(uname -m)
+    if [ "$OS" = "Darwin" ]; then
+        OS="macos"
+        if [ "$ARCH" = "arm64" ]; then
+            ARCH="aarch64"
+        elif [ "$ARCH" = "x86_64" ]; then
+            ARCH="x64"
+        else
+            # Stop here: continuing would request an asset that does not exist.
+            echo "Piper: unsupported architecture"
+            exit 1
+        fi
+    elif [ "$OS" = "Linux" ]; then
+        # Piper release assets use a lowercase OS name (piper_linux_<arch>).
+        OS="linux"
+    fi
+    PIPER_ASSETNAME="piper_${OS}_${ARCH}.tar.gz"
+    PIPER_URL="https://github.com/rhasspy/piper/releases/latest/download/"
+    # -p: do not fail when local_tts is left over from an earlier run.
+    mkdir -p local_tts
+    # Abort on a failed cd so the final "cd ../.." cannot leave the script
+    # in the wrong directory.
+    cd local_tts || exit 1
+    curl -OL "${PIPER_URL}${PIPER_ASSETNAME}"
+    tar -xvzf "$PIPER_ASSETNAME"
+    cd piper || exit 1
+    # The voice model is needed on every platform: tts.py loads it from this
+    # directory using PIPER_VOICE_NAME.
+    curl -OL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}"
+    curl -OL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}.json"
+    if [ "$OS" = "macos" ]; then
+        if [ "$ARCH" = "x64" ]; then
+            softwareupdate --install-rosetta --agree-to-license
+        fi
+        PIPER_PHONEMIZE_ASSETNAME="piper-phonemize_${OS}_${ARCH}.tar.gz"
+        PIPER_PHONEMIZE_URL="https://github.com/rhasspy/piper-phonemize/releases/latest/download/"
+
+        curl -OL "${PIPER_PHONEMIZE_URL}${PIPER_PHONEMIZE_ASSETNAME}"
+        tar -xvzf "$PIPER_PHONEMIZE_ASSETNAME"
+        PIPER_DIR=$(pwd)
+        # Rewrite the @rpath library references to the bundled dylibs.
+        install_name_tool -change @rpath/libespeak-ng.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libespeak-ng.1.dylib" "${PIPER_DIR}/piper"
+        install_name_tool -change @rpath/libonnxruntime.1.14.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libonnxruntime.1.14.1.dylib" "${PIPER_DIR}/piper"
+        install_name_tool -change @rpath/libpiper_phonemize.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libpiper_phonemize.1.dylib" "${PIPER_DIR}/piper"
+    fi
+    cd ../..
+fi
+
 # (for dev, reset the ports we were using)
 
 SERVER_PORT=$(echo $SERVER_URL | grep -oE "[0-9]+")
diff --git a/OS/01/tts.py b/OS/01/tts.py
index 024da65..e51972a 100644
--- a/OS/01/tts.py
+++ b/OS/01/tts.py
@@ -7,20 +7,70 @@ from openai import OpenAI
 from pydub import AudioSegment
 from pydub.playback import play
 from playsound import playsound
+import os
+import subprocess
+import tempfile
 
 client = OpenAI()
 
 def tts(text, play_audio):
-    response = client.audio.speech.create(
-      model="tts-1",
-      voice="alloy",
-      input=text,
-      response_format="mp3"
-    )
-    with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
-        response.stream_to_file(temp_file.name)
-
-        if play_audio:
-            playsound(temp_file.name)
-
-        return temp_file.read()
+    """Synthesize *text* to speech and return the encoded audio bytes.
+
+    When the ALL_LOCAL environment variable is exactly 'False', the OpenAI
+    "tts-1" model produces MP3 data. For any other value (or when it is
+    unset) the Piper executable in local_tts/piper produces WAV data.
+
+    Args:
+        text: The text to speak.
+        play_audio: If true, play the audio with playsound before returning.
+
+    Returns:
+        The bytes of the generated audio file.
+
+    Raises:
+        RuntimeError: If PIPER_VOICE_NAME is unset or Piper exits non-zero.
+    """
+    if os.getenv('ALL_LOCAL') == 'False':
+        response = client.audio.speech.create(
+            model="tts-1",
+            voice="alloy",
+            input=text,
+            response_format="mp3"
+        )
+        with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
+            response.stream_to_file(temp_file.name)
+
+            if play_audio:
+                playsound(temp_file.name)
+
+            return temp_file.read()
+
+    voice_name = os.getenv('PIPER_VOICE_NAME')
+    if not voice_name:
+        raise RuntimeError(
+            "PIPER_VOICE_NAME must be set to use local text-to-speech"
+        )
+
+    piper_dir = os.path.join(os.path.dirname(__file__), 'local_tts', 'piper')
+    # Only reserve a unique path here; Piper writes the audio to it by name.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+        output_file = temp_file.name
+    try:
+        result = subprocess.run([
+            os.path.join(piper_dir, 'piper'),
+            '--model', os.path.join(piper_dir, voice_name),
+            '--output_file', output_file
+        ], input=text, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"piper exited with status {result.returncode}: "
+                f"{result.stderr.strip()}"
+            )
+
+        if play_audio:
+            playsound(output_file)
+        with open(output_file, 'rb') as audio_file:
+            return audio_file.read()
+    finally:
+        # delete=False keeps the file after close, so remove it explicitly.
+        os.remove(output_file)