Merge pull request #19 from shivenmian/u/shivenmian/local

feat: added local TTS using Piper
pull/21/head^2
killian 11 months ago committed by GitHub
commit 9384f68c66
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

2
.gitignore vendored

@ -1,5 +1,5 @@
ggml-*.bin ggml-*.bin
OS/01/local_tts/*
# Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
*.py[cod] *.py[cod]

@ -6,6 +6,11 @@ export ALL_LOCAL=False
# export WHISPER_MODEL_PATH=...
# export OPENAI_API_KEY=sk-...
# For TTS, we use the en_US-lessac-medium voice model by default
# Please change the voice URL and voice name if you wish to use another voice
export PIPER_VOICE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/"
export PIPER_VOICE_NAME="en_US-lessac-medium.onnx"
# If SERVER_START, this is where we'll serve the server.
# If DEVICE_START, this is where the device expects the server to be.
export SERVER_URL=ws://localhost:8000/ export SERVER_URL=ws://localhost:8000/
@ -26,6 +31,46 @@ export LOG_LEVEL="INFO"
### SETUP ### SETUP
# if using local models, install the models / executables

#######################################
# Install Piper for local TTS into ./local_tts/piper. On macOS this also
# fetches piper-phonemize and the configured voice model, then rewrites the
# piper binary's @rpath dylib references to point at the bundled copies.
# The caller's working directory is restored on every return path, so a
# failed step can never leave the rest of the script in the wrong directory.
# Globals:
#   PIPER_VOICE_URL, PIPER_VOICE_NAME (read)
#   OS, ARCH, PIPER_ASSETNAME, PIPER_URL (written)
#   PIPER_PHONEMIZE_ASSETNAME, PIPER_PHONEMIZE_URL, PIPER_DIR (written, macOS)
# Returns:
#   0 on success, 1 as soon as a required step fails
#######################################
install_local_piper() {
  local start_dir status=0
  start_dir=$(pwd) || return 1

  OS=$(uname -s)
  ARCH=$(uname -m)
  if [[ "$OS" == "Darwin" ]]; then
    OS="macos"
    if [[ "$ARCH" == "arm64" ]]; then
      ARCH="aarch64"
    elif [[ "$ARCH" == "x86_64" ]]; then
      ARCH="x64"
    else
      # Stop here instead of trying to download an asset that cannot exist.
      echo "Piper: unsupported architecture" >&2
      return 1
    fi
  fi
  # NOTE(review): on non-Darwin hosts OS/ARCH are used exactly as uname
  # reports them (e.g. "Linux"); confirm that matches the release asset names.
  PIPER_ASSETNAME="piper_${OS}_${ARCH}.tar.gz"
  PIPER_URL="https://github.com/rhasspy/piper/releases/latest/download/"

  # -p so that re-running the script does not fail on an existing directory.
  mkdir -p local_tts && cd local_tts || return 1

  # curl -f: fail on HTTP errors rather than saving the error page as the
  # tarball. Each step only runs if the previous one succeeded.
  {
    curl -fOL "${PIPER_URL}${PIPER_ASSETNAME}" \
      && tar -xvzf "$PIPER_ASSETNAME" \
      && cd piper
  } || status=1

  if (( status == 0 )) && [[ "$OS" == "macos" ]]; then
    if [[ "$ARCH" == "x64" ]]; then
      # Best effort, as before: a failure here does not abort the install.
      softwareupdate --install-rosetta --agree-to-license
    fi
    PIPER_PHONEMIZE_ASSETNAME="piper-phonemize_${OS}_${ARCH}.tar.gz"
    PIPER_PHONEMIZE_URL="https://github.com/rhasspy/piper-phonemize/releases/latest/download/"
    PIPER_DIR=$(pwd)
    {
      curl -fOL "${PIPER_PHONEMIZE_URL}${PIPER_PHONEMIZE_ASSETNAME}" \
        && tar -xvzf "$PIPER_PHONEMIZE_ASSETNAME" \
        && curl -fOL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}" \
        && curl -fOL "${PIPER_VOICE_URL}${PIPER_VOICE_NAME}.json" \
        && install_name_tool -change @rpath/libespeak-ng.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libespeak-ng.1.dylib" "${PIPER_DIR}/piper" \
        && install_name_tool -change @rpath/libonnxruntime.1.14.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libonnxruntime.1.14.1.dylib" "${PIPER_DIR}/piper" \
        && install_name_tool -change @rpath/libpiper_phonemize.1.dylib "${PIPER_DIR}/piper-phonemize/lib/libpiper_phonemize.1.dylib" "${PIPER_DIR}/piper"
    } || status=1
  fi

  # Return to the starting directory by absolute path; a relative "cd ../.."
  # is only correct when every earlier cd succeeded.
  cd "$start_dir" || return 1
  return "$status"
}

if [[ "$ALL_LOCAL" == "True" ]]; then
  install_local_piper \
    || echo "Piper: local TTS setup failed; continuing without it" >&2
fi
# (for dev, reset the ports we were using)
SERVER_PORT=$(echo $SERVER_URL | grep -oE "[0-9]+") SERVER_PORT=$(echo $SERVER_URL | grep -oE "[0-9]+")

@ -7,20 +7,37 @@ from openai import OpenAI
from pydub import AudioSegment from pydub import AudioSegment
from pydub.playback import play from pydub.playback import play
from playsound import playsound from playsound import playsound
import os
import subprocess
import tempfile
client = OpenAI() client = OpenAI()
def tts(text, play_audio):
    """Synthesize ``text`` to speech and return the audio file's bytes.

    If the ALL_LOCAL environment variable is exactly 'False', the OpenAI
    speech API is used (model "tts-1", voice "alloy", MP3 output). For any
    other value, including unset, the Piper executable under
    ``local_tts/piper`` next to this file is run with the voice model named
    by the PIPER_VOICE_NAME environment variable, producing a WAV file.

    Args:
        text: The text to speak.
        play_audio: If truthy, play the generated file with playsound before
            returning.

    Returns:
        The contents of the generated audio file as bytes.
    """
    if os.getenv('ALL_LOCAL') == 'False':
        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text,
            response_format="mp3"
        )
        with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_file:
            response.stream_to_file(temp_file.name)

            if play_audio:
                playsound(temp_file.name)

            return temp_file.read()
    else:
        # Only reserve a path here and close our handle straight away, so
        # Piper is the sole writer of the file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            output_file = temp_file.name
        try:
            piper_dir = os.path.join(os.path.dirname(__file__), 'local_tts', 'piper')
            # The text to speak is fed to Piper on stdin.
            subprocess.run([
                os.path.join(piper_dir, 'piper'),
                '--model', os.path.join(piper_dir, os.getenv('PIPER_VOICE_NAME')),
                '--output_file', output_file
            ], input=text, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

            if play_audio:
                playsound(output_file)

            with open(output_file, 'rb') as audio_file:
                return audio_file.read()
        finally:
            # delete=False means nothing else cleans this file up; without
            # this every call would leave a .wav behind in the temp directory.
            if os.path.exists(output_file):
                os.remove(output_file)

Loading…
Cancel
Save