You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
97 lines
2.3 KiB
97 lines
2.3 KiB
import os
|
|
import time
|
|
|
|
import pygame
|
|
import speech_recognition as sr
|
|
from dotenv import load_dotenv
|
|
from playsound import playsound
|
|
|
|
from swarms import OpenAIChat, OpenAITTS
|
|
|
|
# Pull settings from a .env file (if any) into the process environment.
load_dotenv()

# OpenAI credential shared by the chat model and the TTS model below;
# None when the variable is not set.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Chat model that answers the transcribed request.
llm = OpenAIChat(openai_api_key=openai_api_key)

# Text-to-speech model; the synthesized audio is configured to be saved
# at runs/tts_speech.wav.
tts = OpenAITTS(
    openai_api_key=openai_api_key,
    model_name="tts-1-1106",
    voice="onyx",
    saved_filepath="runs/tts_speech.wav",
)

# Speech recognizer used to capture and transcribe microphone input.
r = sr.Recognizer()
|
|
|
|
|
|
def play_audio(file_path):
    """Play an audio file with pygame's mixer, blocking until it finishes.

    Nothing is raised for the failure modes handled here: a message is
    printed and the function returns.

    Args:
        file_path: Path of the audio file to play.

    Returns:
        None. Returns early (after printing a message) when ``file_path``
        is not an existing file.
    """
    # Bail out early if there is nothing to play.
    if not os.path.isfile(file_path):
        print(f"Audio file {file_path} not found.")
        return

    try:
        # Initialize inside the try block so a mixer start-up failure is
        # reported like any other playback error instead of escaping as an
        # uncaught pygame.error.
        pygame.mixer.init()

        # Load and start the audio file.
        pygame.mixer.music.load(file_path)
        pygame.mixer.music.play()

        # Poll until playback ends; one Clock is reused for the whole wait
        # and tick(10) limits the loop to about ten checks per second.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    except pygame.error as e:
        print(f"Couldn't play {file_path}: {e}")
    finally:
        # Stop the mixer module and free resources.
        pygame.mixer.quit()
|
|
|
|
|
|
# Voice-assistant loop: listen, transcribe, query the language model, then
# speak the reply. Runs until the process is interrupted.
while True:
    # Listen for user speech
    with sr.Microphone() as source:
        print("Listening...")
        audio = r.listen(source)

    # Convert speech to text
    try:
        print("Recognizing...")
        task = r.recognize_google(audio)
        print(f"User said: {task}")
    except sr.UnknownValueError:
        print("Could not understand audio")
        continue
    except Exception as e:
        # Best-effort boundary: report any other recognition failure and
        # go back to listening rather than ending the session.
        print(f"Error: {e}")
        continue

    # Run the language model on the task. The label matches the
    # "Running GPT4 model..." line; it previously said "Gemini".
    print("Running GPT4 model...")
    reply = llm(task)
    print(f"GPT4 output: {reply}")

    # Convert the model output to speech
    print("Running text-to-speech model...")
    tts_result = tts.run_and_save(reply)
    print(f"Text-to-speech output: {tts_result}")

    # NOTE(review): fixed 5-second pause before playback; presumably gives
    # the saved audio file time to be fully written - confirm it is needed.
    time.sleep(5)

    # Play the file at the path configured as saved_filepath for the TTS
    # model.
    playsound("runs/tts_speech.wav")
|