swarms/apps/open-sourcerer/voice.py

import gradio_client as grc
import interpreter
import time
import gradio as gr
from pydub import AudioSegment
import io
from elevenlabs import generate, play, set_api_key
import whisper
import dotenv

dotenv.load_dotenv(".env")

# interpreter.model = "TheBloke/Mistral-7B-OpenOrca-GGUF"
interpreter.auto_run = True
model = whisper.load_model("base")


def transcribe(audio):

    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)

    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    return result.text


set_api_key("ELEVEN_LABS_API_KEY")


def get_audio_length(audio_bytes):
    # Create a BytesIO object from the byte array
    byte_io = io.BytesIO(audio_bytes)

    # Load the audio data with PyDub
    audio = AudioSegment.from_mp3(byte_io)

    # Get the length of the audio in milliseconds
    length_ms = len(audio)

    # Optionally convert to seconds
    length_s = length_ms / 1000.0

    return length_s


def speak(text):
    speaking = True
    audio = generate(
        text=text,
        voice="Daniel"
    )
    play(audio, notebook=True)

    audio_length = get_audio_length(audio)
    time.sleep(audio_length)

# @title Text-only JARVIS
# @markdown Run this cell for a ChatGPT-like interface.


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()

    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history):

        user_message = history[-1][0]
        history[-1][1] = ""
        active_block_type = ""

        for chunk in interpreter.chat(user_message, stream=True, display=False):

            # Message
            if "message" in chunk:
                if active_block_type != "message":
                    active_block_type = "message"
                history[-1][1] += chunk["message"]
                yield history

            # Code
            if "language" in chunk:
                language = chunk["language"]
            if "code" in chunk:
                if active_block_type != "code":
                    active_block_type = "code"
                    history[-1][1] += f"\n```{language}\n"
                history[-1][1] += chunk["code"]
                yield history

            # Output
            if "executing" in chunk:
                history[-1][1] += "\n```\n\n```text\n"
                yield history
            if "output" in chunk:
                if chunk["output"] != "KeyboardInterrupt":
                    history[-1][1] += chunk["output"] + "\n"
                    yield history
            if "end_of_execution" in chunk:
                history[-1][1] = history[-1][1].strip()
                history[-1][1] += "\n```\n"
                yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )

if __name__ == '__main__':
    demo.queue()
    demo.launch(debug=True)