# speech to text tool

import os
import subprocess

import whisperx
from pydub import AudioSegment
from pytube import YouTube


class SpeechToText:
    def __init__(
        self,
        video_url,
        audio_format='mp3',
        device='cuda',
        batch_size=16,
        compute_type="float16",
        hf_api_key=None
    ):
        """
        Example usage:

            video_url = "url"
            speech_to_text = SpeechToText(video_url)
            transcription = speech_to_text.transcribe_youtube_video()
            print(transcription)
        """
        self.video_url = video_url
        self.audio_format = audio_format
        self.device = device
        self.batch_size = batch_size
        self.compute_type = compute_type
        self.hf_api_key = hf_api_key

    def install(self):
        subprocess.run(["pip", "install", "whisperx"])
        subprocess.run(["pip", "install", "pytube"])
        subprocess.run(["pip", "install", "pydub"])

    def download_youtube_video(self):
        audio_file = f'video.{self.audio_format}'

        # Download video 📥
        yt = YouTube(self.video_url)
        yt_stream = yt.streams.filter(only_audio=True).first()
        yt_stream.download(filename='video.mp4')

        # Convert video to audio 🎧
        video = AudioSegment.from_file("video.mp4", format="mp4")
        video.export(audio_file, format=self.audio_format)
        os.remove("video.mp4")

        return audio_file

    def transcribe_youtube_video(self):
        audio_file = self.download_youtube_video()

        # 1. Transcribe with original Whisper (batched) 🗣️
        model = whisperx.load_model(
            "large-v2", self.device, compute_type=self.compute_type
        )
        audio = whisperx.load_audio(audio_file)
        result = model.transcribe(audio, batch_size=self.batch_size)

        # 2. Align Whisper output 🔍
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"], device=self.device
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            self.device,
            return_char_alignments=False
        )

        # 3. Assign speaker labels 🏷️
        diarize_model = whisperx.DiarizationPipeline(
            use_auth_token=self.hf_api_key,
            device=self.device
        )
        diarize_model(audio_file)
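        # The diarization output above is computed but never attached to the
        # transcript. A minimal sketch of how speaker labels could be merged
        # into the aligned result, assuming whisperx.assign_word_speakers:
        #
        #     diarize_segments = diarize_model(audio_file)
        #     result = whisperx.assign_word_speakers(diarize_segments, result)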

        try:
            segments = result["segments"]
            transcription = " ".join(segment['text'] for segment in segments)
            return transcription
        except KeyError:
            print("The key 'segments' is not found in the result.")

    def transcribe(self, audio_file):
        # 1. Transcribe with original Whisper (batched) 🗣️
        model = whisperx.load_model(
            "large-v2",
            self.device,
            compute_type=self.compute_type
        )
        audio = whisperx.load_audio(audio_file)
        result = model.transcribe(
            audio,
            batch_size=self.batch_size
        )

        # 2. Align Whisper output 🔍
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"], device=self.device
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            self.device,
            return_char_alignments=False
        )

        # 3. Assign speaker labels 🏷️
        diarize_model = whisperx.DiarizationPipeline(
            use_auth_token=self.hf_api_key,
            device=self.device
        )
        # As in transcribe_youtube_video, the diarization result is not merged
        # back into the transcript here.
        diarize_model(audio_file)

        try:
            segments = result["segments"]
            transcription = " ".join(segment['text'] for segment in segments)
            return transcription
        except KeyError:
            print("The key 'segments' is not found in the result.")
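

# Minimal usage sketch. The URL is a placeholder; a Hugging Face token is
# usually required for the pyannote models behind whisperx.DiarizationPipeline.
if __name__ == "__main__":
    speech_to_text = SpeechToText(
        video_url="https://www.youtube.com/watch?v=<video-id>",
        hf_api_key=None,  # replace with a Hugging Face token for diarization
    )
    print(speech_to_text.transcribe_youtube_video())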