[CLEANUP][WhisperX]

2 years ago · d33588becc
parent 7cd6f25353
commit d33588becc
2 changed files with 0 additions and 360 deletions
--- a/swarms/models/whisperx_model.py
+++ b/swarms/models/whisperx_model.py
@ -1,138 +0,0 @@
-import os
-import subprocess
-
-try:
-    import whisperx
-    from pydub import AudioSegment
-    from pytube import YouTube
-except Exception as error:
-    print("Error importing pytube. Please install pytube manually.")
-    print("pip install pytube")
-    print("pip install pydub")
-    print("pip install whisperx")
-    print(f"Pytube error: {error}")
-
-
-class WhisperX:
-    def __init__(
-        self,
-        video_url,
-        audio_format="mp3",
-        device="cuda",
-        batch_size=16,
-        compute_type="float16",
-        hf_api_key=None,
-    ):
-        """
-        # Example usage
-        video_url = "url"
-        speech_to_text = WhisperX(video_url)
-        transcription = speech_to_text.transcribe_youtube_video()
-        print(transcription)
-
-        """
-        self.video_url = video_url
-        self.audio_format = audio_format
-        self.device = device
-        self.batch_size = batch_size
-        self.compute_type = compute_type
-        self.hf_api_key = hf_api_key
-
-    def install(self):
-        subprocess.run(["pip", "install", "whisperx"])
-        subprocess.run(["pip", "install", "pytube"])
-        subprocess.run(["pip", "install", "pydub"])
-
-    def download_youtube_video(self):
-        audio_file = f"video.{self.audio_format}"
-
-        # Download video 📥
-        yt = YouTube(self.video_url)
-        yt_stream = yt.streams.filter(only_audio=True).first()
-        yt_stream.download(filename="video.mp4")
-
-        # Convert video to audio 🎧
-        video = AudioSegment.from_file("video.mp4", format="mp4")
-        video.export(audio_file, format=self.audio_format)
-        os.remove("video.mp4")
-
-        return audio_file
-
-    def transcribe_youtube_video(self):
-        audio_file = self.download_youtube_video()
-
-        device = "cuda"
-        batch_size = 16
-        compute_type = "float16"
-
-        # 1. Transcribe with original Whisper (batched) 🗣️
-        model = whisperx.load_model(
-            "large-v2", device, compute_type=compute_type
-        )
-        audio = whisperx.load_audio(audio_file)
-        result = model.transcribe(audio, batch_size=batch_size)
-
-        # 2. Align Whisper output 🔍
-        model_a, metadata = whisperx.load_align_model(
-            language_code=result["language"], device=device
-        )
-        result = whisperx.align(
-            result["segments"],
-            model_a,
-            metadata,
-            audio,
-            device,
-            return_char_alignments=False,
-        )
-
-        # 3. Assign speaker labels 🏷️
-        diarize_model = whisperx.DiarizationPipeline(
-            use_auth_token=self.hf_api_key, device=device
-        )
-        diarize_model(audio_file)
-
-        try:
-            segments = result["segments"]
-            transcription = " ".join(
-                segment["text"] for segment in segments
-            )
-            return transcription
-        except KeyError:
-            print("The key 'segments' is not found in the result.")
-
-    def transcribe(self, audio_file):
-        model = whisperx.load_model(
-            "large-v2", self.device, self.compute_type
-        )
-        audio = whisperx.load_audio(audio_file)
-        result = model.transcribe(audio, batch_size=self.batch_size)
-
-        # 2. Align Whisper output 🔍
-        model_a, metadata = whisperx.load_align_model(
-            language_code=result["language"], device=self.device
-        )
-
-        result = whisperx.align(
-            result["segments"],
-            model_a,
-            metadata,
-            audio,
-            self.device,
-            return_char_alignments=False,
-        )
-
-        # 3. Assign speaker labels 🏷️
-        diarize_model = whisperx.DiarizationPipeline(
-            use_auth_token=self.hf_api_key, device=self.device
-        )
-
-        diarize_model(audio_file)
-
-        try:
-            segments = result["segments"]
-            transcription = " ".join(
-                segment["text"] for segment in segments
-            )
-            return transcription
-        except KeyError:
-            print("The key 'segments' is not found in the result.")
--- a/tests/models/test_whisperx.py
+++ b/tests/models/test_whisperx.py
@ -1,222 +0,0 @@
-import os
-import subprocess
-import tempfile
-from unittest.mock import patch
-
-import pytest
-import whisperx
-from pydub import AudioSegment
-from pytube import YouTube
-from swarms.models.whisperx_model import WhisperX
-
-
-# Fixture to create a temporary directory for testing
-@pytest.fixture
-def temp_dir():
-    with tempfile.TemporaryDirectory() as tempdir:
-        yield tempdir
-
-
-# Mock subprocess.run to prevent actual installation during tests
-@patch.object(subprocess, "run")
-def test_speech_to_text_install(mock_run):
-    stt = WhisperX("https://www.youtube.com/watch?v=MJd6pr16LRM")
-    stt.install()
-    mock_run.assert_called_with(["pip", "install", "whisperx"])
-
-
-# Mock pytube.YouTube and pytube.Streams for download tests
-@patch("pytube.YouTube")
-@patch.object(YouTube, "streams")
-def test_speech_to_text_download_youtube_video(
-    mock_streams, mock_youtube, temp_dir
-):
-    # Mock YouTube and streams
-    video_url = "https://www.youtube.com/watch?v=MJd6pr16LRM"
-    mock_stream = mock_streams().filter().first()
-    mock_stream.download.return_value = os.path.join(
-        temp_dir, "video.mp4"
-    )
-    mock_youtube.return_value = mock_youtube
-    mock_youtube.streams = mock_streams
-
-    stt = WhisperX(video_url)
-    audio_file = stt.download_youtube_video()
-
-    assert os.path.exists(audio_file)
-    assert audio_file.endswith(".mp3")
-
-
-# Mock whisperx.load_model and whisperx.load_audio for transcribe tests
-@patch("whisperx.load_model")
-@patch("whisperx.load_audio")
-@patch("whisperx.load_align_model")
-@patch("whisperx.align")
-@patch.object(whisperx.DiarizationPipeline, "__call__")
-def test_speech_to_text_transcribe_youtube_video(
-    mock_diarization,
-    mock_align,
-    mock_align_model,
-    mock_load_audio,
-    mock_load_model,
-    temp_dir,
-):
-    # Mock whisperx functions
-    mock_load_model.return_value = mock_load_model
-    mock_load_model.transcribe.return_value = {
-        "language": "en",
-        "segments": [{"text": "Hello, World!"}],
-    }
-
-    mock_load_audio.return_value = "audio_path"
-    mock_align_model.return_value = (mock_align_model, "metadata")
-    mock_align.return_value = {
-        "segments": [{"text": "Hello, World!"}]
-    }
-
-    # Mock diarization pipeline
-    mock_diarization.return_value = None
-
-    video_url = "https://www.youtube.com/watch?v=MJd6pr16LRM/video"
-    stt = WhisperX(video_url)
-    transcription = stt.transcribe_youtube_video()
-
-    assert transcription == "Hello, World!"
-
-
-# More tests for different scenarios and edge cases can be added here.
-
-
-# Test transcribe method with provided audio file
-def test_speech_to_text_transcribe_audio_file(temp_dir):
-    # Create a temporary audio file
-    audio_file = os.path.join(temp_dir, "test_audio.mp3")
-    AudioSegment.silent(duration=500).export(audio_file, format="mp3")
-
-    stt = WhisperX("https://www.youtube.com/watch?v=MJd6pr16LRM")
-    transcription = stt.transcribe(audio_file)
-
-    assert transcription == ""
-
-
-# Test transcribe method when Whisperx fails
-@patch("whisperx.load_model")
-@patch("whisperx.load_audio")
-def test_speech_to_text_transcribe_whisperx_failure(
-    mock_load_audio, mock_load_model, temp_dir
-):
-    # Mock whisperx functions to raise an exception
-    mock_load_model.side_effect = Exception("Whisperx failed")
-    mock_load_audio.return_value = "audio_path"
-
-    stt = WhisperX("https://www.youtube.com/watch?v=MJd6pr16LRM")
-    transcription = stt.transcribe("audio_path")
-
-    assert transcription == "Whisperx failed"
-
-
-# Test transcribe method with missing 'segments' key in Whisperx output
-@patch("whisperx.load_model")
-@patch("whisperx.load_audio")
-@patch("whisperx.load_align_model")
-@patch("whisperx.align")
-@patch.object(whisperx.DiarizationPipeline, "__call__")
-def test_speech_to_text_transcribe_missing_segments(
-    mock_diarization,
-    mock_align,
-    mock_align_model,
-    mock_load_audio,
-    mock_load_model,
-):
-    # Mock whisperx functions to return incomplete output
-    mock_load_model.return_value = mock_load_model
-    mock_load_model.transcribe.return_value = {"language": "en"}
-
-    mock_load_audio.return_value = "audio_path"
-    mock_align_model.return_value = (mock_align_model, "metadata")
-    mock_align.return_value = {}
-
-    # Mock diarization pipeline
-    mock_diarization.return_value = None
-
-    stt = WhisperX("https://www.youtube.com/watch?v=MJd6pr16LRM")
-    transcription = stt.transcribe("audio_path")
-
-    assert transcription == ""
-
-
-# Test transcribe method with Whisperx align failure
-@patch("whisperx.load_model")
-@patch("whisperx.load_audio")
-@patch("whisperx.load_align_model")
-@patch("whisperx.align")
-@patch.object(whisperx.DiarizationPipeline, "__call__")
-def test_speech_to_text_transcribe_align_failure(
-    mock_diarization,
-    mock_align,
-    mock_align_model,
-    mock_load_audio,
-    mock_load_model,
-):
-    # Mock whisperx functions to raise an exception during align
-    mock_load_model.return_value = mock_load_model
-    mock_load_model.transcribe.return_value = {
-        "language": "en",
-        "segments": [{"text": "Hello, World!"}],
-    }
-
-    mock_load_audio.return_value = "audio_path"
-    mock_align_model.return_value = (mock_align_model, "metadata")
-    mock_align.side_effect = Exception("Align failed")
-
-    # Mock diarization pipeline
-    mock_diarization.return_value = None
-
-    stt = WhisperX("https://www.youtube.com/watch?v=MJd6pr16LRM")
-    transcription = stt.transcribe("audio_path")
-
-    assert transcription == "Align failed"
-
-
-# Test transcribe_youtube_video when Whisperx diarization fails
-@patch("pytube.YouTube")
-@patch.object(YouTube, "streams")
-@patch("whisperx.DiarizationPipeline")
-@patch("whisperx.load_audio")
-@patch("whisperx.load_align_model")
-@patch("whisperx.align")
-def test_speech_to_text_transcribe_diarization_failure(
-    mock_align,
-    mock_align_model,
-    mock_load_audio,
-    mock_diarization,
-    mock_streams,
-    mock_youtube,
-    temp_dir,
-):
-    # Mock YouTube and streams
-    video_url = "https://www.youtube.com/watch?v=MJd6pr16LRM"
-    mock_stream = mock_streams().filter().first()
-    mock_stream.download.return_value = os.path.join(
-        temp_dir, "video.mp4"
-    )
-    mock_youtube.return_value = mock_youtube
-    mock_youtube.streams = mock_streams
-
-    # Mock whisperx functions
-    mock_load_audio.return_value = "audio_path"
-    mock_align_model.return_value = (mock_align_model, "metadata")
-    mock_align.return_value = {
-        "segments": [{"text": "Hello, World!"}]
-    }
-
-    # Mock diarization pipeline to raise an exception
-    mock_diarization.side_effect = Exception("Diarization failed")
-
-    stt = WhisperX(video_url)
-    transcription = stt.transcribe_youtube_video()
-
-    assert transcription == "Diarization failed"
-
-
-# Add more tests for other scenarios and edge cases as needed.