From 903eeeb1a0ca0b2da11467fa9bb663635ad53929 Mon Sep 17 00:00:00 2001
From: Kye
Date: Tue, 26 Sep 2023 09:22:00 -0400
Subject: [PATCH] speech to text tool

---
 swarms/agents/multi_modal_agent.py |   1 -
 swarms/tools/stt.py                | 116 +++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 swarms/tools/stt.py

diff --git a/swarms/agents/multi_modal_agent.py b/swarms/agents/multi_modal_agent.py
index 31de5ece..2b9eb5a6 100644
--- a/swarms/agents/multi_modal_agent.py
+++ b/swarms/agents/multi_modal_agent.py
@@ -8,7 +8,6 @@ class MultiModalVisualAgent:
         agent: MultiModalVisualAgent
     ):
         self.agent = agent
-        self.plan = plan
 
     async def run(self, text: str) -> str:
         #run the multi-modal visual agent with the give task
diff --git a/swarms/tools/stt.py b/swarms/tools/stt.py
new file mode 100644
index 00000000..c442dd03
--- /dev/null
+++ b/swarms/tools/stt.py
@@ -0,0 +1,116 @@
+# speech to text
+
+import os
+from pydub import AudioSegment
+from pytube import YouTube
+import whisperx
+
+class SpeechToText:
+    def __init__(
+        self,
+        video_url,
+        audio_format='mp3',
+        device='cuda',
+        batch_size=16,
+        compute_type="float16",
+        hf_api_key=None
+    ):
+        """
+        Transcribe the audio track of a YouTube video with whisperx.
+
+        # Example usage
+        video_url = "url"
+        speech_to_text = SpeechToText(video_url)
+        transcription = speech_to_text.transcribe_youtube_video()
+        print(transcription)
+        """
+        self.video_url = video_url
+        self.audio_format = audio_format
+        self.device = device
+        self.batch_size = batch_size
+        self.compute_type = compute_type
+        self.hf_api_key = hf_api_key
+
+    def download_youtube_video(self):
+        """Download the video and convert it to an audio file."""
+        audio_file = f'video.{self.audio_format}'
+
+        # Download video 📥
+        yt = YouTube(self.video_url)
+        yt_stream = yt.streams.filter(only_audio=True).first()
+        yt_stream.download(filename='video.mp4')
+
+        # Convert video to audio 🎧
+        video = AudioSegment.from_file("video.mp4", format="mp4")
+        video.export(audio_file, format=self.audio_format)
+        os.remove("video.mp4")
+
+        return audio_file
+
+    def transcribe_youtube_video(self):
+        """Download the video, then transcribe, align, and diarize its audio."""
+        audio_file = self.download_youtube_video()
+
+        # 1. Transcribe with original Whisper (batched) 🗣️
+        model = whisperx.load_model("large-v2", self.device, compute_type=self.compute_type)
+        audio = whisperx.load_audio(audio_file)
+        result = model.transcribe(audio, batch_size=self.batch_size)
+
+        # 2. Align Whisper output 🔍
+        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=self.device)
+        result = whisperx.align(result["segments"], model_a, metadata, audio, self.device, return_char_alignments=False)
+
+        # 3. Assign speaker labels 🏷️
+        diarize_model = whisperx.DiarizationPipeline(
+            use_auth_token=self.hf_api_key,
+            device=self.device
+        )
+        diarize_segments = diarize_model(audio_file)
+        result = whisperx.assign_word_speakers(diarize_segments, result)
+
+        try:
+            segments = result["segments"]
+            transcription = " ".join(segment['text'] for segment in segments)
+            return transcription
+        except KeyError:
+            print("The key 'segments' was not found in the result.")
+
+    def transcribe(self, audio_file):
+        """Transcribe, align, and diarize an existing audio file."""
+        # 1. Transcribe with original Whisper (batched) 🗣️
+        model = whisperx.load_model(
+            "large-v2",
+            self.device,
+            compute_type=self.compute_type
+        )
+        audio = whisperx.load_audio(audio_file)
+        result = model.transcribe(
+            audio,
+            batch_size=self.batch_size
+        )
+
+        # 2. Align Whisper output 🔍
+        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=self.device)
+        result = whisperx.align(
+            result["segments"],
+            model_a,
+            metadata,
+            audio,
+            self.device,
+            return_char_alignments=False
+        )
+
+        # 3. Assign speaker labels 🏷️
+        diarize_model = whisperx.DiarizationPipeline(
+            use_auth_token=self.hf_api_key,
+            device=self.device
+        )
+        diarize_segments = diarize_model(audio_file)
+        result = whisperx.assign_word_speakers(diarize_segments, result)
+
+        try:
+            segments = result["segments"]
+            transcription = " ".join(segment['text'] for segment in segments)
+            return transcription
+        except KeyError:
+            print("The key 'segments' was not found in the result.")
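
A minimal usage sketch of the new tool, mirroring the example in the SpeechToText docstring; the YouTube URL and Hugging Face token below are placeholders rather than values taken from this patch, and the defaults assume a CUDA-capable machine:

    from swarms.tools.stt import SpeechToText

    # Placeholder URL and token -- replace with real values before running.
    video_url = "https://www.youtube.com/watch?v=example"
    stt = SpeechToText(video_url, hf_api_key="<hugging-face-token>")

    # Downloads the audio with pytube, then transcribes, aligns,
    # and diarizes it with whisperx.
    transcription = stt.transcribe_youtube_video()
    print(transcription)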