parent
1897fa1d1f
commit
5df1fb884d
@ -0,0 +1,116 @@
|
||||
#speech to text
|
||||
|
||||
import os
|
||||
from pydub import AudioSegment
|
||||
from pytube import YouTube
|
||||
import whisperx
|
||||
|
||||
class SpeechToText:
    """Download the audio of a YouTube video and transcribe it with WhisperX.

    Example usage::

        video_url = "url"
        speech_to_text = SpeechToText(video_url)
        transcription = speech_to_text.transcribe_youtube_video()
        print(transcription)
    """

    def __init__(
        self,
        video_url,
        audio_format='mp3',
        device='cuda',
        batch_size=16,
        compute_type="float16",
        hf_api_key=None,
    ):
        """Store the configuration used by the download and transcription steps.

        Args:
            video_url: URL handed to ``pytube.YouTube``.
            audio_format: Format (and file extension) of the exported audio file.
            device: Device string passed to every WhisperX model loader.
            batch_size: Batch size passed to ``model.transcribe``.
            compute_type: Compute type passed to ``whisperx.load_model``.
            hf_api_key: Token passed as ``use_auth_token`` to the diarization
                pipeline.
        """
        self.video_url = video_url
        self.audio_format = audio_format
        self.device = device
        self.batch_size = batch_size
        self.compute_type = compute_type
        self.hf_api_key = hf_api_key

    def download_youtube_video(self):
        """Download the video's audio stream and convert it to ``audio_format``.

        The stream is saved to the temporary file ``video.mp4`` in the current
        working directory, converted with pydub, and the temporary file is
        removed afterwards.

        Returns:
            The path of the exported audio file, ``video.<audio_format>``.

        Raises:
            ValueError: If the video exposes no audio-only stream.
        """
        audio_file = f'video.{self.audio_format}'

        # Download video 📥
        yt = YouTube(self.video_url)
        yt_stream = yt.streams.filter(only_audio=True).first()
        if yt_stream is None:
            # first() yields None when the filter matches nothing; fail with a
            # clear message instead of an AttributeError on the next line.
            raise ValueError(
                f"No audio-only stream found for {self.video_url!r}"
            )
        yt_stream.download(filename='video.mp4')

        # Convert video to audio 🎧
        try:
            video = AudioSegment.from_file("video.mp4", format="mp4")
            video.export(audio_file, format=self.audio_format)
        finally:
            # Remove the temporary download even when the conversion fails.
            os.remove("video.mp4")

        return audio_file

    def transcribe_youtube_video(self):
        """Download the configured video and return its transcription.

        Uses the device, batch size and compute type given to the constructor
        (the defaults match the values that used to be hard-coded here).

        Returns:
            The transcription text, or ``None`` if the aligned result has no
            usable ``'segments'`` entry (see :meth:`transcribe`).
        """
        audio_file = self.download_youtube_video()
        return self.transcribe(audio_file)

    def transcribe(self, audio_file):
        """Transcribe an audio file with WhisperX and return the joined text.

        Args:
            audio_file: Path of the audio file to transcribe.

        Returns:
            The text of all aligned segments joined with single spaces, or
            ``None`` (after printing a message) when a ``KeyError`` occurs
            while reading the segments.
        """
        # 1. Transcribe with original Whisper (batched) 🗣️
        # compute_type is passed by keyword: the third positional parameter of
        # whisperx.load_model is not compute_type in every whisperx release.
        model = whisperx.load_model(
            "large-v2",
            self.device,
            compute_type=self.compute_type,
        )
        audio = whisperx.load_audio(audio_file)
        result = model.transcribe(audio, batch_size=self.batch_size)

        # 2. Align Whisper output 🔍
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"],
            device=self.device,
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            self.device,
            return_char_alignments=False,
        )

        # 3. Assign speaker labels 🏷️
        diarize_model = whisperx.DiarizationPipeline(
            use_auth_token=self.hf_api_key,
            device=self.device,
        )
        # NOTE(review): the diarization output is not merged into the returned
        # text; the call is kept so the pipeline's existing behaviour (model
        # loading, token validation) is unchanged.
        diarize_model(audio_file)

        try:
            segments = result["segments"]
            transcription = " ".join(segment['text'] for segment in segments)
        except KeyError:
            print("The key 'segments' is not found in the result.")
            return None
        return transcription
|
||||
|
||||
|
Loading…
Reference in new issue