main 0.4.2
Kye 2 years ago
parent 0200ce7461
commit ac4ce99148

@ -1,30 +1,25 @@
transformers
openai
env
langchain
torch
torch==2.0.0
torchvision
asyncio
nest_asyncio
# faiss
bs4
playwright
duckduckgo_search
faiss-cpu
wget==3.2
accelerate
accelerate==0.17.1
addict
albumentations
basicsr
controlnet-aux
diffusers
diffusers==0.14.0
einops
gradio
imageio
imageio-ffmpeg
# GroundingDINO
invisible-watermark
git+https://github.com/facebookresearch/segment-anything.git
kornia
numpy
omegaconf
@ -39,7 +34,7 @@ torchmetrics
webdataset
yapf
wolframalpha
wikipedia
wikipedia==1.4.0
httpx
ggl
gradio_tools
@ -47,10 +42,27 @@ arxiv
google-api-python-client
google-auth-oauth
google-auth-httplib2
beautifulsoup4
beautifulsoup4==4.11.2
O365
# whisperx
pytube
pydub
git+https://github.com/m-bain/whisperx.git
llama-index
fastapi==0.94.1
pydantic==1.10.6
tenacity==8.2.2
python-dotenv==1.0.0
pillow==9.4.0
boto3==1.26.94
uvicorn==0.21.1
python-ptrace==0.9.8
jinja2==3.1.2
python-multipart==0.0.6
celery==5.2.7
redis==4.5.4
sentencepiece==0.1.97
bitsandbytes==0.37.2
psycopg2-binary==2.9.5
google-search-results==2.4.2
black==23.1.0

@ -4,7 +4,7 @@ from setuptools import setup, find_packages
setup(
name = 'swarms',
packages = find_packages(exclude=[]),
version = '0.4.1',
version = '0.4.2',
license='MIT',
description = 'Swarms - Pytorch',
author = 'Kye Gomez',
@ -31,6 +31,7 @@ setup(
"playwright",
"duckduckgo_search",
"faiss-cpu",
"python-ptrace==0.9.8",
"wget==3.2",
"accelerate",
"addict",

@ -2153,124 +2153,124 @@ router_toolkit = VectorStoreRouterToolkit(
############################################### ===========================> Whisperx speech to text
import os
from pydantic import BaseModel, Field
from pydub import AudioSegment
from pytube import YouTube
import whisperx
from langchain.tools import tool
# Hugging Face access token, read from the HF_API_KEY environment variable.
# Looked up with .get() so that merely importing this module does not raise
# KeyError on machines where the variable is unset; the value is None then.
hf_api_key = os.environ.get("HF_API_KEY")
# Argument schema for the YouTube transcription tool: a single required
# string field carrying the URL of the video to transcribe.
# (Kept as comments rather than a class docstring so the generated pydantic
# schema is unchanged.)
class YouTubeVideoInput(BaseModel):
    video_url: str = Field(..., description="YouTube Video URL to transcribe")
def download_youtube_video(video_url, audio_format='mp3'):
    """Download the audio of a YouTube video and convert it to ``audio_format``.

    Args:
        video_url: URL of the YouTube video.
        audio_format: Format name handed to pydub's ``export`` and used as the
            output file extension. Defaults to ``'mp3'``.

    Returns:
        The path of the converted file, ``video.<audio_format>``, relative to
        the current working directory.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    import tempfile

    audio_file = f'video.{audio_format}'

    # Download video
    yt = YouTube(video_url)
    yt_stream = yt.streams.filter(only_audio=True).first()
    if yt_stream is None:
        # .first() yields None for an empty query; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError(f"No audio-only stream available for {video_url!r}")

    # The intermediate download lives in a throw-away directory so that it
    # (a) can never collide with the output when audio_format == 'mp4', and
    # (b) is removed even when the conversion below raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        yt_stream.download(output_path=tmp_dir, filename='video.mp4')
        downloaded_path = os.path.join(tmp_dir, 'video.mp4')

        # Convert video to audio
        video = AudioSegment.from_file(downloaded_path, format="mp4")
        video.export(audio_file, format=audio_format)

    return audio_file
# def download_youtube_video(video_url, audio_format='mp3'):
# audio_file = f'video.{audio_format}'
# LangChain tool: downloads the audio of a YouTube video, transcribes it with
# WhisperX on CUDA, aligns the segments and returns their text joined by
# single spaces. The docstring below is left untouched because the @tool
# decorator is given the function and its docstring.
@tool("transcribe_youtube_video", args_schema=YouTubeVideoInput, return_direct=True)
def transcribe_youtube_video(video_url: str) -> str:
    """Transcribes a YouTube video."""
    audio_file = download_youtube_video(video_url)

    device = "cuda"
    batch_size = 16
    compute_type = "float16"

    # 1. Transcribe with original Whisper (batched)
    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)

    # 2. Align Whisper output
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    # NOTE: the previous version also built and ran a DiarizationPipeline here,
    # but its output was never merged into `result`, so it had no effect on the
    # returned text. It has been removed; re-add it together with the step that
    # assigns speakers if speaker labels are wanted.

    try:
        return " ".join(segment['text'] for segment in result["segments"])
    except KeyError:
        # Keep the original diagnostic, but also hand it back so the declared
        # `-> str` contract holds instead of implicitly returning None.
        message = "The key 'segments' is not found in the result."
        print(message)
        return message
# # 1. Transcribe with original Whisper (batched)
# model = whisperx.load_model("large-v2", device, compute_type=compute_type)
# audio = whisperx.load_audio(audio_file)
# result = model.transcribe(audio, batch_size=batch_size)
# # 2. Align Whisper output
# model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
# result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
# # 3. Assign speaker labels
################################################### BASE WHISPER TOOL
from typing import Optional, Type
from pydantic import BaseModel, Field
from langchain.tools import BaseTool
from langchain.callbacks.manager import (
AsyncCallbackManagerForToolRun,
CallbackManagerForToolRun,
)
import requests
import whisperx
# diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
# diarize_segments = diarize_model(audio_file)
# Argument schema for the audio transcription tool: a single required string
# field holding the path of the audio file.
# (Kept as comments rather than a class docstring so the generated pydantic
# schema is unchanged.)
class AudioInput(BaseModel):
    audio_file: str = Field(..., description="Path to audio file")
# try:
# segments = result["segments"]
# transcription = " ".join(segment['text'] for segment in segments)
# return transcription
# except KeyError:
# print("The key 'segments' is not found in the result.")
# LangChain tool that transcribes an audio file with WhisperX and returns the
# aligned segment texts joined by single spaces. Only synchronous execution is
# implemented.
class TranscribeAudioTool(BaseTool):
    name = "transcribe_audio"
    description = "Transcribes an audio file using WhisperX"
    args_schema: Type[AudioInput] = AudioInput

    def _run(
        self,
        audio_file: str,
        device: str = "cuda",
        batch_size: int = 16,
        compute_type: str = "float16",
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool.

        Args:
            audio_file: Path of the audio file to transcribe.
            device: Device string passed to the WhisperX model loaders.
            batch_size: Batch size passed to ``model.transcribe``.
            compute_type: Compute type passed to ``whisperx.load_model``.
            run_manager: Optional callback manager; not used by this method.

        Returns:
            The segment texts joined by spaces, or a diagnostic message when
            an expected key is missing from the WhisperX result.
        """
        # Transcribe (batched), then align the segments.
        model = whisperx.load_model("large-v2", device, compute_type=compute_type)
        audio = whisperx.load_audio(audio_file)
        result = model.transcribe(audio, batch_size=batch_size)

        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

        # NOTE: the previous version also built and ran a DiarizationPipeline
        # here, but its output was never merged into `result`, so it had no
        # effect on the returned text. It has been removed.

        try:
            return " ".join(segment['text'] for segment in result["segments"])
        except KeyError:
            # Keep the original diagnostic, but also return it so the declared
            # `-> str` contract holds instead of implicitly returning None.
            message = "The key 'segments' is not found in the result."
            print(message)
            return message

    async def _arun(
        self,
        audio_file: str,
        device: str = "cuda",
        batch_size: int = 16,
        compute_type: str = "float16",
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool asynchronously.

        Raises:
            NotImplementedError: Always; there is no async implementation.
        """
        raise NotImplementedError("transcribe_audio does not support async")
# ################################################### BASE WHISPER TOOL
# from typing import Optional, Type
# from pydantic import BaseModel, Field
# from langchain.tools import BaseTool
# from langchain.callbacks.manager import (
# AsyncCallbackManagerForToolRun,
# CallbackManagerForToolRun,
# )
# import requests
# import whisperx
# class AudioInput(BaseModel):
# audio_file: str = Field(description="Path to audio file")
# class TranscribeAudioTool(BaseTool):
# name = "transcribe_audio"
# description = "Transcribes an audio file using WhisperX"
# args_schema: Type[AudioInput] = AudioInput
# def _run(
# self,
# audio_file: str,
# device: str = "cuda",
# batch_size: int = 16,
# compute_type: str = "float16",
# run_manager: Optional[CallbackManagerForToolRun] = None,
# ) -> str:
# """Use the tool."""
# model = whisperx.load_model("large-v2", device, compute_type=compute_type)
# audio = whisperx.load_audio(audio_file)
# result = model.transcribe(audio, batch_size=batch_size)
# model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
# result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
# diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
# diarize_segments = diarize_model(audio_file)
# try:
# segments = result["segments"]
# transcription = " ".join(segment['text'] for segment in segments)
# return transcription
# except KeyError:
# print("The key 'segments' is not found in the result.")
# async def _arun(
# self,
# audio_file: str,
# device: str = "cuda",
# batch_size: int = 16,
# compute_type: str = "float16",
# run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
# ) -> str:
# """Use the tool asynchronously."""
# raise NotImplementedError("transcribe_audio does not support async")

Loading…
Cancel
Save