commit ac4ce99148 (parent 0200ce7461)
branch main, version 0.4.2
Kye, 2 years ago

@@ -1,30 +1,25 @@
 transformers
 openai
-env
 langchain
-torch
+torch==2.0.0
 torchvision
 asyncio
 nest_asyncio
-# faiss
 bs4
 playwright
 duckduckgo_search
 faiss-cpu
 wget==3.2
-accelerate
+accelerate==0.17.1
 addict
 albumentations
 basicsr
 controlnet-aux
-diffusers
+diffusers==0.14.0
 einops
 gradio
 imageio
 imageio-ffmpeg
-# GroundingDINO
-invisible-watermark
-git+https://github.com/facebookresearch/segment-anything.git
 kornia
 numpy
 omegaconf
@@ -39,7 +34,7 @@ torchmetrics
 webdataset
 yapf
 wolframalpha
-wikipedia
+wikipedia==1.4.0
 httpx
 ggl
 gradio_tools
@@ -47,10 +42,27 @@ arxiv
 google-api-python-client
 google-auth-oauth
 google-auth-httplib2
-beautifulsoup4
+beautifulsoup4==4.11.2
 O365
+# whisperx
 pytube
 pydub
-git+https://github.com/m-bain/whisperx.git
 llama-index
+fastapi==0.94.1
+pydantic==1.10.6
+tenacity==8.2.2
+python-dotenv==1.0.0
+pillow==9.4.0
+boto3==1.26.94
+uvicorn==0.21.1
+python-ptrace==0.9.8
+jinja2==3.1.2
+python-multipart==0.0.6
+celery==5.2.7
+redis==4.5.4
+sentencepiece==0.1.97
+bitsandbytes==0.37.2
+psycopg2-binary==2.9.5
+google-search-results==2.4.2
+black==23.1.0

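Most of what this hunk adds are exact pins (fastapi==0.94.1, pydantic==1.10.6, and so on; fastapi 0.94 still requires pydantic v1, which is why those two are pinned together). A quick way to confirm that an installed environment actually matches the pins; a minimal sketch, assuming requirements.txt sits in the working directory, and skipping comments, VCS lines, and unpinned names:

from importlib.metadata import PackageNotFoundError, version

def check_pins(path="requirements.txt"):
    """Report packages whose installed version differs from the == pin."""
    for raw in open(path):
        line = raw.strip()
        # skip blanks, comments, VCS installs, and unpinned requirements
        if not line or line.startswith("#") or "==" not in line:
            continue
        name, _, pinned = line.partition("==")
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"{name}: not installed")
            continue
        if installed != pinned:
            print(f"{name}: pinned {pinned}, installed {installed}")

if __name__ == "__main__":
    check_pins()

Run it after pip install -r requirements.txt; no output means every pinned package matches.
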
@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'swarms',
   packages = find_packages(exclude=[]),
-  version = '0.4.1',
+  version = '0.4.2',
   license='MIT',
   description = 'Swarms - Pytorch',
   author = 'Kye Gomez',
@@ -31,6 +31,7 @@ setup(
     "playwright",
     "duckduckgo_search",
     "faiss-cpu",
+    "python-ptrace==0.9.8",
     "wget==3.2",
     "accelerate",
     "addict",

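Note that python-ptrace==0.9.8 now has to be kept in sync by hand between requirements.txt (above) and install_requires (here). One common way to remove that duplication is to have setup.py read requirements.txt; a minimal sketch of that refactor, not what this commit does, assuming requirements.txt lives next to setup.py and keeping only the fields visible in this diff:

from pathlib import Path
from setuptools import find_packages, setup

# read runtime dependencies from requirements.txt, dropping comments
# and bare VCS lines that install_requires cannot resolve by name
requirements = [
    line.strip()
    for line in Path("requirements.txt").read_text().splitlines()
    if line.strip() and not line.strip().startswith(("#", "git+"))
]

setup(
    name="swarms",
    version="0.4.2",
    packages=find_packages(exclude=[]),
    license="MIT",
    install_requires=requirements,
)

The git+ line is filtered out because install_requires rejects bare VCS URLs; it would need a PEP 508 "name @ git+https://..." spec instead.
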
@@ -2153,124 +2153,124 @@ router_toolkit = VectorStoreRouterToolkit(
 ############################################### ===========================> Whisperx speech to text
-import os
-from pydantic import BaseModel, Field
-from pydub import AudioSegment
-from pytube import YouTube
-import whisperx
-from langchain.tools import tool
+# import os
+# from pydantic import BaseModel, Field
+# from pydub import AudioSegment
+# from pytube import YouTube
+# import whisperx
+# from langchain.tools import tool

-hf_api_key = os.environ["HF_API_KEY"]
+# hf_api_key = os.environ["HF_API_KEY"]

-# define a custom input schema for the youtube url
-class YouTubeVideoInput(BaseModel):
-    video_url: str = Field(description="YouTube Video URL to transcribe")
+# # define a custom input schema for the youtube url
+# class YouTubeVideoInput(BaseModel):
+#     video_url: str = Field(description="YouTube Video URL to transcribe")

-def download_youtube_video(video_url, audio_format='mp3'):
-    audio_file = f'video.{audio_format}'
-    # Download video
-    yt = YouTube(video_url)
-    yt_stream = yt.streams.filter(only_audio=True).first()
-    yt_stream.download(filename='video.mp4')
-    # Convert video to audio
-    video = AudioSegment.from_file("video.mp4", format="mp4")
-    video.export(audio_file, format=audio_format)
-    os.remove("video.mp4")
-    return audio_file
+# def download_youtube_video(video_url, audio_format='mp3'):
+#     audio_file = f'video.{audio_format}'
+#     # Download video
+#     yt = YouTube(video_url)
+#     yt_stream = yt.streams.filter(only_audio=True).first()
+#     yt_stream.download(filename='video.mp4')
+#     # Convert video to audio
+#     video = AudioSegment.from_file("video.mp4", format="mp4")
+#     video.export(audio_file, format=audio_format)
+#     os.remove("video.mp4")
+#     return audio_file

-@tool("transcribe_youtube_video", args_schema=YouTubeVideoInput, return_direct=True)
-def transcribe_youtube_video(video_url: str) -> str:
-    """Transcribes a YouTube video."""
-    audio_file = download_youtube_video(video_url)
-    device = "cuda"
-    batch_size = 16
-    compute_type = "float16"
-    # 1. Transcribe with original Whisper (batched)
-    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
-    audio = whisperx.load_audio(audio_file)
-    result = model.transcribe(audio, batch_size=batch_size)
-    # 2. Align Whisper output
-    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
-    # 3. Assign speaker labels
-    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-    diarize_segments = diarize_model(audio_file)
-    try:
-        segments = result["segments"]
-        transcription = " ".join(segment['text'] for segment in segments)
-        return transcription
-    except KeyError:
-        print("The key 'segments' is not found in the result.")
+# @tool("transcribe_youtube_video", args_schema=YouTubeVideoInput, return_direct=True)
+# def transcribe_youtube_video(video_url: str) -> str:
+#     """Transcribes a YouTube video."""
+#     audio_file = download_youtube_video(video_url)
+#     device = "cuda"
+#     batch_size = 16
+#     compute_type = "float16"
+#     # 1. Transcribe with original Whisper (batched)
+#     model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+#     audio = whisperx.load_audio(audio_file)
+#     result = model.transcribe(audio, batch_size=batch_size)
+#     # 2. Align Whisper output
+#     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+#     result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+#     # 3. Assign speaker labels
+#     diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+#     diarize_segments = diarize_model(audio_file)
+#     try:
+#         segments = result["segments"]
+#         transcription = " ".join(segment['text'] for segment in segments)
+#         return transcription
+#     except KeyError:
+#         print("The key 'segments' is not found in the result.")

-################################################### BASE WHISPER TOOL
-from typing import Optional, Type
-from pydantic import BaseModel, Field
-from langchain.tools import BaseTool
-from langchain.callbacks.manager import (
-    AsyncCallbackManagerForToolRun,
-    CallbackManagerForToolRun,
-)
-import requests
-import whisperx
+# ################################################### BASE WHISPER TOOL
+# from typing import Optional, Type
+# from pydantic import BaseModel, Field
+# from langchain.tools import BaseTool
+# from langchain.callbacks.manager import (
+#     AsyncCallbackManagerForToolRun,
+#     CallbackManagerForToolRun,
+# )
+# import requests
+# import whisperx

-class AudioInput(BaseModel):
-    audio_file: str = Field(description="Path to audio file")
+# class AudioInput(BaseModel):
+#     audio_file: str = Field(description="Path to audio file")

-class TranscribeAudioTool(BaseTool):
-    name = "transcribe_audio"
-    description = "Transcribes an audio file using WhisperX"
-    args_schema: Type[AudioInput] = AudioInput
-    def _run(
-        self,
-        audio_file: str,
-        device: str = "cuda",
-        batch_size: int = 16,
-        compute_type: str = "float16",
-        run_manager: Optional[CallbackManagerForToolRun] = None,
-    ) -> str:
-        """Use the tool."""
-        model = whisperx.load_model("large-v2", device, compute_type=compute_type)
-        audio = whisperx.load_audio(audio_file)
-        result = model.transcribe(audio, batch_size=batch_size)
-        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
-        diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-        diarize_segments = diarize_model(audio_file)
-        try:
-            segments = result["segments"]
-            transcription = " ".join(segment['text'] for segment in segments)
-            return transcription
-        except KeyError:
-            print("The key 'segments' is not found in the result.")
-    async def _arun(
-        self,
-        audio_file: str,
-        device: str = "cuda",
-        batch_size: int = 16,
-        compute_type: str = "float16",
-        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
-    ) -> str:
-        """Use the tool asynchronously."""
-        raise NotImplementedError("transcribe_audio does not support async")
+# class TranscribeAudioTool(BaseTool):
+#     name = "transcribe_audio"
+#     description = "Transcribes an audio file using WhisperX"
+#     args_schema: Type[AudioInput] = AudioInput
+#     def _run(
+#         self,
+#         audio_file: str,
+#         device: str = "cuda",
+#         batch_size: int = 16,
+#         compute_type: str = "float16",
+#         run_manager: Optional[CallbackManagerForToolRun] = None,
+#     ) -> str:
+#         """Use the tool."""
+#         model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+#         audio = whisperx.load_audio(audio_file)
+#         result = model.transcribe(audio, batch_size=batch_size)
+#         model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+#         result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+#         diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+#         diarize_segments = diarize_model(audio_file)
+#         try:
+#             segments = result["segments"]
+#             transcription = " ".join(segment['text'] for segment in segments)
+#             return transcription
+#         except KeyError:
+#             print("The key 'segments' is not found in the result.")
+#     async def _arun(
+#         self,
+#         audio_file: str,
+#         device: str = "cuda",
+#         batch_size: int = 16,
+#         compute_type: str = "float16",
+#         run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
+#     ) -> str:
+#         """Use the tool asynchronously."""
+#         raise NotImplementedError("transcribe_audio does not support async")

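The whole WhisperX section is commented out rather than deleted, and the matching git+https://github.com/m-bain/whisperx.git line is gone from requirements.txt, so the import at the top of the block would now fail at module load. If the tools are revived later, guarding the import keeps the module usable without the extra dependency; a minimal sketch under that assumption (transcribe is a hypothetical helper, and the whisperx calls mirror the ones in the commented-out code):

try:
    import whisperx  # optional, heavy dependency installed from git
except ImportError:
    whisperx = None

def transcribe(audio_file: str, device: str = "cuda") -> str:
    """Transcribe an audio file, failing clearly if whisperx is absent."""
    if whisperx is None:
        raise RuntimeError(
            "whisperx is not installed; run "
            "pip install git+https://github.com/m-bain/whisperx.git"
        )
    # same call sequence as the commented-out tool above
    model = whisperx.load_model("large-v2", device, compute_type="float16")
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=16)
    return " ".join(segment["text"] for segment in result["segments"])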