main 0.4.2
Kye 2 years ago
parent 0200ce7461
commit ac4ce99148

@@ -1,30 +1,25 @@
 transformers
 openai
-env
 langchain
-torch
+torch==2.0.0
 torchvision
 asyncio
 nest_asyncio
-# faiss
 bs4
 playwright
 duckduckgo_search
 faiss-cpu
 wget==3.2
-accelerate
+accelerate==0.17.1
 addict
 albumentations
 basicsr
 controlnet-aux
-diffusers
+diffusers==0.14.0
 einops
 gradio
 imageio
 imageio-ffmpeg
-# GroundingDINO
-invisible-watermark
-git+https://github.com/facebookresearch/segment-anything.git
 kornia
 numpy
 omegaconf
@@ -39,7 +34,7 @@ torchmetrics
 webdataset
 yapf
 wolframalpha
-wikipedia
+wikipedia==1.4.0
 httpx
 ggl
 gradio_tools
@@ -47,10 +42,27 @@ arxiv
 google-api-python-client
 google-auth-oauth
 google-auth-httplib2
-beautifulsoup4
+beautifulsoup4==4.11.2
 O365
-# whisperx
 pytube
 pydub
-git+https://github.com/m-bain/whisperx.git
 llama-index
+fastapi==0.94.1
+pydantic==1.10.6
+tenacity==8.2.2
+python-dotenv==1.0.0
+pillow==9.4.0
+boto3==1.26.94
+uvicorn==0.21.1
+python-ptrace==0.9.8
+jinja2==3.1.2
+python-multipart==0.0.6
+celery==5.2.7
+redis==4.5.4
+sentencepiece==0.1.97
+bitsandbytes==0.37.2
+psycopg2-binary==2.9.5
+google-search-results==2.4.2
+black==23.1.0

@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'swarms',
   packages = find_packages(exclude=[]),
-  version = '0.4.1',
+  version = '0.4.2',
   license='MIT',
   description = 'Swarms - Pytorch',
   author = 'Kye Gomez',
@@ -31,6 +31,7 @@ setup(
     "playwright",
     "duckduckgo_search",
     "faiss-cpu",
+    "python-ptrace==0.9.8",
     "wget==3.2",
     "accelerate",
     "addict",

@@ -2153,124 +2153,124 @@ router_toolkit = VectorStoreRouterToolkit(
 ############################################### ===========================> Whisperx speech to text
-import os
-from pydantic import BaseModel, Field
-from pydub import AudioSegment
-from pytube import YouTube
-import whisperx
-from langchain.tools import tool
-hf_api_key = os.environ["HF_API_KEY"]
-# define a custom input schema for the youtube url
-class YouTubeVideoInput(BaseModel):
-    video_url: str = Field(description="YouTube Video URL to transcribe")
-def download_youtube_video(video_url, audio_format='mp3'):
-    audio_file = f'video.{audio_format}'
-    # Download video
-    yt = YouTube(video_url)
-    yt_stream = yt.streams.filter(only_audio=True).first()
-    yt_stream.download(filename='video.mp4')
-    # Convert video to audio
-    video = AudioSegment.from_file("video.mp4", format="mp4")
-    video.export(audio_file, format=audio_format)
-    os.remove("video.mp4")
-    return audio_file
-@tool("transcribe_youtube_video", args_schema=YouTubeVideoInput, return_direct=True)
-def transcribe_youtube_video(video_url: str) -> str:
-    """Transcribes a YouTube video."""
-    audio_file = download_youtube_video(video_url)
-    device = "cuda"
-    batch_size = 16
-    compute_type = "float16"
-    # 1. Transcribe with original Whisper (batched)
-    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
-    audio = whisperx.load_audio(audio_file)
-    result = model.transcribe(audio, batch_size=batch_size)
-    # 2. Align Whisper output
-    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
-    # 3. Assign speaker labels
-    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-    diarize_segments = diarize_model(audio_file)
-    try:
-        segments = result["segments"]
-        transcription = " ".join(segment['text'] for segment in segments)
-        return transcription
-    except KeyError:
-        print("The key 'segments' is not found in the result.")
-################################################### BASE WHISPER TOOL
-from typing import Optional, Type
-from pydantic import BaseModel, Field
-from langchain.tools import BaseTool
-from langchain.callbacks.manager import (
-    AsyncCallbackManagerForToolRun,
-    CallbackManagerForToolRun,
-)
-import requests
-import whisperx
-class AudioInput(BaseModel):
-    audio_file: str = Field(description="Path to audio file")
-class TranscribeAudioTool(BaseTool):
-    name = "transcribe_audio"
-    description = "Transcribes an audio file using WhisperX"
-    args_schema: Type[AudioInput] = AudioInput
-    def _run(
-        self,
-        audio_file: str,
-        device: str = "cuda",
-        batch_size: int = 16,
-        compute_type: str = "float16",
-        run_manager: Optional[CallbackManagerForToolRun] = None,
-    ) -> str:
-        """Use the tool."""
-        model = whisperx.load_model("large-v2", device, compute_type=compute_type)
-        audio = whisperx.load_audio(audio_file)
-        result = model.transcribe(audio, batch_size=batch_size)
-        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
-        diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-        diarize_segments = diarize_model(audio_file)
-        try:
-            segments = result["segments"]
-            transcription = " ".join(segment['text'] for segment in segments)
-            return transcription
-        except KeyError:
-            print("The key 'segments' is not found in the result.")
-    async def _arun(
-        self,
-        audio_file: str,
-        device: str = "cuda",
-        batch_size: int = 16,
-        compute_type: str = "float16",
-        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
-    ) -> str:
-        """Use the tool asynchronously."""
-        raise NotImplementedError("transcribe_audio does not support async")
+# import os
+# from pydantic import BaseModel, Field
+# from pydub import AudioSegment
+# from pytube import YouTube
+# import whisperx
+# from langchain.tools import tool
+# hf_api_key = os.environ["HF_API_KEY"]
+# # define a custom input schema for the youtube url
+# class YouTubeVideoInput(BaseModel):
+# video_url: str = Field(description="YouTube Video URL to transcribe")
+# def download_youtube_video(video_url, audio_format='mp3'):
+# audio_file = f'video.{audio_format}'
+# # Download video
+# yt = YouTube(video_url)
+# yt_stream = yt.streams.filter(only_audio=True).first()
+# yt_stream.download(filename='video.mp4')
+# # Convert video to audio
+# video = AudioSegment.from_file("video.mp4", format="mp4")
+# video.export(audio_file, format=audio_format)
+# os.remove("video.mp4")
+# return audio_file
+# @tool("transcribe_youtube_video", args_schema=YouTubeVideoInput, return_direct=True)
+# def transcribe_youtube_video(video_url: str) -> str:
+# """Transcribes a YouTube video."""
+# audio_file = download_youtube_video(video_url)
+# device = "cuda"
+# batch_size = 16
+# compute_type = "float16"
+# # 1. Transcribe with original Whisper (batched)
+# model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+# audio = whisperx.load_audio(audio_file)
+# result = model.transcribe(audio, batch_size=batch_size)
+# # 2. Align Whisper output
+# model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+# result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+# # 3. Assign speaker labels
+# diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+# diarize_segments = diarize_model(audio_file)
+# try:
+# segments = result["segments"]
+# transcription = " ".join(segment['text'] for segment in segments)
+# return transcription
+# except KeyError:
+# print("The key 'segments' is not found in the result.")
+# ################################################### BASE WHISPER TOOL
+# from typing import Optional, Type
+# from pydantic import BaseModel, Field
+# from langchain.tools import BaseTool
+# from langchain.callbacks.manager import (
+# AsyncCallbackManagerForToolRun,
+# CallbackManagerForToolRun,
+# )
+# import requests
+# import whisperx
+# class AudioInput(BaseModel):
+# audio_file: str = Field(description="Path to audio file")
+# class TranscribeAudioTool(BaseTool):
+# name = "transcribe_audio"
+# description = "Transcribes an audio file using WhisperX"
+# args_schema: Type[AudioInput] = AudioInput
+# def _run(
+# self,
+# audio_file: str,
+# device: str = "cuda",
+# batch_size: int = 16,
+# compute_type: str = "float16",
+# run_manager: Optional[CallbackManagerForToolRun] = None,
+# ) -> str:
+# """Use the tool."""
+# model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+# audio = whisperx.load_audio(audio_file)
+# result = model.transcribe(audio, batch_size=batch_size)
+# model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+# result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+# diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+# diarize_segments = diarize_model(audio_file)
+# try:
+# segments = result["segments"]
+# transcription = " ".join(segment['text'] for segment in segments)
+# return transcription
+# except KeyError:
+# print("The key 'segments' is not found in the result.")
+# async def _arun(
+# self,
+# audio_file: str,
+# device: str = "cuda",
+# batch_size: int = 16,
+# compute_type: str = "float16",
+# run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
+# ) -> str:
+# """Use the tool asynchronously."""
+# raise NotImplementedError("transcribe_audio does not support async")
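
The hunk above comments out the WhisperX speech-to-text tools. For reference, the pipeline they wrapped can still be run on its own; the sketch below mirrors the commented-out code (pytube for the download, pydub for the audio conversion, whisperx for transcription and alignment) and is an illustrative example rather than repository code. It assumes whisperx, pytube and pydub are installed and a CUDA GPU is available, it omits the diarization step that requires HF_API_KEY, and the URL is a placeholder.

# Standalone sketch of the pipeline wrapped by the commented-out tools above.
# Assumes whisperx, pytube and pydub are installed and a CUDA GPU is present;
# the diarization step (which needs HF_API_KEY) is omitted here.
import os

import whisperx
from pydub import AudioSegment
from pytube import YouTube


def download_youtube_audio(video_url: str, audio_format: str = "mp3") -> str:
    """Download a YouTube video's audio track and convert it with pydub."""
    audio_file = f"video.{audio_format}"
    yt_stream = YouTube(video_url).streams.filter(only_audio=True).first()
    yt_stream.download(filename="video.mp4")
    AudioSegment.from_file("video.mp4", format="mp4").export(audio_file, format=audio_format)
    os.remove("video.mp4")
    return audio_file


def transcribe(audio_file: str, device: str = "cuda", batch_size: int = 16) -> str:
    """Transcribe with WhisperX, align the segments, and join the text."""
    model = whisperx.load_model("large-v2", device, compute_type="float16")
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device,
                            return_char_alignments=False)
    return " ".join(segment["text"] for segment in result["segments"])


if __name__ == "__main__":
    # Placeholder URL; substitute a real video before running.
    audio_path = download_youtube_audio("https://www.youtube.com/watch?v=<video-id>")
    print(transcribe(audio_path))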
