main 0.4.2
Kye 2 years ago
parent 0200ce7461
commit ac4ce99148

@@ -1,30 +1,25 @@
 transformers
 openai
-env
 langchain
-torch
+torch==2.0.0
 torchvision
 asyncio
 nest_asyncio
-# faiss
 bs4
 playwright
 duckduckgo_search
 faiss-cpu
 wget==3.2
-accelerate
+accelerate==0.17.1
 addict
 albumentations
 basicsr
 controlnet-aux
-diffusers
+diffusers==0.14.0
 einops
 gradio
 imageio
 imageio-ffmpeg
-# GroundingDINO
-invisible-watermark
-git+https://github.com/facebookresearch/segment-anything.git
 kornia
 numpy
 omegaconf
@@ -39,7 +34,7 @@ torchmetrics
 webdataset
 yapf
 wolframalpha
-wikipedia
+wikipedia==1.4.0
 httpx
 ggl
 gradio_tools
@@ -47,10 +42,27 @@ arxiv
 google-api-python-client
 google-auth-oauth
 google-auth-httplib2
-beautifulsoup4
+beautifulsoup4==4.11.2
 O365
-# whisperx
 pytube
 pydub
-git+https://github.com/m-bain/whisperx.git
 llama-index
+fastapi==0.94.1
+pydantic==1.10.6
+tenacity==8.2.2
+python-dotenv==1.0.0
+pillow==9.4.0
+boto3==1.26.94
+uvicorn==0.21.1
+python-ptrace==0.9.8
+jinja2==3.1.2
+python-multipart==0.0.6
+celery==5.2.7
+redis==4.5.4
+sentencepiece==0.1.97
+bitsandbytes==0.37.2
+psycopg2-binary==2.9.5
+google-search-results==2.4.2
+black==23.1.0

@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'swarms',
   packages = find_packages(exclude=[]),
-  version = '0.4.1',
+  version = '0.4.2',
   license='MIT',
   description = 'Swarms - Pytorch',
   author = 'Kye Gomez',
@@ -31,6 +31,7 @@ setup(
     "playwright",
     "duckduckgo_search",
     "faiss-cpu",
+    "python-ptrace==0.9.8",
     "wget==3.2",
     "accelerate",
     "addict",

@@ -2153,124 +2153,124 @@ router_toolkit = VectorStoreRouterToolkit(
 ############################################### ===========================> Whisperx speech to text
-import os
-from pydantic import BaseModel, Field
-from pydub import AudioSegment
-from pytube import YouTube
-import whisperx
-from langchain.tools import tool
-hf_api_key = os.environ["HF_API_KEY"]
-# define a custom input schema for the youtube url
-class YouTubeVideoInput(BaseModel):
-    video_url: str = Field(description="YouTube Video URL to transcribe")
-def download_youtube_video(video_url, audio_format='mp3'):
-    audio_file = f'video.{audio_format}'
-    # Download video
-    yt = YouTube(video_url)
-    yt_stream = yt.streams.filter(only_audio=True).first()
-    yt_stream.download(filename='video.mp4')
-    # Convert video to audio
-    video = AudioSegment.from_file("video.mp4", format="mp4")
-    video.export(audio_file, format=audio_format)
-    os.remove("video.mp4")
-    return audio_file
-@tool("transcribe_youtube_video", args_schema=YouTubeVideoInput, return_direct=True)
-def transcribe_youtube_video(video_url: str) -> str:
-    """Transcribes a YouTube video."""
-    audio_file = download_youtube_video(video_url)
-    device = "cuda"
-    batch_size = 16
-    compute_type = "float16"
-    # 1. Transcribe with original Whisper (batched)
-    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
-    audio = whisperx.load_audio(audio_file)
-    result = model.transcribe(audio, batch_size=batch_size)
-    # 2. Align Whisper output
-    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
-    # 3. Assign speaker labels
-    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-    diarize_segments = diarize_model(audio_file)
-    try:
-        segments = result["segments"]
-        transcription = " ".join(segment['text'] for segment in segments)
-        return transcription
-    except KeyError:
-        print("The key 'segments' is not found in the result.")
-################################################### BASE WHISPER TOOL
-from typing import Optional, Type
-from pydantic import BaseModel, Field
-from langchain.tools import BaseTool
-from langchain.callbacks.manager import (
-    AsyncCallbackManagerForToolRun,
-    CallbackManagerForToolRun,
-)
-import requests
-import whisperx
-class AudioInput(BaseModel):
-    audio_file: str = Field(description="Path to audio file")
-class TranscribeAudioTool(BaseTool):
-    name = "transcribe_audio"
-    description = "Transcribes an audio file using WhisperX"
-    args_schema: Type[AudioInput] = AudioInput
-    def _run(
-        self,
-        audio_file: str,
-        device: str = "cuda",
-        batch_size: int = 16,
-        compute_type: str = "float16",
-        run_manager: Optional[CallbackManagerForToolRun] = None,
-    ) -> str:
-        """Use the tool."""
-        model = whisperx.load_model("large-v2", device, compute_type=compute_type)
-        audio = whisperx.load_audio(audio_file)
-        result = model.transcribe(audio, batch_size=batch_size)
-        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
-        diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-        diarize_segments = diarize_model(audio_file)
-        try:
-            segments = result["segments"]
-            transcription = " ".join(segment['text'] for segment in segments)
-            return transcription
-        except KeyError:
-            print("The key 'segments' is not found in the result.")
-    async def _arun(
-        self,
-        audio_file: str,
-        device: str = "cuda",
-        batch_size: int = 16,
-        compute_type: str = "float16",
-        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
-    ) -> str:
-        """Use the tool asynchronously."""
-        raise NotImplementedError("transcribe_audio does not support async")
+# import os
+# from pydantic import BaseModel, Field
+# from pydub import AudioSegment
+# from pytube import YouTube
+# import whisperx
+# from langchain.tools import tool
+# hf_api_key = os.environ["HF_API_KEY"]
+# # define a custom input schema for the youtube url
+# class YouTubeVideoInput(BaseModel):
+# video_url: str = Field(description="YouTube Video URL to transcribe")
+# def download_youtube_video(video_url, audio_format='mp3'):
+# audio_file = f'video.{audio_format}'
+# # Download video
+# yt = YouTube(video_url)
+# yt_stream = yt.streams.filter(only_audio=True).first()
+# yt_stream.download(filename='video.mp4')
+# # Convert video to audio
+# video = AudioSegment.from_file("video.mp4", format="mp4")
+# video.export(audio_file, format=audio_format)
+# os.remove("video.mp4")
+# return audio_file
+# @tool("transcribe_youtube_video", args_schema=YouTubeVideoInput, return_direct=True)
+# def transcribe_youtube_video(video_url: str) -> str:
+# """Transcribes a YouTube video."""
+# audio_file = download_youtube_video(video_url)
+# device = "cuda"
+# batch_size = 16
+# compute_type = "float16"
+# # 1. Transcribe with original Whisper (batched)
+# model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+# audio = whisperx.load_audio(audio_file)
+# result = model.transcribe(audio, batch_size=batch_size)
+# # 2. Align Whisper output
+# model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+# result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+# # 3. Assign speaker labels
+# diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+# diarize_segments = diarize_model(audio_file)
+# try:
+# segments = result["segments"]
+# transcription = " ".join(segment['text'] for segment in segments)
+# return transcription
+# except KeyError:
+# print("The key 'segments' is not found in the result.")
+# ################################################### BASE WHISPER TOOL
+# from typing import Optional, Type
+# from pydantic import BaseModel, Field
+# from langchain.tools import BaseTool
+# from langchain.callbacks.manager import (
+# AsyncCallbackManagerForToolRun,
+# CallbackManagerForToolRun,
+# )
+# import requests
+# import whisperx
+# class AudioInput(BaseModel):
+# audio_file: str = Field(description="Path to audio file")
+# class TranscribeAudioTool(BaseTool):
+# name = "transcribe_audio"
+# description = "Transcribes an audio file using WhisperX"
+# args_schema: Type[AudioInput] = AudioInput
+# def _run(
+# self,
+# audio_file: str,
+# device: str = "cuda",
+# batch_size: int = 16,
+# compute_type: str = "float16",
+# run_manager: Optional[CallbackManagerForToolRun] = None,
+# ) -> str:
+# """Use the tool."""
+# model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+# audio = whisperx.load_audio(audio_file)
+# result = model.transcribe(audio, batch_size=batch_size)
+# model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+# result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+# diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+# diarize_segments = diarize_model(audio_file)
+# try:
+# segments = result["segments"]
+# transcription = " ".join(segment['text'] for segment in segments)
+# return transcription
+# except KeyError:
+# print("The key 'segments' is not found in the result.")
+# async def _arun(
+# self,
+# audio_file: str,
+# device: str = "cuda",
+# batch_size: int = 16,
+# compute_type: str = "float16",
+# run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
+# ) -> str:
+# """Use the tool asynchronously."""
+# raise NotImplementedError("transcribe_audio does not support async")
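
The hunk above comments out the WhisperX speech-to-text tools. For reference, the pipeline they wrapped can still be run on its own; the sketch below mirrors the commented-out code (pytube for the download, pydub for the audio conversion, whisperx for transcription and alignment) and is an illustrative example rather than repository code. It assumes whisperx, pytube and pydub are installed and a CUDA GPU is available, it omits the diarization step that requires HF_API_KEY, and the URL is a placeholder.

# Standalone sketch of the pipeline wrapped by the commented-out tools above.
# Assumes whisperx, pytube and pydub are installed and a CUDA GPU is present;
# the diarization step (which needs HF_API_KEY) is omitted here.
import os

import whisperx
from pydub import AudioSegment
from pytube import YouTube


def download_youtube_audio(video_url: str, audio_format: str = "mp3") -> str:
    """Download a YouTube video's audio track and convert it with pydub."""
    audio_file = f"video.{audio_format}"
    yt_stream = YouTube(video_url).streams.filter(only_audio=True).first()
    yt_stream.download(filename="video.mp4")
    AudioSegment.from_file("video.mp4", format="mp4").export(audio_file, format=audio_format)
    os.remove("video.mp4")
    return audio_file


def transcribe(audio_file: str, device: str = "cuda", batch_size: int = 16) -> str:
    """Transcribe with WhisperX, align the segments, and join the text."""
    model = whisperx.load_model("large-v2", device, compute_type="float16")
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device,
                            return_char_alignments=False)
    return " ".join(segment["text"] for segment in result["segments"])


if __name__ == "__main__":
    # Placeholder URL; substitute a real video before running.
    audio_path = download_youtube_audio("https://www.youtube.com/watch?v=<video-id>")
    print(transcribe(audio_path))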
