From d454733dff57d76a75b148c3d2e1493cb2fff9c4 Mon Sep 17 00:00:00 2001 From: Kye Date: Mon, 25 Dec 2023 21:27:47 -0500 Subject: [PATCH] [FEAT][ZeroscopeTTV] [BaseTextToVideo] --- README.md | 60 +++++++++++++++- docs/swarms/models/zeroscope.md | 105 +++++++++++++++++++++++++++ mkdocs.yml | 1 + pyproject.toml | 2 +- swarms/models/__init__.py | 6 +- swarms/models/base_ttv.py | 115 ++++++++++++++++++++++++++++++ swarms/models/dalle3.py | 2 - swarms/models/zeroscope.py | 103 +++++++++++++++++++++++++++ tests/models/test_zeroscope.py | 122 ++++++++++++++++++++++++++++++++ 9 files changed, 510 insertions(+), 6 deletions(-) create mode 100644 docs/swarms/models/zeroscope.md create mode 100644 swarms/models/base_ttv.py create mode 100644 swarms/models/zeroscope.py create mode 100644 tests/models/test_zeroscope.py diff --git a/README.md b/README.md index 4bf0fc06..958be9fc 100644 --- a/README.md +++ b/README.md @@ -313,7 +313,7 @@ efficiency_analysis = efficiency_agent.run( ) ``` -### Gemini +### `Gemini` - Deploy Gemini from Google with utmost reliability with our visual chain of thought prompt that enables more reliable responses ```python import os @@ -386,7 +386,7 @@ generated_text = inference(prompt_text) print(generated_text) ``` -### Mixtral +### `Mixtral` - Utilize Mixtral in a very simple API, - Utilize 4bit quantization for a increased speed and less memory usage - Use Flash Attention 2.0 for increased speed and less memory usage @@ -403,6 +403,62 @@ generated_text = mixtral.run("Generate a creative story.") print(generated_text) ``` + +### `Dalle3` +```python +from swarms import Dalle3 + +# Create an instance of the Dalle3 class with high quality +dalle3 = Dalle3(quality="high") + +# Define a text prompt +task = "A high-quality image of a sunset" + +# Generate a high-quality image from the text prompt +image_url = dalle3(task) + +# Print the generated image URL +print(image_url) +``` + + +### `GPT4Vision` +```python +from swarms.models import GPT4VisionAPI + +# Initialize with default API key and custom max_tokens +api = GPT4VisionAPI(max_tokens=1000) + +# Define the task and image URL +task = "Describe the scene in the image." +img = "https://i.imgur.com/4P4ZRxU.jpeg" + +# Run the GPT-4 Vision model +response = api.run(task, img) + +# Print the model's response +print(response) +``` + + +### Text to Video with `ZeroscopeTTV` + +```python +# Import the model +from swarms import ZeroscopeTTV + +# Initialize the model +zeroscope = ZeroscopeTTV() + +# Specify the task +task = "A person is walking on the street." + +# Generate the video! +video_path = zeroscope(task) +print(video_path) + +``` + --- # Features 🤖 diff --git a/docs/swarms/models/zeroscope.md b/docs/swarms/models/zeroscope.md new file mode 100644 index 00000000..4e634a6a --- /dev/null +++ b/docs/swarms/models/zeroscope.md @@ -0,0 +1,105 @@ +# Module Name: ZeroscopeTTV + +## Introduction +The ZeroscopeTTV module is a versatile zero-shot video generation model designed to create videos based on textual descriptions. This comprehensive documentation will provide you with an in-depth understanding of the ZeroscopeTTV module, its architecture, purpose, arguments, and detailed usage examples. + +## Purpose +The ZeroscopeTTV module serves as a powerful tool for generating videos from text descriptions. Whether you need to create video content for various applications, visualize textual data, or explore the capabilities of ZeroscopeTTV, this module offers a flexible and efficient solution. 
With its easy-to-use interface, you can quickly generate videos based on your textual input. + +## Architecture +The ZeroscopeTTV module is built on top of the Diffusers library, leveraging the power of diffusion models for video generation. It allows you to specify various parameters such as model name, data type, chunk size, dimensions, and more to customize the video generation process. The model performs multiple inference steps and utilizes a diffusion pipeline to generate high-quality videos. + +## Class Definition +### `ZeroscopeTTV(model_name: str = "cerspense/zeroscope_v2_576w", torch_dtype=torch.float16, chunk_size: int = 1, dim: int = 1, num_inference_steps: int = 40, height: int = 320, width: int = 576, num_frames: int = 36)` + +#### Parameters +- `model_name` (str, optional): The name of the pre-trained model to use. Default is "cerspense/zeroscope_v2_576w". +- `torch_dtype` (torch.dtype, optional): The torch data type to use for computations. Default is torch.float16. +- `chunk_size` (int, optional): The size of chunks for forward chunking. Default is 1. +- `dim` (int, optional): The dimension along which the input is split for forward chunking. Default is 1. +- `num_inference_steps` (int, optional): The number of inference steps to perform. Default is 40. +- `height` (int, optional): The height of the video frames. Default is 320. +- `width` (int, optional): The width of the video frames. Default is 576. +- `num_frames` (int, optional): The number of frames in the video. Default is 36. + +## Functionality and Usage +The ZeroscopeTTV module offers a straightforward interface for video generation. It accepts a textual task or description as input and returns the path to the generated video. + +### `run(task: str = None, *args, **kwargs) -> str` + +#### Parameters +- `task` (str, optional): The input task or description for video generation. + +#### Returns +- `str`: The path to the generated video. + +## Usage Examples +### Example 1: Basic Usage + +```python +from swarms.models import ZeroscopeTTV + +# Initialize the ZeroscopeTTV model +zeroscope = ZeroscopeTTV() + +# Generate a video based on a textual description +task = "A bird flying in the sky." +video_path = zeroscope.run(task) +print(f"Generated video path: {video_path}") +``` + +### Example 2: Custom Model and Parameters + +You can specify a custom pre-trained model and adjust various parameters for video generation. + +```python +custom_model_name = "your_custom_model_path" +custom_dtype = torch.float32 +custom_chunk_size = 2 +custom_dim = 2 +custom_num_inference_steps = 50 +custom_height = 480 +custom_width = 720 +custom_num_frames = 48 + +custom_zeroscope = ZeroscopeTTV( + model_name=custom_model_name, + torch_dtype=custom_dtype, + chunk_size=custom_chunk_size, + dim=custom_dim, + num_inference_steps=custom_num_inference_steps, + height=custom_height, + width=custom_width, + num_frames=custom_num_frames, +) + +task = "A car driving on the road." +video_path = custom_zeroscope.run(task) +print(f"Generated video path: {video_path}") +``` + +### Example 3: Exporting Video Frames + +You can also export individual video frames if needed. 
+ +```python +from swarms.models import export_to_video + +# Generate video frames +video_frames = zeroscope.run("A boat sailing on the water.") + +# Export video frames to a video file +video_path = export_to_video(video_frames) +print(f"Generated video path: {video_path}") +``` + +## Additional Information and Tips +- Ensure that the input textual task or description is clear and descriptive to achieve the desired video output. +- Experiment with different parameter settings to control video resolution, frame count, and inference steps. +- Use the `export_to_video` function to export individual video frames as needed. +- Monitor the progress and output paths to access the generated videos. + +## Conclusion +The ZeroscopeTTV module is a powerful solution for zero-shot video generation based on textual descriptions. Whether you are creating videos for storytelling, data visualization, or other applications, ZeroscopeTTV offers a versatile and efficient way to bring your text to life. With a flexible interface and customizable parameters, it empowers you to generate high-quality videos with ease. + +If you encounter any issues or have questions about using ZeroscopeTTV, please refer to the Diffusers library documentation or reach out to their support team for further assistance. Enjoy creating videos with ZeroscopeTTV! \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index de263ac6..0bca64c6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -99,6 +99,7 @@ nav: - ElevenLabsText2SpeechTool: "swarms/models/elevenlabs.md" - OpenAITTS: "swarms/models/openai_tts.md" - Gemini: "swarms/models/gemini.md" + - ZeroscopeTTV: "swarms/models/zeroscope.md" - swarms.structs: - Overview: "swarms/structs/overview.md" - AutoScaler: "swarms/swarms/autoscaler.md" diff --git a/pyproject.toml b/pyproject.toml index cf214581..d9c6a9dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "swarms" -version = "2.4.2" +version = "2.6.0" description = "Swarms - Pytorch" license = "MIT" authors = ["Kye Gomez "] diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py index b66eb1d3..cc02b4f4 100644 --- a/swarms/models/__init__.py +++ b/swarms/models/__init__.py @@ -17,8 +17,9 @@ from swarms.models.wizard_storytelling import ( WizardLLMStoryTeller, ) # noqa: E402 from swarms.models.mpt import MPT7B # noqa: E402 +from swarms.models.mixtral import Mixtral # noqa: E402 -# MultiModal Models +################# MultiModal Models from swarms.models.base_multimodal_model import ( BaseMultiModalModel, ) # noqa: E402 @@ -32,6 +33,7 @@ from swarms.models.gpt4_vision_api import GPT4VisionAPI # noqa: E402 from swarms.models.openai_tts import OpenAITTS # noqa: E402 from swarms.models.gemini import Gemini # noqa: E402 from swarms.models.gigabind import Gigabind # noqa: E402 +from swarms.models.zeroscope import ZeroscopeTTV # noqa: E402 # from swarms.models.gpt4v import GPT4Vision # from swarms.models.dalle3 import Dalle3 @@ -66,4 +68,6 @@ __all__ = [ "OpenAITTS", "Gemini", "Gigabind", + "Mixtral", + "ZeroscopeTTV", ] diff --git a/swarms/models/base_ttv.py b/swarms/models/base_ttv.py new file mode 100644 index 00000000..cc301107 --- /dev/null +++ b/swarms/models/base_ttv.py @@ -0,0 +1,115 @@ +from abc import ABC, abstractmethod +from swarms.models.base_llm import AbstractLLM +from diffusers.utils import export_to_video +from typing import Optional, List +import asyncio +from concurrent.futures import ThreadPoolExecutor + + +class 
BaseTextToVideo(AbstractLLM): + """BaseTextToVideo class represents prebuilt text-to-video models.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @abstractmethod + def run(self, *args, **kwargs): + pass + + def __call__( + self, + task: Optional[str] = None, + img: Optional[str] = None, + *args, + **kwargs, + ): + """ + Performs forward pass on the input task and returns the path of the generated video. + + Args: + task (str): The task to perform. + + Returns: + str: The path of the generated video. + """ + return self.run(task, img, *args, **kwargs) + + def save_video_path( + self, video_path: Optional[str] = None, *args, **kwargs + ): + """Saves the generated video to the specified path. + + Args: + video_path (Optional[str], optional): _description_. Defaults to None. + + Returns: + str: The path of the generated video. + """ + return export_to_video(video_path, *args, **kwargs) + + def run_batched( + self, + tasks: List[str] = None, + imgs: List[str] = None, + *args, + **kwargs, + ): + # TODO: Implement batched inference + tasks = tasks or [] + imgs = imgs or [] + if len(tasks) != len(imgs): + raise ValueError( + "The number of tasks and images should be the same." + ) + return [ + self.run(task, img, *args, **kwargs) + for task, img in zip(tasks, imgs) + ] + + def run_concurrent_batched( + self, + tasks: List[str] = None, + imgs: List[str] = None, + *args, + **kwargs, + ): + tasks = tasks or [] + imgs = imgs or [] + if len(tasks) != len(imgs): + raise ValueError( + "The number of tasks and images should be the same." + ) + with ThreadPoolExecutor(max_workers=4) as executor: + loop = asyncio.get_event_loop() + tasks = [ + loop.run_in_executor( + executor, self.run, task, img, *args, **kwargs + ) + for task, img in zip(tasks, imgs) + ] + return loop.run_until_complete(asyncio.gather(*tasks)) + + # Run the model in async mode + def arun( + self, + task: Optional[str] = None, + img: Optional[str] = None, + *args, + **kwargs, + ): + loop = asyncio.get_event_loop() + return loop.run_until_complete( + self.run(task, img, *args, **kwargs) + ) + + def arun_batched( + self, + tasks: List[str] = None, + imgs: List[str] = None, + *args, + **kwargs, + ): + loop = asyncio.get_event_loop() + return loop.run_until_complete( + self.run_batched(tasks, imgs, *args, **kwargs) + ) diff --git a/swarms/models/dalle3.py b/swarms/models/dalle3.py index 40f63418..6b225b49 100644 --- a/swarms/models/dalle3.py +++ b/swarms/models/dalle3.py @@ -18,8 +18,6 @@ from termcolor import colored load_dotenv() -# api_key = os.getenv("OPENAI_API_KEY") - # Configure Logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/swarms/models/zeroscope.py b/swarms/models/zeroscope.py new file mode 100644 index 00000000..8ddc28ad --- /dev/null +++ b/swarms/models/zeroscope.py @@ -0,0 +1,103 @@ +import torch +from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler +from diffusers.utils import export_to_video + + +class ZeroscopeTTV: + """ + ZeroscopeTTV class represents a zero-shot video generation model. + + Args: + model_name (str): The name of the pre-trained model to use. + torch_dtype (torch.dtype): The torch data type to use for computations. + chunk_size (int): The size of chunks for forward chunking. + dim (int): The dimension along which to split the input for forward chunking. + num_inference_steps (int): The number of inference steps to perform. + height (int): The height of the video frames. 
+        width (int): The width of the video frames.
+        num_frames (int): The number of frames in the video.
+
+    Attributes:
+        model_name (str): The name of the pre-trained model.
+        torch_dtype (torch.dtype): The torch data type used for computations.
+        chunk_size (int): The size of chunks for forward chunking.
+        dim (int): The dimension along which the input is split for forward chunking.
+        num_inference_steps (int): The number of inference steps to perform.
+        height (int): The height of the video frames.
+        width (int): The width of the video frames.
+        num_frames (int): The number of frames in the video.
+        pipe (DiffusionPipeline): The diffusion pipeline used for video generation.
+
+    Methods:
+        run(task: str = None, *args, **kwargs) -> str:
+            Generates a video for the input task and returns the path of the generated video.
+
+    Examples:
+        >>> from swarms.models import ZeroscopeTTV
+        >>> zeroscope = ZeroscopeTTV()
+        >>> task = "A person is walking on the street."
+        >>> video_path = zeroscope(task)
+
+    """
+
+    def __init__(
+        self,
+        model_name: str = "cerspense/zeroscope_v2_576w",
+        torch_dtype=torch.float16,
+        chunk_size: int = 1,
+        dim: int = 1,
+        num_inference_steps: int = 40,
+        height: int = 320,
+        width: int = 576,
+        num_frames: int = 36,
+        *args,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        self.torch_dtype = torch_dtype
+        self.chunk_size = chunk_size
+        self.dim = dim
+        self.num_inference_steps = num_inference_steps
+        self.height = height
+        self.width = width
+        self.num_frames = num_frames
+
+        self.pipe = DiffusionPipeline.from_pretrained(
+            model_name,
+            *args,
+            torch_dtype=torch_dtype,
+        )
+        self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(
+            self.pipe.scheduler.config
+        )
+        self.pipe.enable_model_cpu_offload()
+        self.pipe.enable_vae_slicing()
+        self.pipe.unet.enable_forward_chunking(
+            chunk_size=chunk_size, dim=dim
+        )
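+
+    # NOTE: The README and the Examples section above call the model as
+    # `zeroscope(task)`, and the tests exercise `save_video_path`, so a
+    # minimal sketch of both helpers is provided here. This assumes the
+    # class stays standalone; if ZeroscopeTTV is later made to inherit
+    # from BaseTextToVideo, these duplicates can be dropped in favor of
+    # the base-class implementations.
+    def __call__(self, task: str = None, *args, **kwargs):
+        """Makes the model callable and delegates to run()."""
+        return self.run(task, *args, **kwargs)
+
+    def save_video_path(
+        self, video_path: str = None, *args, **kwargs
+    ):
+        """Exports frames to a video file via diffusers' export_to_video."""
+        return export_to_video(video_path, *args, **kwargs)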
+ """ + try: + video_frames = self.pipe( + task, + num_inference_steps=self.num_inference_steps, + height=self.height, + width=self.width, + num_frames=self.num_frames, + *args, + **kwargs + ).frames + video_path = export_to_video(video_frames) + return video_path + except Exception as error: + print(f"Error in [ZeroscopeTTV.forward]: {error}") + raise error diff --git a/tests/models/test_zeroscope.py b/tests/models/test_zeroscope.py new file mode 100644 index 00000000..25a4c597 --- /dev/null +++ b/tests/models/test_zeroscope.py @@ -0,0 +1,122 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from swarms.models.zeroscope import ZeroscopeTTV + + +@patch("swarms.models.zeroscope.DiffusionPipeline") +@patch("swarms.models.zeroscope.DPMSolverMultistepScheduler") +def test_zeroscope_ttv_init(mock_scheduler, mock_pipeline): + zeroscope = ZeroscopeTTV() + mock_pipeline.from_pretrained.assert_called_once() + mock_scheduler.assert_called_once() + assert zeroscope.model_name == "cerspense/zeroscope_v2_576w" + assert zeroscope.chunk_size == 1 + assert zeroscope.dim == 1 + assert zeroscope.num_inference_steps == 40 + assert zeroscope.height == 320 + assert zeroscope.width == 576 + assert zeroscope.num_frames == 36 + + +@patch("swarms.models.zeroscope.DiffusionPipeline") +@patch("swarms.models.zeroscope.DPMSolverMultistepScheduler") +def test_zeroscope_ttv_forward(mock_scheduler, mock_pipeline): + zeroscope = ZeroscopeTTV() + mock_pipeline_instance = MagicMock() + mock_pipeline.from_pretrained.return_value = ( + mock_pipeline_instance + ) + mock_pipeline_instance.return_value = MagicMock( + frames="Generated frames" + ) + mock_pipeline_instance.enable_vae_slicing.assert_called_once() + mock_pipeline_instance.enable_forward_chunking.assert_called_once_with( + chunk_size=1, dim=1 + ) + result = zeroscope.forward("Test task") + assert result == "Generated frames" + mock_pipeline_instance.assert_called_once_with( + "Test task", + num_inference_steps=40, + height=320, + width=576, + num_frames=36, + ) + + +@patch("swarms.models.zeroscope.DiffusionPipeline") +@patch("swarms.models.zeroscope.DPMSolverMultistepScheduler") +def test_zeroscope_ttv_forward_error(mock_scheduler, mock_pipeline): + zeroscope = ZeroscopeTTV() + mock_pipeline_instance = MagicMock() + mock_pipeline.from_pretrained.return_value = ( + mock_pipeline_instance + ) + mock_pipeline_instance.return_value = MagicMock( + frames="Generated frames" + ) + mock_pipeline_instance.side_effect = Exception("Test error") + with pytest.raises(Exception, match="Test error"): + zeroscope.forward("Test task") + + +@patch("swarms.models.zeroscope.DiffusionPipeline") +@patch("swarms.models.zeroscope.DPMSolverMultistepScheduler") +def test_zeroscope_ttv_call(mock_scheduler, mock_pipeline): + zeroscope = ZeroscopeTTV() + mock_pipeline_instance = MagicMock() + mock_pipeline.from_pretrained.return_value = ( + mock_pipeline_instance + ) + mock_pipeline_instance.return_value = MagicMock( + frames="Generated frames" + ) + result = zeroscope.__call__("Test task") + assert result == "Generated frames" + mock_pipeline_instance.assert_called_once_with( + "Test task", + num_inference_steps=40, + height=320, + width=576, + num_frames=36, + ) + + +@patch("swarms.models.zeroscope.DiffusionPipeline") +@patch("swarms.models.zeroscope.DPMSolverMultistepScheduler") +def test_zeroscope_ttv_call_error(mock_scheduler, mock_pipeline): + zeroscope = ZeroscopeTTV() + mock_pipeline_instance = MagicMock() + mock_pipeline.from_pretrained.return_value = ( + 
mock_pipeline_instance + ) + mock_pipeline_instance.return_value = MagicMock( + frames="Generated frames" + ) + mock_pipeline_instance.side_effect = Exception("Test error") + with pytest.raises(Exception, match="Test error"): + zeroscope.__call__("Test task") + + +@patch("swarms.models.zeroscope.DiffusionPipeline") +@patch("swarms.models.zeroscope.DPMSolverMultistepScheduler") +def test_zeroscope_ttv_save_video_path(mock_scheduler, mock_pipeline): + zeroscope = ZeroscopeTTV() + mock_pipeline_instance = MagicMock() + mock_pipeline.from_pretrained.return_value = ( + mock_pipeline_instance + ) + mock_pipeline_instance.return_value = MagicMock( + frames="Generated frames" + ) + result = zeroscope.save_video_path("Test video path") + assert result == "Test video path" + mock_pipeline_instance.assert_called_once_with( + "Test video path", + num_inference_steps=40, + height=320, + width=576, + num_frames=36, + )
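+
+
+# A possible additional test (sketch): checks that custom constructor
+# arguments are forwarded to the pipeline call. The names and mocking
+# pattern mirror the tests above; adjust as needed.
+@patch(
+    "swarms.models.zeroscope.export_to_video",
+    side_effect=lambda frames: frames,
+)
+@patch("swarms.models.zeroscope.DiffusionPipeline")
+@patch("swarms.models.zeroscope.DPMSolverMultistepScheduler")
+def test_zeroscope_ttv_custom_params(
+    mock_scheduler, mock_pipeline, mock_export
+):
+    mock_pipeline_instance = MagicMock()
+    mock_pipeline.from_pretrained.return_value = (
+        mock_pipeline_instance
+    )
+    mock_pipeline_instance.return_value = MagicMock(
+        frames="Generated frames"
+    )
+    zeroscope = ZeroscopeTTV(
+        num_inference_steps=50, height=480, width=720, num_frames=24
+    )
+    result = zeroscope.run("Test task")
+    assert result == "Generated frames"
+    mock_pipeline_instance.assert_called_once_with(
+        "Test task",
+        num_inference_steps=50,
+        height=480,
+        width=720,
+        num_frames=24,
+    )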