diff --git a/README.md b/README.md
index 64db98ae..aa8822e8 100644
--- a/README.md
+++ b/README.md
@@ -824,6 +824,115 @@ out = llm.run(task=task, img=img)
 print(out)
 ```
 
+### `GPT4Vision`
+```python
+from swarms import GPT4VisionAPI
+
+# Initialize with default API key and custom max_tokens
+api = GPT4VisionAPI(max_tokens=1000)
+
+# Define the task and image URL
+task = "Describe the scene in the image."
+img = "https://i.imgur.com/4P4ZRxU.jpeg"
+
+# Run the GPT-4 Vision model
+response = api.run(task, img)
+
+# Print the model's response
+print(response)
+```
+
+### `QwenVLMultiModal`
+A radically simple interface for QwenVLMultiModal that comes complete with quantization. To turn it on, just set `quantize=True`!
+
+```python
+from swarms import QwenVLMultiModal
+
+# Instantiate the QwenVLMultiModal model
+model = QwenVLMultiModal(
+    model_name="Qwen/Qwen-VL-Chat",
+    device="cuda",
+    quantize=True,
+)
+
+# Run the model
+response = model(
+    "Hello, how are you?", "https://example.com/image.jpg"
+)
+
+# Print the response
+print(response)
+
+
+```
+
+
+### `Kosmos`
+- Multi-modal model from Microsoft!
+
+```python
+from swarms import Kosmos
+
+# Initialize the model
+model = Kosmos()
+
+# Generate a response for the given task and image
+out = model.run("Analyze the receipts in this image", "docs.jpg")
+
+# Print the output
+print(out)
+
+```
+
+
+### `Idefics`
+- Multi-modal model from the Hugging Face team!
+
+```python
+# Import the Idefics model from the swarms.models module
+from swarms.models import Idefics
+
+# Create an instance of the Idefics model
+model = Idefics()
+
+# Define user input with an image URL and chat with the model
+user_input = (
+    "User: What is in this image?"
+    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
+)
+response = model.chat(user_input)
+print(response)
+
+# Define another user input with an image URL and chat with the model
+user_input = (
+    "User: And who is that?"
+    " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
+)
+response = model.chat(user_input)
+print(response)
+
+# Set the checkpoint of the model to "new_checkpoint"
+model.set_checkpoint("new_checkpoint")
+
+# Set the device of the model to "cpu"
+model.set_device("cpu")
+
+# Set the maximum length of the chat to 200
+model.set_max_length(200)
+
+# Clear the chat history of the model
+model.clear_chat_history()
+
+
+```
+
+## Radically Simple AI Model APIs
+We provide a vast array of language and multi-modal model APIs for you to generate text, images, music, speech, and even videos. Get started below:
+
+
+
+-----
+
 ### `Anthropic`
 
 ```python
@@ -900,23 +1009,6 @@ print(image_url)
 ```
 
-### `GPT4Vision`
-```python
-from swarms import GPT4VisionAPI
-
-# Initialize with default API key and custom max_tokens
-api = GPT4VisionAPI(max_tokens=1000)
-
-# Define the task and image URL
-task = "Describe the scene in the image."
-img = "https://i.imgur.com/4P4ZRxU.jpeg"
-
-# Run the GPT-4 Vision model
-response = api.run(task, img)
-
-# Print the model's response
-print(response)
-```
-
 
 ### Text to Video with `ZeroscopeTTV`
 
@@ -938,7 +1030,7 @@ print(video_path)
 ```
 
-### ModelScope
+
 
-### `QwenVLMultiModal`
-A radically simple interface for QwenVLMultiModal comes complete with Quantization to turn it on just set quantize to true!
-
-```python
-from swarms import QwenVLMultiModal
-
-# Instantiate the QwenVLMultiModal model
-model = QwenVLMultiModal(
-    model_name="Qwen/Qwen-VL-Chat",
-    device="cuda",
-    quantize=True,
-)
-
-# Run the model
-response = model(
-    "Hello, how are you?", "https://example.com/image.jpg"
-)
-
-# Print the response
-print(response)
-
-
-```
---
diff --git a/playground/models/idefics.py b/playground/models/idefics.py
index 39d6f4eb..ea36ba77 100644
--- a/playground/models/idefics.py
+++ b/playground/models/idefics.py
@@ -1,7 +1,10 @@
-from swarms.models import idefics
+# Import the Idefics model from the swarms.models module
+from swarms.models import Idefics
 
-model = idefics()
+# Create an instance of the Idefics model
+model = Idefics()
 
+# Define user input with an image URL and chat with the model
 user_input = (
     "User: What is in this image?"
     " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
@@ -9,6 +12,7 @@ user_input = (
 response = model.chat(user_input)
 print(response)
 
+# Define another user input with an image URL and chat with the model
 user_input = (
     "User: And who is that?"
     " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
@@ -16,7 +20,14 @@ user_input = (
 response = model.chat(user_input)
 print(response)
 
+# Set the checkpoint of the model to "new_checkpoint"
 model.set_checkpoint("new_checkpoint")
+
+# Set the device of the model to "cpu"
 model.set_device("cpu")
+
+# Set the maximum length of the chat to 200
 model.set_max_length(200)
+
+# Clear the chat history of the model
 model.clear_chat_history()
diff --git a/playground/models/kosmos.py b/playground/models/kosmos.py
new file mode 100644
index 00000000..3d0f1dd2
--- /dev/null
+++ b/playground/models/kosmos.py
@@ -0,0 +1,10 @@
+from swarms import Kosmos
+
+# Initialize the model
+model = Kosmos()
+
+# Generate a response for the given task and image
+out = model.run("Analyze the receipts in this image", "docs.jpg")
+
+# Print the output
+print(out)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 7a45a177..cd5f9f74 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "swarms"
-version = "3.8.1"
+version = "3.8.2"
 description = "Swarms - Pytorch"
 license = "MIT"
 authors = ["Kye Gomez <kye@apac.ai>"]
diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py
index fcd67dc6..a8fb119a 100644
--- a/swarms/models/__init__.py
+++ b/swarms/models/__init__.py
@@ -48,6 +48,8 @@ from swarms.models.vip_llava import VipLlavaMultiModal  # noqa: E402
 from swarms.models.llava import LavaMultiModal  # noqa: E402
 from swarms.models.qwen import QwenVLMultiModal  # noqa: E402
 from swarms.models.clipq import CLIPQ  # noqa: E402
+from swarms.models.kosmos_two import Kosmos  # noqa: E402
+from swarms.models.fuyu import Fuyu  # noqa: E402
 
 # from swarms.models.dalle3 import Dalle3
 # from swarms.models.distilled_whisperx import DistilWhisperModel  # noqa: E402
@@ -79,7 +81,6 @@ __all__ = [
     "Zephyr",
     "BaseMultiModalModel",
     "Idefics",
-    # "Kosmos",
     "Vilt",
     "Nougat",
     "LayoutLMDocumentQA",
@@ -102,9 +103,6 @@ __all__ = [
     "AudioModality",
     "VideoModality",
     "MultimodalData",
-    # "CogAgent",
-    # "ModelScopePipeline",
-    # "ModelScopeAutoModel",
     "TogetherLLM",
     "TimmModel",
     "UltralyticsModel",
@@ -112,4 +110,6 @@ __all__ = [
     "LavaMultiModal",
     "QwenVLMultiModal",
     "CLIPQ",
+    "Kosmos",
+    "Fuyu",
 ]