
@@ -824,6 +824,115 @@ out = llm.run(task=task, img=img)
print(out)
```
### `GPT4Vision`
```python
from swarms import GPT4VisionAPI
# Initialize with default API key and custom max_tokens
api = GPT4VisionAPI(max_tokens=1000)
# Define the task and image URL
task = "Describe the scene in the image."
img = "https://i.imgur.com/4P4ZRxU.jpeg"
# Run the GPT-4 Vision model
response = api.run(task, img)
# Print the model's response
print(response)
```
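The example assumes an OpenAI key is already configured. A minimal setup sketch, assuming `GPT4VisionAPI` falls back to the `OPENAI_API_KEY` environment variable when no key is passed explicitly:
```python
import os

# Assumption: GPT4VisionAPI reads OPENAI_API_KEY when no key is passed.
# Replace the placeholder with your own key before running.
os.environ["OPENAI_API_KEY"] = "sk-..."
```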
### `QwenVLMultiModal`
A radically simple interface for QwenVLMultiModal that comes complete with quantization; to turn it on, just set `quantize=True`!
```python
from swarms import QwenVLMultiModal
# Instantiate the QwenVLMultiModal model
model = QwenVLMultiModal(
    model_name="Qwen/Qwen-VL-Chat",
    device="cuda",
    quantize=True,
)
# Run the model
response = model(
    "Hello, how are you?", "https://example.com/image.jpg"
)
# Print the response
print(response)
```
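Quantization loads the weights in a reduced-precision format, trading a little output fidelity for a much smaller memory footprint. To run at full precision instead, the sketch below simply omits the flag; it assumes `quantize` defaults to `False` when left off:
```python
from swarms import QwenVLMultiModal

# Full-precision load; assumes quantize defaults to False when omitted
model = QwenVLMultiModal(
    model_name="Qwen/Qwen-VL-Chat",
    device="cuda",
)

# Same (task, image) calling convention as the quantized example above
response = model(
    "What objects are in this image?", "https://example.com/image.jpg"
)
print(response)
```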
### `Kosmos`
- Multi-modal model from Microsoft!
```python
from swarms import Kosmos
# Initialize the model
model = Kosmos()
# Generate a response
out = model.run("Analyze the receipts in this image", "docs.jpg")
# Print the output
print(out)
```
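Because `run(task, img)` is a uniform interface, one Kosmos instance can be reused across several prompts without reloading the weights. A small sketch; the image paths here are placeholders:
```python
from swarms import Kosmos

# Load the model once and reuse it for multiple tasks
model = Kosmos()

# Placeholder image paths for illustration
tasks = [
    ("Describe this image in detail", "scene.jpg"),
    ("What text appears in this image?", "invoice.jpg"),
]

for task, img in tasks:
    print(model.run(task, img))
```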
### `Idefics`
- Multi-modal model from the Hugging Face team!
```python
# Import the idefics model from the swarms.models module
from swarms.models import Idefics
# Create an instance of the idefics model
model = Idefics()
# Define user input with an image URL and chat with the model
user_input = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
)
response = model.chat(user_input)
print(response)
# Define another user input with an image URL and chat with the model
user_input = (
    "User: And who is that?"
    " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
)
response = model.chat(user_input)
print(response)
# Set the checkpoint of the model to "new_checkpoint"
model.set_checkpoint("new_checkpoint")
# Set the device of the model to "cpu"
model.set_device("cpu")
# Set the maximum length of the chat to 200
model.set_max_length(200)
# Clear the chat history of the model
model.clear_chat_history()
```
## Radically Simple AI Model APIs
We provide a vast array of language and multi-modal model APIs for you to generate text, images, music, speech, and even videos. Get started below:
-----
### `Anthropic`
```python
@@ -900,23 +1009,6 @@ print(image_url)
```
-### `GPT4Vision`
-```python
-from swarms import GPT4VisionAPI
-# Initialize with default API key and custom max_tokens
-api = GPT4VisionAPI(max_tokens=1000)
-# Define the task and image URL
-task = "Describe the scene in the image."
-img = "https://i.imgur.com/4P4ZRxU.jpeg"
-# Run the GPT-4 Vision model
-response = api.run(task, img)
-# Print the model's response
-print(response)
-```
### Text to Video with `ZeroscopeTTV`
@@ -938,7 +1030,7 @@ print(video_path)
```
-### ModelScope
+<!-- ### ModelScope
```python
from swarms.models import ModelScopeAutoModel
@@ -960,32 +1052,9 @@ cog_agent = CogAgent()
# Run the model on the tests
cog_agent.run("Describe this scene", "images/1.jpg")
-```
+``` -->
-### `QwenVLMultiModal`
-A radically simple interface for QwenVLMultiModal comes complete with Quantization to turn it on just set quantize to true!
-```python
-from swarms import QwenVLMultiModal
-# Instantiate the QwenVLMultiModal model
-model = QwenVLMultiModal(
-    model_name="Qwen/Qwen-VL-Chat",
-    device="cuda",
-    quantize=True,
-)
-# Run the model
-response = model(
-    "Hello, how are you?", "https://example.com/image.jpg"
-)
-# Print the response
-print(response)
-```
----

@@ -1,7 +1,10 @@
-from swarms.models import idefics
+# Import the idefics model from the swarms.models module
+from swarms.models import Idefics
-model = idefics()
+# Create an instance of the idefics model
+model = Idefics()
# Define user input with an image URL and chat with the model
user_input = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
@@ -9,6 +12,7 @@ user_input = (
response = model.chat(user_input)
print(response)
# Define another user input with an image URL and chat with the model
user_input = (
    "User: And who is that?"
    " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
@@ -16,7 +20,14 @@ user_input = (
response = model.chat(user_input)
print(response)
# Set the checkpoint of the model to "new_checkpoint"
model.set_checkpoint("new_checkpoint")
# Set the device of the model to "cpu"
model.set_device("cpu")
# Set the maximum length of the chat to 200
model.set_max_length(200)
# Clear the chat history of the model
model.clear_chat_history()

@@ -0,0 +1,10 @@
from swarms import Kosmos
# Initialize the model
model = Kosmos()
# Generate a response
out = model.run("Analyze the receipts in this image", "docs.jpg")
# Print the output
print(out)

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "swarms"
-version = "3.8.1"
+version = "3.8.2"
description = "Swarms - Pytorch"
license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"]

@@ -48,6 +48,8 @@ from swarms.models.vip_llava import VipLlavaMultiModal # noqa: E402
from swarms.models.llava import LavaMultiModal # noqa: E402
from swarms.models.qwen import QwenVLMultiModal # noqa: E402
from swarms.models.clipq import CLIPQ # noqa: E402
from swarms.models.kosmos_two import Kosmos # noqa: E402
from swarms.models.fuyu import Fuyu # noqa: E402
# from swarms.models.dalle3 import Dalle3
# from swarms.models.distilled_whisperx import DistilWhisperModel # noqa: E402
@@ -79,7 +81,6 @@ __all__ = [
"Zephyr", "Zephyr",
"BaseMultiModalModel", "BaseMultiModalModel",
"Idefics", "Idefics",
# "Kosmos",
"Vilt", "Vilt",
"Nougat", "Nougat",
"LayoutLMDocumentQA", "LayoutLMDocumentQA",
@@ -102,9 +103,6 @@ __all__ = [
"AudioModality", "AudioModality",
"VideoModality", "VideoModality",
"MultimodalData", "MultimodalData",
# "CogAgent",
# "ModelScopePipeline",
# "ModelScopeAutoModel",
"TogetherLLM", "TogetherLLM",
"TimmModel", "TimmModel",
"UltralyticsModel", "UltralyticsModel",
@@ -112,4 +110,6 @@ __all__ = [
"LavaMultiModal", "LavaMultiModal",
"QwenVLMultiModal", "QwenVLMultiModal",
"CLIPQ", "CLIPQ",
"Kosmos",
"Fuyu",
] ]
