pull/362/head^2
Kye 12 months ago
parent d5c0ca0128
commit e70b401b54

@@ -824,6 +824,115 @@ out = llm.run(task=task, img=img)
print(out)
```
### `GPT4Vision`
```python
from swarms import GPT4VisionAPI
# Initialize with default API key and custom max_tokens
api = GPT4VisionAPI(max_tokens=1000)
# Define the task and image URL
task = "Describe the scene in the image."
img = "https://i.imgur.com/4P4ZRxU.jpeg"
# Run the GPT-4 Vision model
response = api.run(task, img)
# Print the model's response
print(response)
```
### `QwenVLMultiModal`
A radically simple interface for QwenVLMultiModal that comes complete with quantization: to turn it on, just set `quantize` to `True`!
```python
from swarms import QwenVLMultiModal
# Instantiate the QwenVLMultiModal model
model = QwenVLMultiModal(
    model_name="Qwen/Qwen-VL-Chat",
    device="cuda",
    quantize=True,
)
# Run the model
response = model(
    "Hello, how are you?", "https://example.com/image.jpg"
)
# Print the response
print(response)
```
### `Kosmos`
- Multi-modal model from Microsoft!
```python
from swarms import Kosmos
# Initialize the model
model = Kosmos()
# Generate
out = model.run("Analyze the reciepts in this image", "docs.jpg")
# Print the output
print(out)
```
### `Idefics`
- Multi-modal model from the Hugging Face team!
```python
# Import the idefics model from the swarms.models module
from swarms.models import Idefics
# Create an instance of the idefics model
model = Idefics()
# Define user input with an image URL and chat with the model
user_input = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
)
response = model.chat(user_input)
print(response)
# Define another user input with an image URL and chat with the model
user_input = (
    "User: And who is that?"
    " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
)
response = model.chat(user_input)
print(response)
# Set the checkpoint of the model to "new_checkpoint"
model.set_checkpoint("new_checkpoint")
# Set the device of the model to "cpu"
model.set_device("cpu")
# Set the maximum length of the chat to 200
model.set_max_length(200)
# Clear the chat history of the model
model.clear_chat_history()
```
## Radically Simple AI Model APIs
We provide a vast array of language and multi-modal model APIs for you to generate text, images, music, speech, and even videos. Get started below:
-----
### `Anthropic`
```python
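# The body of this example is collapsed in the diff view. A minimal sketch,
# assuming the standard swarms callable interface and that the Anthropic API
# key is read from the ANTHROPIC_API_KEY environment variable:
from swarms.models import Anthropic

# Instantiate the model wrapper
model = Anthropic()

# Generate a completion for a prompt and print it
completion = model("What is the capital of France?")
print(completion)
```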
@@ -900,23 +1009,6 @@ print(image_url)
```
### `GPT4Vision`
```python
from swarms import GPT4VisionAPI
# Initialize with default API key and custom max_tokens
api = GPT4VisionAPI(max_tokens=1000)
# Define the task and image URL
task = "Describe the scene in the image."
img = "https://i.imgur.com/4P4ZRxU.jpeg"
# Run the GPT-4 Vision model
response = api.run(task, img)
# Print the model's response
print(response)
```
### Text to Video with `ZeroscopeTTV`
@@ -938,7 +1030,7 @@ print(video_path)
```
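The Zeroscope example itself is collapsed in this view; below is a minimal sketch of the pattern implied by the `print(video_path)` context above (the exact call signature is an assumption, mirroring the other model examples in this README):

```python
from swarms.models import ZeroscopeTTV

# Initialize the text-to-video model
zeroscope = ZeroscopeTTV()

# Generate a video from a text prompt; the call returns a path to the video
video_path = zeroscope("A person is walking on the street")
print(video_path)
```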
-### ModelScope
+<!-- ### ModelScope
```python
from swarms.models import ModelScopeAutoModel
@@ -960,32 +1052,9 @@ cog_agent = CogAgent()
# Run the model on the tests
cog_agent.run("Describe this scene", "images/1.jpg")
-```
+``` -->
### `QwenVLMultiModal`
A radically simple interface for QwenVLMultiModal that comes complete with quantization: to turn it on, just set `quantize` to `True`!
```python
from swarms import QwenVLMultiModal
# Instantiate the QwenVLMultiModal model
model = QwenVLMultiModal(
    model_name="Qwen/Qwen-VL-Chat",
    device="cuda",
    quantize=True,
)
# Run the model
response = model(
    "Hello, how are you?", "https://example.com/image.jpg"
)
# Print the response
print(response)
```
----

@@ -1,7 +1,10 @@
-from swarms.models import idefics
+# Import the idefics model from the swarms.models module
+from swarms.models import Idefics
-model = idefics()
+# Create an instance of the idefics model
+model = Idefics()
+# Define user input with an image URL and chat with the model
user_input = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
@@ -9,6 +12,7 @@ user_input = (
response = model.chat(user_input)
print(response)
+# Define another user input with an image URL and chat with the model
user_input = (
    "User: And who is that?"
    " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
@@ -16,7 +20,14 @@ user_input = (
response = model.chat(user_input)
print(response)
+# Set the checkpoint of the model to "new_checkpoint"
model.set_checkpoint("new_checkpoint")
+# Set the device of the model to "cpu"
model.set_device("cpu")
+# Set the maximum length of the chat to 200
+model.set_max_length(200)
+# Clear the chat history of the model
+model.clear_chat_history()

@@ -0,0 +1,10 @@
+from swarms import Kosmos
+# Initialize the model
+model = Kosmos()
+# Generate
+out = model.run("Analyze the receipts in this image", "docs.jpg")
+# Print the output
+print(out)

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "swarms"
-version = "3.8.1"
+version = "3.8.2"
description = "Swarms - Pytorch"
license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"]

@@ -48,6 +48,8 @@ from swarms.models.vip_llava import VipLlavaMultiModal # noqa: E402
from swarms.models.llava import LavaMultiModal # noqa: E402
from swarms.models.qwen import QwenVLMultiModal # noqa: E402
from swarms.models.clipq import CLIPQ # noqa: E402
+from swarms.models.kosmos_two import Kosmos # noqa: E402
+from swarms.models.fuyu import Fuyu # noqa: E402
# from swarms.models.dalle3 import Dalle3
# from swarms.models.distilled_whisperx import DistilWhisperModel # noqa: E402
@@ -79,7 +81,6 @@ __all__ = [
"Zephyr",
"BaseMultiModalModel",
"Idefics",
# "Kosmos",
"Vilt",
"Nougat",
"LayoutLMDocumentQA",
@@ -102,9 +103,6 @@ __all__ = [
"AudioModality",
"VideoModality",
"MultimodalData",
# "CogAgent",
# "ModelScopePipeline",
# "ModelScopeAutoModel",
"TogetherLLM",
"TimmModel",
"UltralyticsModel",
@@ -112,4 +110,6 @@ __all__ = [
"LavaMultiModal",
"QwenVLMultiModal",
"CLIPQ",
"Kosmos",
"Fuyu",
]
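
With `Kosmos` and `Fuyu` now exported at the package root, both can be imported straight from `swarms.models`. Kosmos is demonstrated above; here is a minimal sketch for Fuyu (the `(text, img)` call signature and the example image path are assumptions, mirroring the other multi-modal examples in this README):

```python
from swarms.models import Fuyu

# Instantiate the Fuyu multi-modal model
fuyu = Fuyu()

# Ask a question about a local image (hypothetical path)
out = fuyu("Describe this image", "docs.jpg")
print(out)
```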
