pull/362/head^2
Kye 12 months ago
parent d5c0ca0128
commit e70b401b54

@@ -824,6 +824,115 @@ out = llm.run(task=task, img=img)
print(out)
```
### `GPT4Vision`
```python
from swarms import GPT4VisionAPI
# Initialize with default API key and custom max_tokens
api = GPT4VisionAPI(max_tokens=1000)
# Define the task and image URL
task = "Describe the scene in the image."
img = "https://i.imgur.com/4P4ZRxU.jpeg"
# Run the GPT-4 Vision model
response = api.run(task, img)
# Print the model's response
print(response)
```
### `QwenVLMultiModal`
A radically simple interface for QwenVLMultiModal that comes complete with quantization: to turn it on, just set `quantize` to `True`!
```python
from swarms import QwenVLMultiModal
# Instantiate the QwenVLMultiModal model
model = QwenVLMultiModal(
    model_name="Qwen/Qwen-VL-Chat",
    device="cuda",
    quantize=True,
)
# Run the model
response = model(
    "Hello, how are you?", "https://example.com/image.jpg"
)
# Print the response
print(response)
```
### `Kosmos`
- Multi-modal model from Microsoft!
```python
from swarms import Kosmos
# Initialize the model
model = Kosmos()
# Generate
out = model.run("Analyze the reciepts in this image", "docs.jpg")
# Print the output
print(out)
```
### `Idefics`
- Multi-modal model from the Hugging Face team!
```python
# Import the idefics model from the swarms.models module
from swarms.models import Idefics
# Create an instance of the idefics model
model = Idefics()
# Define user input with an image URL and chat with the model
user_input = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
)
response = model.chat(user_input)
print(response)
# Define another user input with an image URL and chat with the model
user_input = (
    "User: And who is that?"
    " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
)
response = model.chat(user_input)
print(response)
# Set the checkpoint of the model to "new_checkpoint"
model.set_checkpoint("new_checkpoint")
# Set the device of the model to "cpu"
model.set_device("cpu")
# Set the maximum length of the chat to 200
model.set_max_length(200)
# Clear the chat history of the model
model.clear_chat_history()
```
## Radically Simple AI Model APIs
We provide a vast array of language and multi-modal model APIs for you to generate text, images, music, speech, and even videos. Get started below:
-----
### `Anthropic`
```python
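# The body of this example is collapsed in the diff view. A minimal sketch,
# assuming the standard swarms callable interface and that the Anthropic API
# key is read from the ANTHROPIC_API_KEY environment variable:
from swarms.models import Anthropic

# Instantiate the model wrapper
model = Anthropic()

# Generate a completion for a prompt and print it
completion = model("What is the capital of France?")
print(completion)
```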
@@ -900,23 +1009,6 @@ print(image_url)
```
### `GPT4Vision`
```python
from swarms import GPT4VisionAPI
# Initialize with default API key and custom max_tokens
api = GPT4VisionAPI(max_tokens=1000)
# Define the task and image URL
task = "Describe the scene in the image."
img = "https://i.imgur.com/4P4ZRxU.jpeg"
# Run the GPT-4 Vision model
response = api.run(task, img)
# Print the model's response
print(response)
```
### Text to Video with `ZeroscopeTTV`
@@ -938,7 +1030,7 @@ print(video_path)
```
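The Zeroscope example itself is collapsed in this view; below is a minimal sketch of the pattern implied by the `print(video_path)` context above (the exact call signature is an assumption, mirroring the other model examples in this README):

```python
from swarms.models import ZeroscopeTTV

# Initialize the text-to-video model
zeroscope = ZeroscopeTTV()

# Generate a video from a text prompt; the call returns a path to the video
video_path = zeroscope("A person is walking on the street")
print(video_path)
```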
-### ModelScope
+<!-- ### ModelScope
```python
from swarms.models import ModelScopeAutoModel
@@ -960,32 +1052,9 @@ cog_agent = CogAgent()
# Run the model on the tests
cog_agent.run("Describe this scene", "images/1.jpg")
-```
+``` -->
### `QwenVLMultiModal`
A radically simple interface for QwenVLMultiModal that comes complete with quantization: to turn it on, just set `quantize` to `True`!
```python
from swarms import QwenVLMultiModal
# Instantiate the QwenVLMultiModal model
model = QwenVLMultiModal(
    model_name="Qwen/Qwen-VL-Chat",
    device="cuda",
    quantize=True,
)
# Run the model
response = model(
    "Hello, how are you?", "https://example.com/image.jpg"
)
# Print the response
print(response)
```
----

@@ -1,7 +1,10 @@
-from swarms.models import idefics
+# Import the idefics model from the swarms.models module
+from swarms.models import Idefics
-model = idefics()
+# Create an instance of the idefics model
+model = Idefics()
+# Define user input with an image URL and chat with the model
user_input = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
@@ -9,6 +12,7 @@ user_input = (
response = model.chat(user_input)
print(response)
+# Define another user input with an image URL and chat with the model
user_input = (
    "User: And who is that?"
    " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
@@ -16,7 +20,14 @@ user_input = (
response = model.chat(user_input)
print(response)
+# Set the checkpoint of the model to "new_checkpoint"
model.set_checkpoint("new_checkpoint")
+# Set the device of the model to "cpu"
model.set_device("cpu")
+# Set the maximum length of the chat to 200
+model.set_max_length(200)
+# Clear the chat history of the model
+model.clear_chat_history()

@@ -0,0 +1,10 @@
+from swarms import Kosmos
+# Initialize the model
+model = Kosmos()
+# Generate
+out = model.run("Analyze the receipts in this image", "docs.jpg")
+# Print the output
+print(out)

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "swarms"
-version = "3.8.1"
+version = "3.8.2"
description = "Swarms - Pytorch"
license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"]

@@ -48,6 +48,8 @@ from swarms.models.vip_llava import VipLlavaMultiModal # noqa: E402
from swarms.models.llava import LavaMultiModal # noqa: E402
from swarms.models.qwen import QwenVLMultiModal # noqa: E402
from swarms.models.clipq import CLIPQ # noqa: E402
+from swarms.models.kosmos_two import Kosmos # noqa: E402
+from swarms.models.fuyu import Fuyu # noqa: E402
# from swarms.models.dalle3 import Dalle3
# from swarms.models.distilled_whisperx import DistilWhisperModel # noqa: E402
@@ -79,7 +81,6 @@ __all__ = [
"Zephyr",
"BaseMultiModalModel",
"Idefics",
# "Kosmos",
"Vilt",
"Nougat",
"LayoutLMDocumentQA",
@@ -102,9 +103,6 @@ __all__ = [
"AudioModality",
"VideoModality",
"MultimodalData",
# "CogAgent",
# "ModelScopePipeline",
# "ModelScopeAutoModel",
"TogetherLLM",
"TimmModel",
"UltralyticsModel",
@@ -112,4 +110,6 @@ __all__ = [
"LavaMultiModal",
"QwenVLMultiModal",
"CLIPQ",
"Kosmos",
"Fuyu",
]
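
With `Kosmos` and `Fuyu` now exported at the package root, both can be imported straight from `swarms.models`. Kosmos is demonstrated above; here is a minimal sketch for Fuyu (the `(text, img)` call signature and the example image path are assumptions, mirroring the other multi-modal examples in this README):

```python
from swarms.models import Fuyu

# Instantiate the Fuyu multi-modal model
fuyu = Fuyu()

# Ask a question about a local image (hypothetical path)
out = fuyu("Describe this image", "docs.jpg")
print(out)
```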
