
@@ -824,6 +824,115 @@ out = llm.run(task=task, img=img)
print(out)
```
### `GPT4Vision`
```python
from swarms import GPT4VisionAPI
# Initialize with default API key and custom max_tokens
api = GPT4VisionAPI(max_tokens=1000)
# Define the task and image URL
task = "Describe the scene in the image."
img = "https://i.imgur.com/4P4ZRxU.jpeg"
# Run the GPT-4 Vision model
response = api.run(task, img)
# Print the model's response
print(response)
```
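The example assumes an OpenAI key is already configured. A minimal setup sketch, assuming `GPT4VisionAPI` falls back to the `OPENAI_API_KEY` environment variable when no key is passed explicitly:
```python
import os

# Assumption: GPT4VisionAPI reads OPENAI_API_KEY when no key is passed.
# Replace the placeholder with your own key before running.
os.environ["OPENAI_API_KEY"] = "sk-..."
```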
### `QwenVLMultiModal`
A radically simple interface for QwenVLMultiModal that comes complete with quantization; to turn it on, just set `quantize=True`!
```python
from swarms import QwenVLMultiModal
# Instantiate the QwenVLMultiModal model
model = QwenVLMultiModal(
    model_name="Qwen/Qwen-VL-Chat",
    device="cuda",
    quantize=True,
)
# Run the model
response = model(
    "Hello, how are you?", "https://example.com/image.jpg"
)
# Print the response
print(response)
```
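Quantization loads the weights in a reduced-precision format, trading a little output fidelity for a much smaller memory footprint. To run at full precision instead, the sketch below simply omits the flag; it assumes `quantize` defaults to `False` when left off:
```python
from swarms import QwenVLMultiModal

# Full-precision load; assumes quantize defaults to False when omitted
model = QwenVLMultiModal(
    model_name="Qwen/Qwen-VL-Chat",
    device="cuda",
)

# Same (task, image) calling convention as the quantized example above
response = model(
    "What objects are in this image?", "https://example.com/image.jpg"
)
print(response)
```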
### `Kosmos`
- Multi-modal model from Microsoft!
```python
from swarms import Kosmos
# Initialize the model
model = Kosmos()
# Generate a response
out = model.run("Analyze the receipts in this image", "docs.jpg")
# Print the output
print(out)
```
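Because `run(task, img)` is a uniform interface, one Kosmos instance can be reused across several prompts without reloading the weights. A small sketch; the image paths here are placeholders:
```python
from swarms import Kosmos

# Load the model once and reuse it for multiple tasks
model = Kosmos()

# Placeholder image paths for illustration
tasks = [
    ("Describe this image in detail", "scene.jpg"),
    ("What text appears in this image?", "invoice.jpg"),
]

for task, img in tasks:
    print(model.run(task, img))
```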
### `Idefics`
- Multi-modal model from the Hugging Face team!
```python
# Import the idefics model from the swarms.models module
from swarms.models import Idefics
# Create an instance of the idefics model
model = Idefics()
# Define user input with an image URL and chat with the model
user_input = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
)
response = model.chat(user_input)
print(response)
# Define another user input with an image URL and chat with the model
user_input = (
    "User: And who is that?"
    " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
)
response = model.chat(user_input)
print(response)
# Set the checkpoint of the model to "new_checkpoint"
model.set_checkpoint("new_checkpoint")
# Set the device of the model to "cpu"
model.set_device("cpu")
# Set the maximum length of the chat to 200
model.set_max_length(200)
# Clear the chat history of the model
model.clear_chat_history()
```
## Radically Simple AI Model APIs
We provide a vast array of language and multi-modal model APIs for you to generate text, images, music, speech, and even videos. Get started below:
-----
### `Anthropic`
```python
@@ -900,23 +1009,6 @@ print(image_url)
```
-### `GPT4Vision`
-```python
-from swarms import GPT4VisionAPI
-# Initialize with default API key and custom max_tokens
-api = GPT4VisionAPI(max_tokens=1000)
-# Define the task and image URL
-task = "Describe the scene in the image."
-img = "https://i.imgur.com/4P4ZRxU.jpeg"
-# Run the GPT-4 Vision model
-response = api.run(task, img)
-# Print the model's response
-print(response)
-```
### Text to Video with `ZeroscopeTTV`
@@ -938,7 +1030,7 @@ print(video_path)
```
-### ModelScope
+<!-- ### ModelScope
```python
from swarms.models import ModelScopeAutoModel
@@ -960,32 +1052,9 @@ cog_agent = CogAgent()
# Run the model on the tests
cog_agent.run("Describe this scene", "images/1.jpg")
-```
+``` -->
-### `QwenVLMultiModal`
-A radically simple interface for QwenVLMultiModal comes complete with Quantization to turn it on just set quantize to true!
-```python
-from swarms import QwenVLMultiModal
-# Instantiate the QwenVLMultiModal model
-model = QwenVLMultiModal(
-    model_name="Qwen/Qwen-VL-Chat",
-    device="cuda",
-    quantize=True,
-)
-# Run the model
-response = model(
-    "Hello, how are you?", "https://example.com/image.jpg"
-)
-# Print the response
-print(response)
-```
----

@@ -1,7 +1,10 @@
-from swarms.models import idefics
+# Import the idefics model from the swarms.models module
+from swarms.models import Idefics
-model = idefics()
+# Create an instance of the idefics model
+model = Idefics()
# Define user input with an image URL and chat with the model
user_input = (
    "User: What is in this image?"
    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
@@ -9,6 +12,7 @@ user_input = (
response = model.chat(user_input)
print(response)
# Define another user input with an image URL and chat with the model
user_input = (
    "User: And who is that?"
    " https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"
@@ -16,7 +20,14 @@ user_input = (
response = model.chat(user_input)
print(response)
# Set the checkpoint of the model to "new_checkpoint"
model.set_checkpoint("new_checkpoint")
# Set the device of the model to "cpu"
model.set_device("cpu")
# Set the maximum length of the chat to 200
model.set_max_length(200)
# Clear the chat history of the model
model.clear_chat_history()

@@ -0,0 +1,10 @@
from swarms import Kosmos
# Initialize the model
model = Kosmos()
# Generate a response
out = model.run("Analyze the receipts in this image", "docs.jpg")
# Print the output
print(out)

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "swarms"
-version = "3.8.1"
+version = "3.8.2"
description = "Swarms - Pytorch"
license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"]

@@ -48,6 +48,8 @@ from swarms.models.vip_llava import VipLlavaMultiModal # noqa: E402
from swarms.models.llava import LavaMultiModal # noqa: E402
from swarms.models.qwen import QwenVLMultiModal # noqa: E402
from swarms.models.clipq import CLIPQ # noqa: E402
from swarms.models.kosmos_two import Kosmos # noqa: E402
from swarms.models.fuyu import Fuyu # noqa: E402
# from swarms.models.dalle3 import Dalle3
# from swarms.models.distilled_whisperx import DistilWhisperModel # noqa: E402
@@ -79,7 +81,6 @@ __all__ = [
"Zephyr", "Zephyr",
"BaseMultiModalModel", "BaseMultiModalModel",
"Idefics", "Idefics",
# "Kosmos",
"Vilt", "Vilt",
"Nougat", "Nougat",
"LayoutLMDocumentQA", "LayoutLMDocumentQA",
@@ -102,9 +103,6 @@ __all__ = [
"AudioModality", "AudioModality",
"VideoModality", "VideoModality",
"MultimodalData", "MultimodalData",
# "CogAgent",
# "ModelScopePipeline",
# "ModelScopeAutoModel",
"TogetherLLM", "TogetherLLM",
"TimmModel", "TimmModel",
"UltralyticsModel", "UltralyticsModel",
@@ -112,4 +110,6 @@ __all__ = [
"LavaMultiModal", "LavaMultiModal",
"QwenVLMultiModal", "QwenVLMultiModal",
"CLIPQ", "CLIPQ",
"Kosmos",
"Fuyu",
] ]
