diff --git a/pyproject.toml b/pyproject.toml
index 0871ddaa..f0551c4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,9 +68,9 @@ rich = "13.5.2"
 sqlalchemy = "*"
 pgvector = "*"
 qdrant-client = "*"
-vllm = "*"
 sentence-transformers = "*"
 peft = "*"
+modelscope = "1.10.0"
 
 
 [tool.poetry.group.lint.dependencies]
diff --git a/requirements.txt b/requirements.txt
index 9944b616..238b32fb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -75,3 +75,4 @@ qdrant-client
 vllm
 sentence-transformers
 peft
+modelscope==1.10.0
\ No newline at end of file
diff --git a/swarms/agents/tool_agent.py b/swarms/agents/tool_agent.py
index 594a1863..bc34a476 100644
--- a/swarms/agents/tool_agent.py
+++ b/swarms/agents/tool_agent.py
@@ -61,6 +61,7 @@ class ToolAgent(AbstractLLM):
         print(generated_data)
 
     """
+
     def __init__(
         self,
         name: str,
@@ -108,7 +109,7 @@ class ToolAgent(AbstractLLM):
         except Exception as error:
             print(f"[Error] [ToolAgent] {error}")
             raise error
-    
+
     def __call__(self, task: str, *args, **kwargs):
         """Call self as a function.
 
@@ -118,4 +119,4 @@ class ToolAgent(AbstractLLM):
         Returns:
             _type_: _description_
         """
-        return self.run(task, *args, **kwargs)
\ No newline at end of file
+        return self.run(task, *args, **kwargs)
diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py
index 58701f64..640a7297 100644
--- a/swarms/models/__init__.py
+++ b/swarms/models/__init__.py
@@ -9,7 +9,7 @@ from swarms.models.openai_models import (
     OpenAIChat,
 )  # noqa: E402
 
-from swarms.models.vllm import vLLM  # noqa: E402
+# from swarms.models.vllm import vLLM  # noqa: E402
 from swarms.models.zephyr import Zephyr  # noqa: E402
 from swarms.models.biogpt import BioGPT  # noqa: E402
 from swarms.models.huggingface import HuggingfaceLLM  # noqa: E402
@@ -72,7 +72,7 @@ __all__ = [
     # "Dalle3",
     # "DistilWhisperModel",
     "GPT4VisionAPI",
-    "vLLM",
+    # "vLLM",
     "OpenAITTS",
     "Gemini",
     "Gigabind",
diff --git a/swarms/models/cog_agent.py b/swarms/models/cog_agent.py
new file mode 100644
index 00000000..7a3ff684
--- /dev/null
+++ b/swarms/models/cog_agent.py
@@ -0,0 +1,128 @@
+import torch
+from PIL import Image
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+from swarms.models.base_multimodal_model import BaseMultiModalModel
+
+device_check = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+class CogAgent(BaseMultiModalModel):
+    """CogAgent
+
+    Multi-modal conversational agent that can be used to chat with
+    images and text. It is based on the CogAgent model from the
+    ModelScope library.
+
+    Attributes:
+        model_name (str): The name of the model to be used
+        tokenizer_name (str): The name of the tokenizer to be used
+        dtype (torch.bfloat16): The data type to be used
+        low_cpu_mem_usage (bool): Whether to use low CPU memory
+        load_in_4bit (bool): Whether to load in 4-bit
+        trust_remote_code (bool): Whether to trust remote code
+        device (str): The device to be used
+
+    Examples:
+        >>> from swarms.models.cog_agent import CogAgent
+        >>> cog_agent = CogAgent()
+        >>> cog_agent.run("How are you?", "images/1.jpg")
+        I'm fine. How are you?
+    """
+    def __init__(
+        self,
+        model_name: str = "ZhipuAI/cogagent-chat",
+        tokenizer_name: str = "I-ModelScope/vicuna-7b-v1.5",
+        dtype=torch.bfloat16,
+        low_cpu_mem_usage: bool = True,
+        load_in_4bit: bool = True,
+        trust_remote_code: bool = True,
+        device=device_check,
+        *args,
+        **kwargs,
+    ):
+        super().__init__()
+        self.model_name = model_name
+        self.tokenizer_name = tokenizer_name
+        self.dtype = dtype
+        self.low_cpu_mem_usage = low_cpu_mem_usage
+        self.load_in_4bit = load_in_4bit
+        self.trust_remote_code = trust_remote_code
+        self.device = device
+
+        self.model = (
+            AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                torch_dtype=self.dtype,
+                low_cpu_mem_usage=self.low_cpu_mem_usage,
+                load_in_4bit=self.load_in_4bit,
+                trust_remote_code=self.trust_remote_code,
+                *args,
+                **kwargs,
+            )
+            .to(self.device)
+            .eval()
+        )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.tokenizer_name
+        )
+
+    def run(self, task: str, img: str, *args, **kwargs):
+        """Run the model
+
+        Args:
+            task (str): The task to be performed
+            img (str): The image path
+
+        """
+        image = Image.open(img).convert("RGB")
+
+        input_by_model = self.model.build_conversation_input_ids(
+            self.tokenizer,
+            query=task,
+            history=[],
+            images=[image],
+        )
+
+        inputs = {
+            "input_ids": (
+                input_by_model["input_ids"]
+                .unsqueeze(0)
+                .to(self.device)
+            ),
+            "token_type_ids": (
+                input_by_model["token_type_ids"]
+                .unsqueeze(0)
+                .to(self.device)
+            ),
+            "attention_mask": (
+                input_by_model["attention_mask"]
+                .unsqueeze(0)
+                .to(self.device)
+            ),
+            "images": [
+                [
+                    input_by_model["images"][0]
+                    .to(self.device)
+                    .to(self.dtype)
+                ]
+            ],
+        }
+        if (
+            "cross_images" in input_by_model
+            and input_by_model["cross_images"]
+        ):
+            inputs["cross_images"] = [
+                [
+                    input_by_model["cross_images"][0]
+                    .to(self.device)
+                    .to(self.dtype)
+                ]
+            ]
+
+        with torch.no_grad():
+            outputs = self.model.generate(**inputs, **kwargs)
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            response = self.tokenizer.decode(outputs[0])
+            response = response.split("</s>")[0]
+            print(response)
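For reviewers, a minimal usage sketch of the CogAgent wrapper added by this patch, mirroring the Examples block in its docstring. The prompt and the "images/1.jpg" path are placeholders, and a GPU with enough memory for the 4-bit ZhipuAI/cogagent-chat checkpoint is assumed; this snippet is illustrative and not part of the patch itself.

    # Hypothetical smoke test for the new CogAgent wrapper (not part of the patch).
    # First use downloads the ZhipuAI/cogagent-chat and I-ModelScope/vicuna-7b-v1.5
    # checkpoints from ModelScope, so it needs network access and a CUDA device.
    from swarms.models.cog_agent import CogAgent

    cog_agent = CogAgent()  # uses the default model and tokenizer names above
    cog_agent.run("Describe this image.", "images/1.jpg")  # prints the model's reply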