parent 95967e311e
commit e62c665a82
@@ -0,0 +1,128 @@
import torch
from PIL import Image
from modelscope import AutoModelForCausalLM, AutoTokenizer

from swarms.models.base_multimodal_model import BaseMultiModalModel

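# Pick the GPU when one is available, otherwise fall back to the CPU.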
device_check = "cuda" if torch.cuda.is_available() else "cpu"


class CogAgent(BaseMultiModalModel):
    """CogAgent

    Multi-modal conversational agent that can be used to chat with
    images and text. It is based on the CogAgent model from the
    ModelScope library.

    Attributes:
        model_name (str): The name of the model to be used
        tokenizer_name (str): The name of the tokenizer to be used
        dtype (torch.bfloat16): The data type to be used
        low_cpu_mem_usage (bool): Whether to use low CPU memory
        load_in_4bit (bool): Whether to load in 4-bit
        trust_remote_code (bool): Whether to trust remote code
        device (str): The device to be used

    Examples:
        >>> from swarms.models.cog_agent import CogAgent
        >>> cog_agent = CogAgent()
        >>> cog_agent.run("How are you?", "images/1.jpg")
        <s> I'm fine. How are you? </s>
    """

    def __init__(
        self,
        model_name: str = "ZhipuAI/cogagent-chat",
        tokenizer_name: str = "I-ModelScope/vicuna-7b-v1.5",
        dtype=torch.bfloat16,
        low_cpu_mem_usage: bool = True,
        load_in_4bit: bool = True,
        trust_remote_code: bool = True,
        device=device_check,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.dtype = dtype
        self.low_cpu_mem_usage = low_cpu_mem_usage
        self.load_in_4bit = load_in_4bit
        self.trust_remote_code = trust_remote_code
        self.device = device

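        # Load the CogAgent weights once at construction, optionally in 4-bit,
        # then move the model to the target device and put it in eval mode.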
        self.model = (
            AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=self.dtype,
                low_cpu_mem_usage=self.low_cpu_mem_usage,
                load_in_4bit=self.load_in_4bit,
                trust_remote_code=self.trust_remote_code,
                *args,
                **kwargs,
            )
            .to(self.device)
            .eval()
        )

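        # The tokenizer comes from the separate Vicuna checkpoint named in
        # tokenizer_name.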
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.tokenizer_name
        )

    def run(self, task: str, img: str, *args, **kwargs):
        """Run the model on a text task and an image.

        Args:
            task (str): The task to be performed
            img (str): The path to the input image

        Returns:
            str: The model's response to the task
        """
        image = Image.open(img).convert("RGB")

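        # Let the CogAgent checkpoint assemble its own conversation inputs
        # from the query, the (empty) history, and the image.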
        input_by_model = self.model.build_conversation_input_ids(
            self.tokenizer,
            query=task,
            history=[],
            images=[image],
        )

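        # Add a batch dimension and move every tensor to the target device;
        # image tensors are also cast to the model dtype.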
        inputs = {
            "input_ids": (
                input_by_model["input_ids"]
                .unsqueeze(0)
                .to(self.device)
            ),
            "token_type_ids": (
                input_by_model["token_type_ids"]
                .unsqueeze(0)
                .to(self.device)
            ),
            "attention_mask": (
                input_by_model["attention_mask"]
                .unsqueeze(0)
                .to(self.device)
            ),
            "images": [
                [
                    input_by_model["images"][0]
                    .to(self.device)
                    .to(self.dtype)
                ]
            ],
        }
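        # Attach the high-resolution cross-attention images only when the
        # checkpoint actually provides them.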
        if (
            "cross_images" in input_by_model
            and input_by_model["cross_images"]
        ):
            inputs["cross_images"] = [
                [
                    input_by_model["cross_images"][0]
                    .to(self.device)
                    .to(self.dtype)
                ]
            ]

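        # Generate tokens without tracking gradients, strip the prompt tokens,
        # and decode only the newly generated continuation.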
        with torch.no_grad():
            outputs = self.model.generate(**inputs, **kwargs)
            outputs = outputs[:, inputs["input_ids"].shape[1] :]
            response = self.tokenizer.decode(outputs[0])
            response = response.split("</s>")[0]
            print(response)
            return response