diff --git a/pyproject.toml b/pyproject.toml
index 0871ddaa..f0551c4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,9 +68,9 @@ rich = "13.5.2"
 sqlalchemy = "*"
 pgvector = "*"
 qdrant-client = "*"
-vllm = "*"
 sentence-transformers = "*"
 peft = "*"
+modelscope = "1.10.0"
 
 
 [tool.poetry.group.lint.dependencies]
diff --git a/requirements.txt b/requirements.txt
index 9944b616..238b32fb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -75,3 +75,4 @@ qdrant-client
 vllm
 sentence-transformers
 peft
+modelscope==1.10.0
\ No newline at end of file
diff --git a/swarms/agents/tool_agent.py b/swarms/agents/tool_agent.py
index 594a1863..bc34a476 100644
--- a/swarms/agents/tool_agent.py
+++ b/swarms/agents/tool_agent.py
@@ -61,6 +61,7 @@ class ToolAgent(AbstractLLM):
         print(generated_data)
 
     """
+
     def __init__(
         self,
         name: str,
@@ -108,7 +109,7 @@ class ToolAgent(AbstractLLM):
         except Exception as error:
             print(f"[Error] [ToolAgent] {error}")
             raise error
-    
+
     def __call__(self, task: str, *args, **kwargs):
         """Call self as a function.
 
@@ -118,4 +119,4 @@ class ToolAgent(AbstractLLM):
         Returns:
             _type_: _description_
         """
-        return self.run(task, *args, **kwargs)
\ No newline at end of file
+        return self.run(task, *args, **kwargs)
diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py
index 58701f64..640a7297 100644
--- a/swarms/models/__init__.py
+++ b/swarms/models/__init__.py
@@ -9,7 +9,7 @@ from swarms.models.openai_models import (
     OpenAIChat,
 )  # noqa: E402
 
-from swarms.models.vllm import vLLM  # noqa: E402
+# from swarms.models.vllm import vLLM  # noqa: E402
 from swarms.models.zephyr import Zephyr  # noqa: E402
 from swarms.models.biogpt import BioGPT  # noqa: E402
 from swarms.models.huggingface import HuggingfaceLLM  # noqa: E402
@@ -72,7 +72,7 @@ __all__ = [
     # "Dalle3",
     # "DistilWhisperModel",
     "GPT4VisionAPI",
-    "vLLM",
+    # "vLLM",
     "OpenAITTS",
     "Gemini",
     "Gigabind",
diff --git a/swarms/models/cog_agent.py b/swarms/models/cog_agent.py
new file mode 100644
index 00000000..7a3ff684
--- /dev/null
+++ b/swarms/models/cog_agent.py
@@ -0,0 +1,128 @@
+import torch
+from PIL import Image
+from modelscope import AutoModelForCausalLM, AutoTokenizer
+from swarms.models.base_multimodal_model import BaseMultiModalModel
+
+device_check = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+class CogAgent(BaseMultiModalModel):
+    """CogAgent
+
+    Multi-modal conversational agent that can be used to chat with
+    images and text. It is based on the CogAgent model from the
+    ModelScope library.
+
+    Attributes:
+        model_name (str): The name of the model to be used
+        tokenizer_name (str): The name of the tokenizer to be used
+        dtype (torch.bfloat16): The data type to be used
+        low_cpu_mem_usage (bool): Whether to use low CPU memory
+        load_in_4bit (bool): Whether to load in 4-bit
+        trust_remote_code (bool): Whether to trust remote code
+        device (str): The device to be used
+
+    Examples:
+        >>> from swarms.models.cog_agent import CogAgent
+        >>> cog_agent = CogAgent()
+        >>> cog_agent.run("How are you?", "images/1.jpg")
+        I'm fine. How are you?
+    """
+    def __init__(
+        self,
+        model_name: str = "ZhipuAI/cogagent-chat",
+        tokenizer_name: str = "I-ModelScope/vicuna-7b-v1.5",
+        dtype=torch.bfloat16,
+        low_cpu_mem_usage: bool = True,
+        load_in_4bit: bool = True,
+        trust_remote_code: bool = True,
+        device=device_check,
+        *args,
+        **kwargs,
+    ):
+        super().__init__()
+        self.model_name = model_name
+        self.tokenizer_name = tokenizer_name
+        self.dtype = dtype
+        self.low_cpu_mem_usage = low_cpu_mem_usage
+        self.load_in_4bit = load_in_4bit
+        self.trust_remote_code = trust_remote_code
+        self.device = device
+
+        self.model = (
+            AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                torch_dtype=self.dtype,
+                low_cpu_mem_usage=self.low_cpu_mem_usage,
+                load_in_4bit=self.load_in_4bit,
+                trust_remote_code=self.trust_remote_code,
+                *args,
+                **kwargs,
+            )
+            .to(self.device)
+            .eval()
+        )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.tokenizer_name
+        )
+
+    def run(self, task: str, img: str, *args, **kwargs):
+        """Run the model
+
+        Args:
+            task (str): The task to be performed
+            img (str): The image path
+
+        """
+        image = Image.open(img).convert("RGB")
+
+        input_by_model = self.model.build_conversation_input_ids(
+            self.tokenizer,
+            query=task,
+            history=[],
+            images=[image],
+        )
+
+        inputs = {
+            "input_ids": (
+                input_by_model["input_ids"]
+                .unsqueeze(0)
+                .to(self.device)
+            ),
+            "token_type_ids": (
+                input_by_model["token_type_ids"]
+                .unsqueeze(0)
+                .to(self.device)
+            ),
+            "attention_mask": (
+                input_by_model["attention_mask"]
+                .unsqueeze(0)
+                .to(self.device)
+            ),
+            "images": [
+                [
+                    input_by_model["images"][0]
+                    .to(self.device)
+                    .to(self.dtype)
+                ]
+            ],
+        }
+        if (
+            "cross_images" in input_by_model
+            and input_by_model["cross_images"]
+        ):
+            inputs["cross_images"] = [
+                [
+                    input_by_model["cross_images"][0]
+                    .to(self.device)
+                    .to(self.dtype)
+                ]
+            ]
+
+        with torch.no_grad():
+            outputs = self.model.generate(**inputs, **kwargs)
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            response = self.tokenizer.decode(outputs[0])
+            response = response.split("</s>")[0]
+            print(response)
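For reviewers, a minimal usage sketch of the CogAgent wrapper added by this patch, mirroring the Examples block in its docstring. The prompt and the "images/1.jpg" path are placeholders, and a GPU with enough memory for the 4-bit ZhipuAI/cogagent-chat checkpoint is assumed; this snippet is illustrative and not part of the patch itself.

    # Hypothetical smoke test for the new CogAgent wrapper (not part of the patch).
    # First use downloads the ZhipuAI/cogagent-chat and I-ModelScope/vicuna-7b-v1.5
    # checkpoints from ModelScope, so it needs network access and a CUDA device.
    from swarms.models.cog_agent import CogAgent

    cog_agent = CogAgent()  # uses the default model and tokenizer names above
    cog_agent.run("Describe this image.", "images/1.jpg")  # prints the model's reply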