diff --git a/mm_agent_example.py b/mm_agent_example.py
index edd1e397..ca1e6051 100644
--- a/mm_agent_example.py
+++ b/mm_agent_example.py
@@ -1,6 +1,10 @@
 from swarms.agents import MultiModalAgent
 
-node = MultiModalAgent()
+load_dict = {
+    "ImageCaptioning": "default_device"
+}
+
+node = MultiModalAgent(load_dict)
 
 text = node.run_text("What is your name? Generate a picture of yourself")
 
diff --git a/swarms/agents/multi_modal_visual_agent.py b/swarms/agents/multi_modal_visual_agent.py
index 454ee8fb..2ea3e5c3 100644
--- a/swarms/agents/multi_modal_visual_agent.py
+++ b/swarms/agents/multi_modal_visual_agent.py
@@ -1637,9 +1637,9 @@ class MultiModalAgent:
         self.langigage = language
 
         # if load_dict is None:
-        load_dict = {
-            "ImageCaptioning": "default_device"
-        }
+        # self.load_dict = {
+        #     "ImageCaptioning": "default_device"
+        # }
 
         self.agent = MultiModalVisualAgent(
             load_dict,