diff --git a/swarms/agents/message.py b/swarms/agents/message.py
new file mode 100644
index 00000000..f1eabab1
--- /dev/null
+++ b/swarms/agents/message.py
@@ -0,0 +1,24 @@
+import datetime
+
+class Message:
+    """
+    Represents a message with a timestamp and optional metadata.
+
+    Usage
+    --------------
+    mes = Message(
+        sender="Kye",
+        content="message"
+    )
+
+    print(mes)
+    """
+
+    def __init__(self, sender, content, metadata=None):
+        self.timestamp = datetime.datetime.now()
+        self.sender = sender
+        self.content = content
+        self.metadata = metadata or {}
+
+    def __repr__(self):
+        return f"{self.timestamp} - {self.sender}: {self.content}"
diff --git a/swarms/agents/multi_modal_agent.py b/swarms/agents/multi_modal_agent.py
index bcaf62d4..0d8103e9 100644
--- a/swarms/agents/multi_modal_agent.py
+++ b/swarms/agents/multi_modal_agent.py
@@ -1,33 +1,73 @@
-from agent_protocol import Agent, Step, Task
-
-from swarms.agents.multi_modal_workers.multi_modal_agent import MultiModalVisualAgent
-
-class MultiModalVisualAgent:
-    def __init__(
-        self,
-        agent: MultiModalVisualAgent
-    ):
-        self.agent = agent
-
-    async def run(self, text: str) -> str:
-        #run the multi-modal visual agent with the give task
-        return self.agent.run_text(text)
-
-    async def __call__(self, text: str) -> str:
-        return self.agent.run(text)
-
-    async def plan(self, step: Step) -> Step:
-        task = Agent
-        pass
-
-    async def task_handler(self, task: Task):
-        await self.agent.run()
-
-    async def step_handler(self, step: Step):
-        if step.name == "plan":
-            await self.plan(step)
-        else:
-            await self.agent.run(step)
-
-        return step
-
+from swarms.agents.multi_modal_workers.multi_modal_agent import MultiModalVisualAgent
+
+
+class MultiModalAgent:
+    """
+    A user-friendly abstraction over MultiModalVisualAgent that provides a simple
+    interface for processing both text and images.
+
+    Parameters:
+        load_dict (dict, optional): Dictionary of class names and devices to load. Defaults to a basic configuration.
+        temperature (float, optional): Temperature for the OpenAI model. Defaults to 0.
+        language (str, optional): Default language for the agent. Defaults to "english".
+
+    Usage
+    --------------
+    agent = MultiModalAgent(
+        load_dict={"ImageCaptioning": "default_device"}
+    )
+
+    agent.run_text("Hello")
+    """
+
+    def __init__(
+        self,
+        load_dict=None,
+        temperature: float = 0,
+        language: str = "english"
+    ):
+        # Fall back to a basic configuration before constructing the agent
+        if load_dict is None:
+            load_dict = {
+                "ImageCaptioning": "default_device"
+            }
+
+        self.load_dict = load_dict
+        self.temperature = temperature
+        self.language = language
+
+        self.agent = MultiModalVisualAgent(
+            load_dict,
+            temperature
+        )
+
+    def run_text(self, text, language=None):
+        """Run a text prompt through the model"""
+        if language is None:
+            language = self.language
+
+        try:
+            self.agent.init_agent(language)
+            return self.agent.run_text(text)
+        except Exception as e:
+            return f"Error processing text: {str(e)}"
+
+    def run_img(self, image_path: str, language=None):
+        """Run an image through the model"""
+        if language is None:
+            language = self.language
+
+        try:
+            return self.agent.run_image(
+                image_path,
+                language
+            )
+        except Exception as error:
+            return f"Error processing image: {str(error)}"
+
+    def clear(self):
+        """Clear the agent's memory"""
+        try:
+            self.agent.clear_memory()
+        except Exception as e:
+            return f"Error cleaning memory: {str(e)}"
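
Taken together, the two files add a small messaging primitive and a text/image wrapper around the visual agent. A minimal usage sketch follows, assuming the package layout above and that the load_dict maps worker names to device strings; the "cuda:0" device and images/example.jpg path are illustrative, not part of this diff:

from swarms.agents.message import Message
from swarms.agents.multi_modal_agent import MultiModalAgent

# Hypothetical configuration: the available worker names and device
# strings depend on the underlying MultiModalVisualAgent setup.
agent = MultiModalAgent(load_dict={"ImageCaptioning": "cuda:0"})

# Both run methods return the model output, or an error string on failure.
print(agent.run_text("Describe what an image-captioning model does."))
print(agent.run_img("images/example.jpg"))  # hypothetical image path

# Message records sender, content, and a creation timestamp.
msg = Message(sender="Kye", content="message")
print(msg)  # e.g. "2023-08-20 12:00:00.000000 - Kye: message"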