# swarms/swarms/agents/omni_modal_agent.py

from langchain.base_language import BaseLanguageModel
from langchain_experimental.autonomous_agents.hugginggpt.repsonse_generator import (
    load_response_generator,
)
from langchain_experimental.autonomous_agents.hugginggpt.task_executor import (
    TaskExecutor,
)
from langchain_experimental.autonomous_agents.hugginggpt.task_planner import (
    load_chat_planner,
)
from transformers import load_tool

from swarms.structs.agent import Agent
from swarms.utils.loguru_logger import logger


class OmniModalAgent(Agent):
    """
    OmniModalAgent
    LLM -> Plans -> Tasks -> Tools -> Response

    Architecture:
    1. LLM: Language Model
    2. Chat Planner: Plans
    3. Task Executor: Tasks
    4. Tools: Tools

    The tool set is not configurable: every name in ``_HF_TOOL_NAMES`` is
    loaded with ``transformers.load_tool`` when the agent is constructed.

    Args:
        llm (BaseLanguageModel): Language model handed to the chat planner
            and the response generator.
        verbose (bool): Stored on the instance as ``self.verbose``; this
            class does not otherwise consult it.
        *args: Extra positional arguments forwarded to ``Agent.__init__``.
        **kwargs: Extra keyword arguments forwarded to ``Agent.__init__``.

    Usage:
        from swarms import OmniModalAgent, OpenAIChat

        llm = OpenAIChat()
        agent = OmniModalAgent(llm)
        response = agent.run(
            "Hello, how are you? Create an image of how you are doing!"
        )
    """

    # Names passed to ``transformers.load_tool``, in load order.
    _HF_TOOL_NAMES = (
        "document-question-answering",
        "image-captioning",
        "image-question-answering",
        "image-segmentation",
        "speech-to-text",
        "summarization",
        "text-classification",
        "text-question-answering",
        "translation",
        "huggingface-tools/text-to-image",
        "huggingface-tools/text-to-video",
        "text-to-speech",
        "huggingface-tools/text-download",
        "huggingface-tools/image-transformation",
    )

    def __init__(
        self,
        llm: BaseLanguageModel,
        verbose: bool = False,
        *args,
        **kwargs,
    ):
        # Star-args go before the keyword so the call reads in the order
        # Python actually binds it; the semantics are unchanged.
        super().__init__(*args, llm=llm, **kwargs)
        self.llm = llm
        self.verbose = verbose

        logger.info("Loading tools...")
        self.tools = [load_tool(name) for name in self._HF_TOOL_NAMES]

        # Load the chat planner and response generator
        self.chat_planner = load_chat_planner(llm)
        self.response_generator = load_response_generator(llm)
        # A TaskExecutor is built from a concrete plan, so there is nothing
        # to instantiate until run() is called.
        self.task_executor = None
        self.history = []

    def run(self, task: str) -> str:
        """Plan the task, execute the plan, and generate a response.

        Args:
            task (str): The user request passed to the chat planner as
                ``"input"``.

        Returns:
            str: The response produced by the response generator, or the
            string ``"Error running the agent: <error>"`` if any step
            raises an exception.
        """
        try:
            plan = self.chat_planner.plan(
                inputs={
                    "input": task,
                    "hf_tools": self.tools,
                }
            )
            # Keep the executor on the instance so callers can inspect the
            # most recent execution after run() returns.
            self.task_executor = TaskExecutor(plan)
            self.task_executor.run()

            return self.response_generator.generate(
                {"task_execution": self.task_executor}
            )
        except Exception as error:
            # Best-effort contract: report the failure as the return value
            # instead of raising, but keep the traceback in the log.
            logger.exception(f"Error running the agent: {error}")
            return f"Error running the agent: {error}"