From 8adfd438fe4dc8aa8950f619ae95f71683e2f730 Mon Sep 17 00:00:00 2001
From: Kye <kye@apacmediasolutions.com>
Date: Tue, 3 Oct 2023 15:14:10 -0400
Subject: [PATCH] omnimodal agent

Former-commit-id: 25c65062f7d4a85c0e4aa6c617bb5a13659954e1
---
 swarms/agents/multi_modal_visual_agent.py |   1 +
 swarms/agents/omni_modal_agent.py         | 122 +++++++++++++++++++++-
 2 files changed, 119 insertions(+), 4 deletions(-)

diff --git a/swarms/agents/multi_modal_visual_agent.py b/swarms/agents/multi_modal_visual_agent.py
index 871f90d0..1e8b1e3d 100644
--- a/swarms/agents/multi_modal_visual_agent.py
+++ b/swarms/agents/multi_modal_visual_agent.py
@@ -1586,6 +1586,7 @@ class MultiModalVisualAgent:
 ###### usage
 from swarms.agents.message import Message
 
+
 class MultiModalAgent:
     """
     A user-friendly abstraction over the MultiModalVisualAgent that provides a simple interface 
diff --git a/swarms/agents/omni_modal_agent.py b/swarms/agents/omni_modal_agent.py
index 025e3517..263a7778 100644
--- a/swarms/agents/omni_modal_agent.py
+++ b/swarms/agents/omni_modal_agent.py
@@ -1,6 +1,19 @@
-from swarms.agents.multi_modal_workers.omni_agent.omni_chat import chat_huggingface
+from typing import Dict, List, Optional
 
+from langchain.base_language import BaseLanguageModel
+from langchain.tools.base import BaseTool
+from langchain_experimental.autonomous_agents.hugginggpt.repsonse_generator import (
+    load_response_generator,
+)
+from langchain_experimental.autonomous_agents.hugginggpt.task_executor import (
+    TaskExecutor,
+)
+from langchain_experimental.autonomous_agents.hugginggpt.task_planner import (
+    load_chat_planner,
+)
+from transformers import load_tool
 
+# from swarms.agents.multi_modal_workers.omni_agent.omni_chat import chat_huggingface
 # class OmniModalAgent:
 #     def __init__(
 #         self, 
@@ -28,9 +41,110 @@ from swarms.agents.multi_modal_workers.omni_agent.omni_chat import chat_huggingf
 #         response = chat_huggingface(messages, api_key, api_type, api_endpoint)
 #         return response
 
+class Step:
+    """Plain record describing one step; the constructor only stores its arguments.
+
+    Attributes mirror the parameters: task, id, dep, args, tool.
+    """
+
+    def __init__(
+        self, task: str, id: int, dep: List[int], args: Dict[str, str], tool: BaseTool
+    ):
+        self.task = task
+        self.id = id  # shadows the builtin locally; name kept for keyword callers
+        self.dep = dep  # presumably ids of prerequisite steps - confirm with the planner
+        self.args = args
+        self.tool = tool
 
+class Plan:
+    """Holds a list of steps; ``str()`` and ``repr()`` return the same listing."""
+
+    def __init__(self, steps: List[Step]):
+        self.steps = steps
+
+    def __str__(self) -> str:
+        # str() of a list whose items are each step's own str().
+        return str([str(step) for step in self.steps])
+
+    def __repr__(self) -> str:  # fixed from ``__repr``, which repr() never calls
+        return str(self)
+
+
+
+
+
+class OmniModalAgent:
+    """
+    OmniModalAgent
+    LLM -> Plans -> Tasks -> Tools -> Response
+
+    Architecture:
+    1. LLM: Language Model
+    2. Chat Planner: Plans
+    3. Task Executor: Tasks
+    4. Tools: Tools
+
+    Args:
+        llm (BaseLanguageModel): Language Model
+        tools (List[BaseTool], optional): Tools handed to the planner. When
+            None, the default tools are loaded with ``tools_init``.
+
+    Usage:
+    from swarms import OmniModalAgent, OpenAIChat
+
+    llm = OpenAIChat()
+    agent = OmniModalAgent(llm)
+    response = agent.run("Hello, how are you? Create an image of how you are doing!")
+    """
+
+    def __init__(
+        self,
+        llm: BaseLanguageModel,
+        tools: Optional[List[BaseTool]] = None,
+    ):
+        self.llm = llm
+        self.tools = tools
+        self.chat_planner = load_chat_planner(llm)
+        self.response_generator = load_response_generator(llm)
+        # Annotation only: the executor is created on each ``run`` call.
+        self.task_executor: TaskExecutor
+        if tools is None:
+            # Makes the documented ``OmniModalAgent(llm)`` call work.
+            self.tools_init()
+
+    def tools_init(self):
+        """Replace ``self.tools`` with the default tools loaded via ``load_tool``."""
+        self.tools = [
+            load_tool(tool_name)
+            for tool_name in [
+                "document-question-answering",
+                "image-captioning",
+                "image-question-answering",
+                "image-segmentation",
+                "speech-to-text",
+                "summarization",
+                "text-classification",
+                "text-question-answering",
+                "translation",
+                "huggingface-tools/text-to-image",
+                "huggingface-tools/text-to-video",
+                "text-to-speech",
+                "huggingface-tools/text-download",
+                "huggingface-tools/image-transformation",
+            ]
+        ]
+
+    def run(self, input: str) -> str:
+        """Plan for ``input``, execute the plan, and return the generated response."""
+        plan = self.chat_planner.plan(
+            inputs={"input": input, "hf_tools": self.tools}
+        )
+        self.task_executor = TaskExecutor(plan)
+        self.task_executor.run()
+
+        response = self.response_generator.generate(
+            {"task_execution": self.task_executor}
+        )
+        return response
 
-# class OmniModalAgent:
-#     def __init__(
 
-#     )
\ No newline at end of file