From d08f61dbf1bd7e2b614fc22246038ecff48d0f56 Mon Sep 17 00:00:00 2001
From: Kye
Date: Wed, 4 Oct 2023 15:42:42 -0400
Subject: [PATCH] omnimodal agent

Former-commit-id: e1e7bd8988999be3f4d5ad551776b8c3a83adf82
---
 README.md                         |  3 +-
 docs/swarms/agents/omni_agent.md  | 61 +++++++++++++++++++++++++++++++
 mkdocs.yml                        |  1 +
 swarms/agents/omni_modal_agent.py | 28 --------------
 4 files changed, 64 insertions(+), 29 deletions(-)
 create mode 100644 docs/swarms/agents/omni_agent.md

diff --git a/README.md b/README.md
index 5d0e8dac..d54424e3 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,8 @@ We're hiring: Engineers, Researchers, Interns And, salesprofessionals to work on
 ## Usage
 We have a small gallery of examples to run here, [for more check out the docs to build your own agent and or swarms!](https://docs.apac.ai)

-### ``
+### `MultiAgentDebate`
+
 - `MultiAgentDebate` is a simple class that enables multi agent collaboration.

 ```python
diff --git a/docs/swarms/agents/omni_agent.md b/docs/swarms/agents/omni_agent.md
new file mode 100644
index 00000000..bb2dd849
--- /dev/null
+++ b/docs/swarms/agents/omni_agent.md
@@ -0,0 +1,61 @@
+# `OmniModalAgent` Documentation
+
+## Overview & Architectural Analysis
+The `OmniModalAgent` class sits at the core of an architecture that integrates planning, task execution, and response generation to support dynamic, tool-driven interactions. It spans multiple modalities, including natural language and image processing, with the aim of producing comprehensive, intelligent responses.
+
+### Architectural Components:
+1. **LLM (Language Model)**: The foundation of the agent; it underpins the understanding and generation of language-based interactions.
+2. **Chat Planner**: Drafts a step-by-step plan based on the user's input.
+3. **Task Executor**: Executes the tasks formulated by the planner.
+4. **Tools**: A collection of utilities for processing different task types, spanning areas such as image captioning and translation.
+
+## Structure & Organization
+
+### Table of Contents:
+1. Introduction
+2. Architectural Analysis
+3. Methods
+   - Initialization (`__init__`)
+   - Agent Runner (`run`)
+4. Usage Examples
+5. Error Messages & Exception Handling
+6. Summary
+
+### Methods
+
+#### Initialization (`__init__`):
+This method initializes the agent with a given language model and loads a suite of tools.
+
+Parameters:
+- **llm (BaseLanguageModel)**: The language model for the agent.
+
+During initialization, tools such as "document-question-answering" and "image-captioning" are loaded.
+
+#### Agent Runner (`run`):
+This method is the primary entry point of the `OmniModalAgent`. It takes an input, devises a plan using the chat planner, executes the plan with the task executor, and finally has the response generator craft a response based on the executed tasks.
+
+Parameters:
+- **input (str)**: The input string provided by the user.
+
+Returns:
+- **response (str)**: The generated response after executing the plan.
+
+## Examples & Use Cases
+
+### Usage:
+```python
+from swarms import OmniModalAgent, OpenAIChat
+
+llm = OpenAIChat()
+agent = OmniModalAgent(llm)
+response = agent.run("Hello, how are you? Create an image of how you are doing!")
+print(response)
+```
+This example shows how to instantiate the `OmniModalAgent` with a language model and run it on a sample input.
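+
+To make the pieces above concrete, here are a few hedged sketches. First, tool loading as described in the `__init__` section. The `load_tool` helper is the real `transformers` utility imported by this module; the exact tool list and how the agent stores the loaded tools are assumptions for illustration:
+
+```python
+from transformers import load_tool
+
+# Load a subset of the tools named above; each identifier maps to a
+# Hugging Face tool whose weights are fetched on first use.
+tools = [
+    load_tool(name)
+    for name in ["document-question-answering", "image-captioning"]
+]
+```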
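+
+Next, the plan-execute-respond pipeline inside `run`. This is illustrative only: the attribute and method names (`chat_planner.plan`, `task_executor.run`, `response_generator.generate`) are assumptions based on the description above, not the exact internal API:
+
+```python
+# Illustrative sketch of run(); attribute and method names are assumptions.
+def run_sketch(agent, user_input: str) -> str:
+    # 1. The chat planner drafts a step-by-step plan from the user's input.
+    plan = agent.chat_planner.plan(user_input)
+
+    # 2. The task executor carries out each planned step, invoking loaded
+    #    tools (e.g. image captioning, translation) as needed.
+    agent.task_executor.run(plan)
+
+    # 3. The response generator crafts the final reply from the executed tasks.
+    return agent.response_generator.generate(plan)
+```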
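+
+Finally, because tool loading and task execution can fail at runtime (see the next section), a defensive caller might wrap the call. This continues the usage example above:
+
+```python
+try:
+    response = agent.run("Describe this image and translate the caption to French.")
+except Exception as error:  # no dedicated exception types are documented yet
+    print(f"Agent run failed: {error}")
+```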
+
+## Error Messages & Exception Handling
+The current implementation does not define or document specific errors or exceptions. Future iterations may add error handling for issues such as tool-loading failures and task-execution errors.
+
+## Summary
+The `OmniModalAgent` combines multiple tools and processes into a single architecture that understands, plans, executes, and responds to user input. Developers building advanced interactions that span multiple domains will find this class valuable.
+
+For further details on internal tools and modules such as `BaseLanguageModel` and `TaskExecutor`, refer to their respective documentation.
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 8983fcc2..5611e6b0 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -85,6 +85,7 @@ nav:
     - Overview: "swarms/models/index.md"
     - HuggingFaceLLM: "swarms/models/hf.md"
     - Anthropic: "swarms/models/anthropic.md"
+    - OmniModalAgent: "swarms/agents/omni_agent.md"
 - Examples:
   - Overview: "examples/index.md"
   - Agents:
diff --git a/swarms/agents/omni_modal_agent.py b/swarms/agents/omni_modal_agent.py
index ee51ee6c..80eac09b 100644
--- a/swarms/agents/omni_modal_agent.py
+++ b/swarms/agents/omni_modal_agent.py
@@ -13,34 +13,6 @@ from langchain_experimental.autonomous_agents.hugginggpt.task_planner import (
 )
 from transformers import load_tool

-# from swarms.agents.multi_modal_workers.omni_agent.omni_chat import chat_huggingface
-# class OmniModalAgent:
-#     def __init__(
-#         self,
-#         api_key,
-#         api_endpoint,
-#         api_type
-#     ):
-#         self.api_key = api_key
-#         self.api_endpoint = api_endpoint
-#         self.api_type = api_type
-
-#     def chat(
-#         self,
-#         data
-#     ):
-#         """Chat with omni-modality model that uses huggingface to query for a specific model at run time. Translate text to speech, create images and more"""
-#         messages = data.get("messages")
-#         api_key = data.get("api_key", self.api_key)
-#         api_endpoint = data.get("api_endpoint", self.api_endpoint)
-#         api_type = data.get("api_type", self.api_type)
-
-#         if not(api_key and api_type and api_endpoint):
-#             # raise ValueError("Please provide api_key, api_type, and api_endpoint")
-
-#         # response = chat_huggingface(messages, api_key, api_type, api_endpoint)
-#         # return response
-

 class Step:
     def __init__(
         self,