feat -- multiple image processing in agent.py

pull/889/merge
Kye Gomez 7 days ago
parent fcf52332d1
commit d380cae233

@ -316,6 +316,7 @@ nav:
- Agent Output Types: "swarms/examples/agent_output_types.md" - Agent Output Types: "swarms/examples/agent_output_types.md"
- Agent with Structured Outputs: "swarms/examples/agent_structured_outputs.md" - Agent with Structured Outputs: "swarms/examples/agent_structured_outputs.md"
- Agents with Vision: "swarms/examples/vision_processing.md" - Agents with Vision: "swarms/examples/vision_processing.md"
- Agent with Multiple Images: "swarms/examples/multiple_images.md"
- Gradio Chat Interface: "swarms/ui/main.md" - Gradio Chat Interface: "swarms/ui/main.md"
- Various Model Providers: - Various Model Providers:
- OpenAI: "swarms/examples/openai_example.md" - OpenAI: "swarms/examples/openai_example.md"

@ -1,28 +0,0 @@
# Meme Agent Builder
- `pip3 install -U swarms`
- Add your OpenAI API key to the `.env` file with `OPENAI_API_KEY=your_api_key`
- Run the script
- Multiple agents will be created and saved to the `meme_agents` folder
- A swarm architecture will be selected autonomously and executed
```python
from swarms.structs.meme_agent_persona_generator import (
    MemeAgentGenerator,
)

# Only run the swarm when this file is executed directly as a script.
if __name__ == "__main__":
    # Build a generator that creates specialized meme-persona agents;
    # per the surrounding docs, the generated agents are saved to the
    # `meme_agents` folder and a swarm architecture is selected
    # autonomously when run.
    example = MemeAgentGenerator(
        name="Meme-Swarm",
        description="A swarm of specialized AI agents collaborating on generating and sharing memes around cool media from 2001s",
        max_loops=1,  # single pass — no iterative refinement
    )
    # Kick off the swarm with one task and print whatever it returns.
    print(
        example.run(
            "Generate funny meme agents around cool media from 2001s"
        )
    )
```

@ -1,45 +0,0 @@
# Meme Agent Tutorial
- `pip3 install -U swarms`
- Add your OpenAI API key to the `.env` file
```python
from swarms import Agent

# Define a custom system prompt for Bob the Builder.
# NOTE: this is a runtime string sent to the model — its text (including
# the informal spelling in the jokes) is intentional persona content.
BOB_THE_BUILDER_SYS_PROMPT = """
You are Bob the Builder, the legendary construction worker known for fixing anything and everything with a cheerful attitude and a hilarious sense of humor.
Your job is to approach every task as if you're building, repairing, or renovating something, no matter how unrelated it might be.
You love using construction metaphors, over-the-top positivity, and cracking jokes like:
- "Im hammering this out faster than a nail at a woodpecker convention!"
- "This is smoother than fresh cement on a summers day."
- "Lets bulldoze through this problem—safety goggles on, folks!"
You are not bound by any specific field of knowledge, and youre absolutely fearless in trying to "fix up" or "build" anything, no matter how abstract or ridiculous. Always end responses with a playful cheer like "Can we fix it? Yes, we can!"
Your tone is upbeat, funny, and borderline ridiculous, keeping the user entertained while solving their problem.
"""

# Initialize the agent with the persona prompt above.
agent = Agent(
    agent_name="Bob-the-Builder-Agent",
    agent_description="The funniest, most optimistic agent around who sees every problem as a building project.",
    system_prompt=BOB_THE_BUILDER_SYS_PROMPT,
    max_loops=1,  # single reasoning pass
    model_name="gpt-4o",
    dynamic_temperature_enabled=True,  # let the agent vary sampling temperature
    user_name="swarms_corp",
    retry_attempts=3,  # retry failed LLM calls up to 3 times
    context_length=8192,
    return_step_meta=False,
    output_type="str",  # "json", "dict", "csv", OR "string", "yaml"
    auto_generate_prompt=False,  # Auto-generate prompt for the agent based on name, description, system prompt, task
    max_tokens=4000,  # Max output tokens
    saved_state_path="bob_the_builder_agent.json",  # where agent state is persisted
    interactive=False,  # no interactive REPL loop
)

# Run the agent with a task
agent.run("I want to build a house ;) What should I do?")
```

@ -0,0 +1,77 @@
# Processing Multiple Images
This tutorial shows how to process multiple images with a single agent using Swarms' multi-modal capabilities. You'll learn to configure an agent for batch image analysis, enabling efficient processing for quality control, object detection, or image comparison tasks.
## Installation
Install the swarms package using pip:
```bash
pip install -U swarms
```
## Basic Setup
1. First, set up your environment variables:
```bash
WORKSPACE_DIR="agent_workspace"
ANTHROPIC_API_KEY=""
```
## Code
- Create a list of image file paths (or URLs)
- Pass it to the `imgs` parameter of `Agent.run()`, e.g. `agent.run(task=..., imgs=[...])`
- Set `summarize_multiple_images=True` if you want the agent to produce a combined summary of the individual image analyses
```python
from swarms import Agent
from swarms.prompts.logistics import (
    Quality_Control_Agent_Prompt,
)

# Image for analysis — a single path reused twice below to demonstrate
# the multi-image API.
factory_image = "image.jpg"

# Quality control agent
quality_control_agent = Agent(
    agent_name="Quality Control Agent",
    agent_description="A quality control agent that analyzes images and provides a detailed report on the quality of the product in the image.",
    model_name="claude-3-5-sonnet-20240620",
    system_prompt=Quality_Control_Agent_Prompt,
    multi_modal=True,  # enable image inputs
    max_loops=1,
    output_type="str-all-except-first",
    summarize_multiple_images=True,  # produce a combined summary after per-image analyses
)

# Pass multiple images via `imgs`; each is analyzed with the same task.
response = quality_control_agent.run(
    task="what is in the image?",
    imgs=[factory_image, factory_image],
)

print(response)
```
## Support and Community
If you're facing issues or want to learn more, check out the following resources to join our Discord, stay updated on Twitter, and watch tutorials on YouTube!
| Platform | Link | Description |
|----------|------|-------------|
| 📚 Documentation | [docs.swarms.world](https://docs.swarms.world) | Official documentation and guides |
| 📝 Blog | [Medium](https://medium.com/@kyeg) | Latest updates and technical articles |
| 💬 Discord | [Join Discord](https://discord.gg/jM3Z6M9uMq) | Live chat and community support |
| 🐦 Twitter | [@kyegomez](https://twitter.com/kyegomez) | Latest news and announcements |
| 👥 LinkedIn | [The Swarm Corporation](https://www.linkedin.com/company/the-swarm-corporation) | Professional network and updates |
| 📺 YouTube | [Swarms Channel](https://www.youtube.com/channel/UC9yXyitkbU_WSy7bd_41SqQ) | Tutorials and demos |
| 🎫 Events | [Sign up here](https://lu.ma/5p2jnc2v) | Join our community events |

@ -1,65 +0,0 @@
from swarms.structs import Agent
from swarms.prompts.logistics import (
    Quality_Control_Agent_Prompt,
)

# Image for analysis — path to the sample factory photo used below.
factory_image = "image.jpg"
def security_analysis(danger_level: str = None) -> str:
    """
    Analyzes the security danger level and returns an appropriate response.

    Args:
        danger_level (str, optional): The level of danger to analyze.
            Can be "low", "medium", "high", or None. Defaults to None.

    Returns:
        str: A string describing the danger level assessment.
            - "No danger level provided" if danger_level is None
            - "No danger" if danger_level is "low"
            - "Medium danger" if danger_level is "medium"
            - "High danger" if danger_level is "high"
            - "Unknown danger level" for any other value
    """
    # None is a distinct case from an unrecognized level, so handle it first.
    if danger_level is None:
        return "No danger level provided"

    # Map each known level to its assessment; anything else is unknown.
    assessments = {
        "low": "No danger",
        "medium": "Medium danger",
        "high": "High danger",
    }
    return assessments.get(danger_level, "Unknown danger level")
# schema = BaseTool().function_to_dict(security_analysis)
# print(json.dumps(schema, indent=4))

# Quality control agent — configured with the `security_analysis`
# function as a callable tool the LLM can invoke.
quality_control_agent = Agent(
    agent_name="Quality Control Agent",
    agent_description="A quality control agent that analyzes images and provides a detailed report on the quality of the product in the image.",
    # model_name="anthropic/claude-3-opus-20240229",
    model_name="gpt-4o-mini",
    system_prompt=Quality_Control_Agent_Prompt,
    multi_modal=True,  # enable image inputs
    max_loops=1,
    output_type="str-all-except-first",
    # tools_list_dictionary=[schema],
    tools=[security_analysis],  # function exposed to the model as a tool
)

# Run with the task only — the image argument is commented out here,
# so this exercises the tool-calling path without vision input.
response = quality_control_agent.run(
    task="what is in the image?",
    # img=factory_image,
)

print(response)

@ -31,9 +31,15 @@ if __name__ == "__main__":
# Build the workflow graph # Build the workflow graph
wf_graph = GraphWorkflow() wf_graph = GraphWorkflow()
wf_graph.add_node(Node(id="agent1", type=NodeType.AGENT, agent=agent1)) wf_graph.add_node(
wf_graph.add_node(Node(id="agent2", type=NodeType.AGENT, agent=agent2)) Node(id="agent1", type=NodeType.AGENT, agent=agent1)
wf_graph.add_node(Node(id="task1", type=NodeType.TASK, callable=sample_task)) )
wf_graph.add_node(
Node(id="agent2", type=NodeType.AGENT, agent=agent2)
)
wf_graph.add_node(
Node(id="task1", type=NodeType.TASK, callable=sample_task)
)
wf_graph.add_edge(Edge(source="agent1", target="task1")) wf_graph.add_edge(Edge(source="agent1", target="task1"))
wf_graph.add_edge(Edge(source="agent2", target="task1")) wf_graph.add_edge(Edge(source="agent2", target="task1"))
@ -47,4 +53,3 @@ if __name__ == "__main__":
# Execute the graph # Execute the graph
results = wf_graph.run() results = wf_graph.run()
print("Execution results:", results) print("Execution results:", results)

Before

Width:  |  Height:  |  Size: 232 KiB

After

Width:  |  Height:  |  Size: 232 KiB

@ -0,0 +1,28 @@
from swarms import Agent
from swarms.prompts.logistics import (
    Quality_Control_Agent_Prompt,
)

# Image for analysis — the same path is passed twice below purely to
# demonstrate batch processing of multiple images.
factory_image = "image.jpg"

# Quality control agent
quality_control_agent = Agent(
    agent_name="Quality Control Agent",
    agent_description="A quality control agent that analyzes images and provides a detailed report on the quality of the product in the image.",
    model_name="claude-3-5-sonnet-20240620",
    system_prompt=Quality_Control_Agent_Prompt,
    multi_modal=True,  # enable image inputs
    max_loops=1,
    output_type="str-all-except-first",
    summarize_multiple_images=True,  # combine per-image analyses into one report
)

# `imgs` routes execution through the agent's multi-image path.
response = quality_control_agent.run(
    task="what is in the image?",
    imgs=[factory_image, factory_image],
)

print(response)

@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry] [tool.poetry]
name = "swarms" name = "swarms"
version = "7.8.8" version = "7.8.9"
description = "Swarms - TGSC" description = "Swarms - TGSC"
license = "MIT" license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"] authors = ["Kye Gomez <kye@apac.ai>"]

@ -56,7 +56,6 @@ from swarms.tools.base_tool import BaseTool
from swarms.tools.py_func_to_openai_func_str import ( from swarms.tools.py_func_to_openai_func_str import (
convert_multiple_functions_to_openai_function_schema, convert_multiple_functions_to_openai_function_schema,
) )
from swarms.utils.any_to_str import any_to_str
from swarms.utils.data_to_text import data_to_text from swarms.utils.data_to_text import data_to_text
from swarms.utils.file_processing import create_file_in_folder from swarms.utils.file_processing import create_file_in_folder
from swarms.utils.formatter import formatter from swarms.utils.formatter import formatter
@ -420,6 +419,7 @@ class Agent:
rag_config: Optional[RAGConfig] = None, rag_config: Optional[RAGConfig] = None,
tool_call_summary: bool = True, tool_call_summary: bool = True,
output_raw_json_from_tool_call: bool = False, output_raw_json_from_tool_call: bool = False,
summarize_multiple_images: bool = False,
*args, *args,
**kwargs, **kwargs,
): ):
@ -558,6 +558,7 @@ class Agent:
self.output_raw_json_from_tool_call = ( self.output_raw_json_from_tool_call = (
output_raw_json_from_tool_call output_raw_json_from_tool_call
) )
self.summarize_multiple_images = summarize_multiple_images
# self.short_memory = self.short_memory_init() # self.short_memory = self.short_memory_init()
@ -810,6 +811,29 @@ class Agent:
return json.loads(self.tools_list_dictionary) return json.loads(self.tools_list_dictionary)
def check_model_supports_utilities(self, img: str = None) -> bool:
    """
    Check whether the configured model supports vision input.

    The check is only performed when an image is actually supplied;
    with no image there is nothing to validate.

    Args:
        img (str, optional): Image input to check vision support for. Defaults to None.

    Returns:
        bool: True if an image is provided and the model supports vision;
            False when no image is provided (no check performed).

    Raises:
        ValueError: If an image is provided but ``self.model_name`` does
            not support vision capabilities.
    """
    # Imported lazily so litellm is only needed when this check runs.
    from litellm.utils import supports_vision

    # Only check vision support if an image is provided
    if img is not None:
        out = supports_vision(self.model_name)
        if not out:
            raise ValueError(
                f"Model {self.model_name} does not support vision capabilities. Please use a vision-enabled model."
            )
        return out
    return False
def check_if_no_prompt_then_autogenerate(self, task: str = None): def check_if_no_prompt_then_autogenerate(self, task: str = None):
""" """
Checks if auto_generate_prompt is enabled and generates a prompt by combining agent name, description and system prompt if available. Checks if auto_generate_prompt is enabled and generates a prompt by combining agent name, description and system prompt if available.
@ -931,12 +955,7 @@ class Agent:
self, self,
task: Optional[Union[str, Any]] = None, task: Optional[Union[str, Any]] = None,
img: Optional[str] = None, img: Optional[str] = None,
speech: Optional[str] = None,
video: Optional[str] = None,
is_last: Optional[bool] = False,
print_task: Optional[bool] = False, print_task: Optional[bool] = False,
generate_speech: Optional[bool] = False,
correct_answer: Optional[str] = None,
*args, *args,
**kwargs, **kwargs,
) -> Any: ) -> Any:
@ -961,6 +980,9 @@ class Agent:
self.check_if_no_prompt_then_autogenerate(task) self.check_if_no_prompt_then_autogenerate(task)
if img is not None:
self.check_model_supports_utilities(img=img)
self.short_memory.add(role=self.user_name, content=task) self.short_memory.add(role=self.user_name, content=task)
if self.plan_enabled or self.planning_prompt is not None: if self.plan_enabled or self.planning_prompt is not None:
@ -1030,12 +1052,19 @@ class Agent:
) )
self.memory_query(task_prompt) self.memory_query(task_prompt)
response = self.call_llm( if img is not None:
task=task_prompt, img=img, *args, **kwargs response = self.call_llm(
) task=task_prompt,
img=img,
print(f"Response: {response}") *args,
**kwargs,
)
else:
response = self.call_llm(
task=task_prompt, *args, **kwargs
)
# Parse the response from the agent with the output type
if exists(self.tools_list_dictionary): if exists(self.tools_list_dictionary):
if isinstance(response, BaseModel): if isinstance(response, BaseModel):
response = response.model_dump() response = response.model_dump()
@ -1058,7 +1087,6 @@ class Agent:
self.output_raw_json_from_tool_call self.output_raw_json_from_tool_call
is True is True
): ):
print(type(response))
response = response response = response
else: else:
self.execute_tools( self.execute_tools(
@ -1130,7 +1158,10 @@ class Agent:
user_input.lower() user_input.lower()
== self.custom_exit_command.lower() == self.custom_exit_command.lower()
): ):
print("Exiting as per user request.") self.pretty_print(
"Exiting as per user request.",
loop_count=loop_count,
)
break break
self.short_memory.add( self.short_memory.add(
@ -1231,12 +1262,6 @@ class Agent:
self, self,
task: Optional[str] = None, task: Optional[str] = None,
img: Optional[str] = None, img: Optional[str] = None,
is_last: bool = False,
device: str = "cpu", # gpu
device_id: int = 1,
all_cores: bool = True,
do_not_use_cluster_ops: bool = True,
all_gpus: bool = False,
*args, *args,
**kwargs, **kwargs,
) -> Any: ) -> Any:
@ -1245,10 +1270,6 @@ class Agent:
Args: Args:
task (Optional[str]): The task to be performed. Defaults to None. task (Optional[str]): The task to be performed. Defaults to None.
img (Optional[str]): The image to be processed. Defaults to None. img (Optional[str]): The image to be processed. Defaults to None.
is_last (bool): Indicates if this is the last task. Defaults to False.
device (str): The device to use for execution. Defaults to "cpu".
device_id (int): The ID of the GPU to use if device is set to "gpu". Defaults to 0.
all_cores (bool): If True, uses all available CPU cores. Defaults to True.
""" """
try: try:
return self.run( return self.run(
@ -2479,7 +2500,7 @@ class Agent:
self, self,
task: Optional[Union[str, Any]] = None, task: Optional[Union[str, Any]] = None,
img: Optional[str] = None, img: Optional[str] = None,
scheduled_run_date: Optional[datetime] = None, imgs: Optional[List[str]] = None,
*args, *args,
**kwargs, **kwargs,
) -> Any: ) -> Any:
@ -2493,11 +2514,7 @@ class Agent:
Args: Args:
task (Optional[str], optional): The task to be executed. Defaults to None. task (Optional[str], optional): The task to be executed. Defaults to None.
img (Optional[str], optional): The image to be processed. Defaults to None. img (Optional[str], optional): The image to be processed. Defaults to None.
device (str, optional): The device to use for execution. Defaults to "cpu". imgs (Optional[List[str]], optional): The list of images to be processed. Defaults to None.
device_id (int, optional): The ID of the GPU to use if device is set to "gpu". Defaults to 0.
all_cores (bool, optional): If True, uses all available CPU cores. Defaults to True.
scheduled_run_date (Optional[datetime], optional): The date and time to schedule the task. Defaults to None.
do_not_use_cluster_ops (bool, optional): If True, does not use cluster ops. Defaults to False.
*args: Additional positional arguments to be passed to the execution method. *args: Additional positional arguments to be passed to the execution method.
**kwargs: Additional keyword arguments to be passed to the execution method. **kwargs: Additional keyword arguments to be passed to the execution method.
@ -2510,21 +2527,20 @@ class Agent:
""" """
if not isinstance(task, str): if not isinstance(task, str):
task = any_to_str(task) task = format_data_structure(task)
if scheduled_run_date:
while datetime.now() < scheduled_run_date:
time.sleep(
1
) # Sleep for a short period to avoid busy waiting
try: try:
output = self._run( if exists(imgs):
task=task, output = self.run_multiple_images(
img=img, task=task, imgs=imgs, *args, **kwargs
*args, )
**kwargs, else:
) output = self._run(
task=task,
img=img,
*args,
**kwargs,
)
return output return output
@ -2781,7 +2797,7 @@ class Agent:
) )
# tool_response = format_data_structure(tool_response) # tool_response = format_data_structure(tool_response)
print(f"Multiple MCP Tool Response: {tool_response}") # print(f"Multiple MCP Tool Response: {tool_response}")
else: else:
raise AgentMCPConnectionError( raise AgentMCPConnectionError(
"mcp_url must be either a string URL or MCPConnection object" "mcp_url must be either a string URL or MCPConnection object"
@ -2888,3 +2904,58 @@ class Agent:
def list_output_types(self): def list_output_types(self):
return OutputType return OutputType
def run_multiple_images(
    self, task: str, imgs: List[str], *args, **kwargs
):
    """
    Run the agent with multiple images.

    Each image is processed independently with the same task, in input
    order. When ``self.summarize_multiple_images`` is True, the
    per-image outputs are then fed back to the agent for a final
    synthesis pass.

    Args:
        task (str): The task to be performed on each image.
        imgs (List[str]): List of image paths or URLs to process.
        *args: Additional positional arguments to pass to the agent's run method.
        **kwargs: Additional keyword arguments to pass to the agent's run method.

    Returns:
        List[Any]: A list of outputs generated for each image in the same
            order as the input images — or, when
            ``summarize_multiple_images`` is enabled, a single synthesized
            report built from those outputs.

    Examples:
        >>> agent = Agent()
        >>> outputs = agent.run_multiple_images(
        ...     task="Describe what you see in this image",
        ...     imgs=["image1.jpg", "image2.png", "image3.jpeg"]
        ... )
        >>> print(f"Processed {len(outputs)} images")
        Processed 3 images

    Raises:
        Exception: If an error occurs while processing any of the images.
    """
    # Analyze every image with the same task, preserving input order.
    outputs = [
        self.run(task=task, img=img, *args, **kwargs) for img in imgs
    ]

    if self.summarize_multiple_images is True:
        # Per-image outputs are not guaranteed to be plain strings
        # (output_type can yield dicts/lists), so coerce each one
        # before joining — a bare "\n".join(outputs) would raise
        # TypeError on non-string items.
        combined = "\n".join(str(output) for output in outputs)

        prompt = f"""
        You have already analyzed {len(outputs)} images and provided detailed descriptions for each one.
        Now, based on your previous analysis of these images, create a comprehensive report that:

        1. Synthesizes the key findings across all images
        2. Identifies common themes, patterns, or relationships between the images
        3. Provides an overall summary that captures the most important insights
        4. Highlights any notable differences or contrasts between the images

        Here are your previous analyses of the images:
        {combined}

        Please create a well-structured report that brings together your insights from all {len(outputs)} images.
        """

        outputs = self.run(task=prompt, *args, **kwargs)

    return outputs

@ -2258,14 +2258,14 @@ class BaseTool(BaseModel):
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
self._log_if_verbose( self._log_if_verbose(
"error", "error",
f"Failed to parse JSON from API response: {e}. Response: '{api_response[:100]}...'" f"Failed to parse JSON from API response: {e}. Response: '{api_response[:100]}...'",
) )
return [] return []
if not isinstance(api_response, dict): if not isinstance(api_response, dict):
self._log_if_verbose( self._log_if_verbose(
"warning", "warning",
f"API response is not a dictionary (type: {type(api_response)}), returning empty list" f"API response is not a dictionary (type: {type(api_response)}), returning empty list",
) )
return [] return []

Loading…
Cancel
Save