From d380cae23383bc96f93438a8610977ecdb68daee Mon Sep 17 00:00:00 2001 From: Kye Gomez Date: Tue, 24 Jun 2025 16:50:58 -0700 Subject: [PATCH] feat -- multiple image processing in agent.py --- docs/mkdocs.yml | 1 + docs/swarms/examples/meme_agent_builder.md | 28 ---- docs/swarms/examples/meme_agents.md | 45 ----- docs/swarms/examples/multiple_images.md | 77 +++++++++ .../single_agent/vision/vision_and_tools.py | 65 -------- examples/structs/graph_workflow_basic.py | 13 +- .../vision/image.jpg => image.jpg | Bin multiple_image_processing.py | 28 ++++ pyproject.toml | 2 +- swarms/structs/agent.py | 157 +++++++++++++----- swarms/tools/base_tool.py | 4 +- 11 files changed, 232 insertions(+), 188 deletions(-) delete mode 100644 docs/swarms/examples/meme_agent_builder.md delete mode 100644 docs/swarms/examples/meme_agents.md create mode 100644 docs/swarms/examples/multiple_images.md delete mode 100644 examples/single_agent/vision/vision_and_tools.py rename examples/single_agent/vision/image.jpg => image.jpg (100%) create mode 100644 multiple_image_processing.py diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 3440933d..30da9a8b 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -316,6 +316,7 @@ nav: - Agent Output Types: "swarms/examples/agent_output_types.md" - Agent with Structured Outputs: "swarms/examples/agent_structured_outputs.md" - Agents with Vision: "swarms/examples/vision_processing.md" + - Agent with Multiple Images: "swarms/examples/multiple_images.md" - Gradio Chat Interface: "swarms/ui/main.md" - Various Model Providers: - OpenAI: "swarms/examples/openai_example.md" diff --git a/docs/swarms/examples/meme_agent_builder.md b/docs/swarms/examples/meme_agent_builder.md deleted file mode 100644 index 4a70ac87..00000000 --- a/docs/swarms/examples/meme_agent_builder.md +++ /dev/null @@ -1,28 +0,0 @@ -# Meme Agent Builder - -- `pip3 install -U swarms` -- Add your OpenAI API key to the `.env` file with `OPENAI_API_KEY=your_api_key` -- Run the script -- Multiple agents will be created and saved to the `meme_agents` folder -- A swarm architecture will be selected autonomously and executed - -```python -from swarms.structs.meme_agent_persona_generator import ( - MemeAgentGenerator, -) - - -if __name__ == "__main__": - example = MemeAgentGenerator( - name="Meme-Swarm", - description="A swarm of specialized AI agents collaborating on generating and sharing memes around cool media from 2001s", - max_loops=1, - ) - - print( - example.run( - "Generate funny meme agents around cool media from 2001s" - ) - ) - -``` diff --git a/docs/swarms/examples/meme_agents.md b/docs/swarms/examples/meme_agents.md deleted file mode 100644 index d8b23e79..00000000 --- a/docs/swarms/examples/meme_agents.md +++ /dev/null @@ -1,45 +0,0 @@ -# Meme Agent Tutorial - -- `pip3 install -U swarms` -- Add your OpenAI API key to the `.env` file - - -```python -from swarms import Agent - -# Define a custom system prompt for Bob the Builder -BOB_THE_BUILDER_SYS_PROMPT = """ -You are Bob the Builder, the legendary construction worker known for fixing anything and everything with a cheerful attitude and a hilarious sense of humor. -Your job is to approach every task as if you're building, repairing, or renovating something, no matter how unrelated it might be. -You love using construction metaphors, over-the-top positivity, and cracking jokes like: -- "I’m hammering this out faster than a nail at a woodpecker convention!" -- "This is smoother than fresh cement on a summer’s day." -- "Let’s bulldoze through this problem—safety goggles on, folks!" - -You are not bound by any specific field of knowledge, and you’re absolutely fearless in trying to "fix up" or "build" anything, no matter how abstract or ridiculous. Always end responses with a playful cheer like "Can we fix it? Yes, we can!" - -Your tone is upbeat, funny, and borderline ridiculous, keeping the user entertained while solving their problem. -""" - -# Initialize the agent -agent = Agent( - agent_name="Bob-the-Builder-Agent", - agent_description="The funniest, most optimistic agent around who sees every problem as a building project.", - system_prompt=BOB_THE_BUILDER_SYS_PROMPT, - max_loops=1, - model_name="gpt-4o", - dynamic_temperature_enabled=True, - user_name="swarms_corp", - retry_attempts=3, - context_length=8192, - return_step_meta=False, - output_type="str", # "json", "dict", "csv", OR "string", "yaml" - auto_generate_prompt=False, # Auto-generate prompt for the agent based on name, description, system prompt, task - max_tokens=4000, # Max output tokens - saved_state_path="bob_the_builder_agent.json", - interactive=False, -) - -# Run the agent with a task -agent.run("I want to build a house ;) What should I do?") -``` diff --git a/docs/swarms/examples/multiple_images.md b/docs/swarms/examples/multiple_images.md new file mode 100644 index 00000000..bfa66e2b --- /dev/null +++ b/docs/swarms/examples/multiple_images.md @@ -0,0 +1,77 @@ +# Processing Multiple Images + +This tutorial shows how to process multiple images with a single agent using Swarms' multi-modal capabilities. You'll learn to configure an agent for batch image analysis, enabling efficient processing for quality control, object detection, or image comparison tasks. + + +## Installation + +Install the swarms package using pip: + +```bash +pip install -U swarms +``` + +## Basic Setup + +1. First, set up your environment variables: + +```python +WORKSPACE_DIR="agent_workspace" +ANTHROPIC_API_KEY="" +``` + + +## Code + +- Create a list of images by their file paths + +- Pass it into the `Agent.run(imgs=[str])` parameter + +- Activate `summarize_multiple_images=True` if you want the agent to output a summary of the image analyses + + +```python +from swarms import Agent +from swarms.prompts.logistics import ( + Quality_Control_Agent_Prompt, +) + + +# Image for analysis +factory_image = "image.jpg" + +# Quality control agent +quality_control_agent = Agent( + agent_name="Quality Control Agent", + agent_description="A quality control agent that analyzes images and provides a detailed report on the quality of the product in the image.", + model_name="claude-3-5-sonnet-20240620", + system_prompt=Quality_Control_Agent_Prompt, + multi_modal=True, + max_loops=1, + output_type="str-all-except-first", + summarize_multiple_images=True, +) + + +response = quality_control_agent.run( + task="what is in the image?", + imgs=[factory_image, factory_image], +) + +print(response) +``` + +## Support and Community + +If you're facing issues or want to learn more, check out the following resources to join our Discord, stay updated on Twitter, and watch tutorials on YouTube! + +| Platform | Link | Description | +|----------|------|-------------| +| 📚 Documentation | [docs.swarms.world](https://docs.swarms.world) | Official documentation and guides | +| 📝 Blog | [Medium](https://medium.com/@kyeg) | Latest updates and technical articles | +| 💬 Discord | [Join Discord](https://discord.gg/jM3Z6M9uMq) | Live chat and community support | +| 🐦 Twitter | [@kyegomez](https://twitter.com/kyegomez) | Latest news and announcements | +| 👥 LinkedIn | [The Swarm Corporation](https://www.linkedin.com/company/the-swarm-corporation) | Professional network and updates | +| 📺 YouTube | [Swarms Channel](https://www.youtube.com/channel/UC9yXyitkbU_WSy7bd_41SqQ) | Tutorials and demos | +| 🎫 Events | [Sign up here](https://lu.ma/5p2jnc2v) | Join our community events | + diff --git a/examples/single_agent/vision/vision_and_tools.py b/examples/single_agent/vision/vision_and_tools.py deleted file mode 100644 index e330a66d..00000000 --- a/examples/single_agent/vision/vision_and_tools.py +++ /dev/null @@ -1,65 +0,0 @@ -from swarms.structs import Agent -from swarms.prompts.logistics import ( - Quality_Control_Agent_Prompt, -) - - -# Image for analysis -factory_image = "image.jpg" - - -def security_analysis(danger_level: str = None) -> str: - """ - Analyzes the security danger level and returns an appropriate response. - - Args: - danger_level (str, optional): The level of danger to analyze. - Can be "low", "medium", "high", or None. Defaults to None. - - Returns: - str: A string describing the danger level assessment. - - "No danger level provided" if danger_level is None - - "No danger" if danger_level is "low" - - "Medium danger" if danger_level is "medium" - - "High danger" if danger_level is "high" - - "Unknown danger level" for any other value - """ - if danger_level is None: - return "No danger level provided" - - if danger_level == "low": - return "No danger" - - if danger_level == "medium": - return "Medium danger" - - if danger_level == "high": - return "High danger" - - return "Unknown danger level" - - -# schema = BaseTool().function_to_dict(security_analysis) -# print(json.dumps(schema, indent=4)) - -# Quality control agent -quality_control_agent = Agent( - agent_name="Quality Control Agent", - agent_description="A quality control agent that analyzes images and provides a detailed report on the quality of the product in the image.", - # model_name="anthropic/claude-3-opus-20240229", - model_name="gpt-4o-mini", - system_prompt=Quality_Control_Agent_Prompt, - multi_modal=True, - max_loops=1, - output_type="str-all-except-first", - # tools_list_dictionary=[schema], - tools=[security_analysis], -) - - -response = quality_control_agent.run( - task="what is in the image?", - # img=factory_image, -) - -print(response) diff --git a/examples/structs/graph_workflow_basic.py b/examples/structs/graph_workflow_basic.py index 2d31ed1f..a51bcc5f 100644 --- a/examples/structs/graph_workflow_basic.py +++ b/examples/structs/graph_workflow_basic.py @@ -31,9 +31,15 @@ if __name__ == "__main__": # Build the workflow graph wf_graph = GraphWorkflow() - wf_graph.add_node(Node(id="agent1", type=NodeType.AGENT, agent=agent1)) - wf_graph.add_node(Node(id="agent2", type=NodeType.AGENT, agent=agent2)) - wf_graph.add_node(Node(id="task1", type=NodeType.TASK, callable=sample_task)) + wf_graph.add_node( + Node(id="agent1", type=NodeType.AGENT, agent=agent1) + ) + wf_graph.add_node( + Node(id="agent2", type=NodeType.AGENT, agent=agent2) + ) + wf_graph.add_node( + Node(id="task1", type=NodeType.TASK, callable=sample_task) + ) wf_graph.add_edge(Edge(source="agent1", target="task1")) wf_graph.add_edge(Edge(source="agent2", target="task1")) @@ -47,4 +53,3 @@ if __name__ == "__main__": # Execute the graph results = wf_graph.run() print("Execution results:", results) - diff --git a/examples/single_agent/vision/image.jpg b/image.jpg similarity index 100% rename from examples/single_agent/vision/image.jpg rename to image.jpg diff --git a/multiple_image_processing.py b/multiple_image_processing.py new file mode 100644 index 00000000..febb29fe --- /dev/null +++ b/multiple_image_processing.py @@ -0,0 +1,28 @@ +from swarms import Agent +from swarms.prompts.logistics import ( + Quality_Control_Agent_Prompt, +) + + +# Image for analysis +factory_image = "image.jpg" + +# Quality control agent +quality_control_agent = Agent( + agent_name="Quality Control Agent", + agent_description="A quality control agent that analyzes images and provides a detailed report on the quality of the product in the image.", + model_name="claude-3-5-sonnet-20240620", + system_prompt=Quality_Control_Agent_Prompt, + multi_modal=True, + max_loops=1, + output_type="str-all-except-first", + summarize_multiple_images=True, +) + + +response = quality_control_agent.run( + task="what is in the image?", + imgs=[factory_image, factory_image], +) + +print(response) diff --git a/pyproject.toml b/pyproject.toml index 7fe62d43..85cfd7ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "swarms" -version = "7.8.8" +version = "7.8.9" description = "Swarms - TGSC" license = "MIT" authors = ["Kye Gomez "] diff --git a/swarms/structs/agent.py b/swarms/structs/agent.py index 9c491abc..b138cef2 100644 --- a/swarms/structs/agent.py +++ b/swarms/structs/agent.py @@ -56,7 +56,6 @@ from swarms.tools.base_tool import BaseTool from swarms.tools.py_func_to_openai_func_str import ( convert_multiple_functions_to_openai_function_schema, ) -from swarms.utils.any_to_str import any_to_str from swarms.utils.data_to_text import data_to_text from swarms.utils.file_processing import create_file_in_folder from swarms.utils.formatter import formatter @@ -420,6 +419,7 @@ class Agent: rag_config: Optional[RAGConfig] = None, tool_call_summary: bool = True, output_raw_json_from_tool_call: bool = False, + summarize_multiple_images: bool = False, *args, **kwargs, ): @@ -558,6 +558,7 @@ class Agent: self.output_raw_json_from_tool_call = ( output_raw_json_from_tool_call ) + self.summarize_multiple_images = summarize_multiple_images # self.short_memory = self.short_memory_init() @@ -810,6 +811,29 @@ class Agent: return json.loads(self.tools_list_dictionary) + def check_model_supports_utilities(self, img: str = None) -> bool: + """ + Check if the current model supports vision capabilities. + + Args: + img (str, optional): Image input to check vision support for. Defaults to None. + + Returns: + bool: True if model supports vision and image is provided, False otherwise. + """ + from litellm.utils import supports_vision + + # Only check vision support if an image is provided + if img is not None: + out = supports_vision(self.model_name) + if not out: + raise ValueError( + f"Model {self.model_name} does not support vision capabilities. Please use a vision-enabled model." + ) + return out + + return False + def check_if_no_prompt_then_autogenerate(self, task: str = None): """ Checks if auto_generate_prompt is enabled and generates a prompt by combining agent name, description and system prompt if available. @@ -931,12 +955,7 @@ class Agent: self, task: Optional[Union[str, Any]] = None, img: Optional[str] = None, - speech: Optional[str] = None, - video: Optional[str] = None, - is_last: Optional[bool] = False, print_task: Optional[bool] = False, - generate_speech: Optional[bool] = False, - correct_answer: Optional[str] = None, *args, **kwargs, ) -> Any: @@ -961,6 +980,9 @@ class Agent: self.check_if_no_prompt_then_autogenerate(task) + if img is not None: + self.check_model_supports_utilities(img=img) + self.short_memory.add(role=self.user_name, content=task) if self.plan_enabled or self.planning_prompt is not None: @@ -1030,12 +1052,19 @@ class Agent: ) self.memory_query(task_prompt) - response = self.call_llm( - task=task_prompt, img=img, *args, **kwargs - ) - - print(f"Response: {response}") + if img is not None: + response = self.call_llm( + task=task_prompt, + img=img, + *args, + **kwargs, + ) + else: + response = self.call_llm( + task=task_prompt, *args, **kwargs + ) + # Parse the response from the agent with the output type if exists(self.tools_list_dictionary): if isinstance(response, BaseModel): response = response.model_dump() @@ -1058,7 +1087,6 @@ class Agent: self.output_raw_json_from_tool_call is True ): - print(type(response)) response = response else: self.execute_tools( @@ -1130,7 +1158,10 @@ class Agent: user_input.lower() == self.custom_exit_command.lower() ): - print("Exiting as per user request.") + self.pretty_print( + "Exiting as per user request.", + loop_count=loop_count, + ) break self.short_memory.add( @@ -1231,12 +1262,6 @@ class Agent: self, task: Optional[str] = None, img: Optional[str] = None, - is_last: bool = False, - device: str = "cpu", # gpu - device_id: int = 1, - all_cores: bool = True, - do_not_use_cluster_ops: bool = True, - all_gpus: bool = False, *args, **kwargs, ) -> Any: @@ -1245,10 +1270,6 @@ class Agent: Args: task (Optional[str]): The task to be performed. Defaults to None. img (Optional[str]): The image to be processed. Defaults to None. - is_last (bool): Indicates if this is the last task. Defaults to False. - device (str): The device to use for execution. Defaults to "cpu". - device_id (int): The ID of the GPU to use if device is set to "gpu". Defaults to 0. - all_cores (bool): If True, uses all available CPU cores. Defaults to True. """ try: return self.run( @@ -2479,7 +2500,7 @@ class Agent: self, task: Optional[Union[str, Any]] = None, img: Optional[str] = None, - scheduled_run_date: Optional[datetime] = None, + imgs: Optional[List[str]] = None, *args, **kwargs, ) -> Any: @@ -2493,11 +2514,7 @@ class Agent: Args: task (Optional[str], optional): The task to be executed. Defaults to None. img (Optional[str], optional): The image to be processed. Defaults to None. - device (str, optional): The device to use for execution. Defaults to "cpu". - device_id (int, optional): The ID of the GPU to use if device is set to "gpu". Defaults to 0. - all_cores (bool, optional): If True, uses all available CPU cores. Defaults to True. - scheduled_run_date (Optional[datetime], optional): The date and time to schedule the task. Defaults to None. - do_not_use_cluster_ops (bool, optional): If True, does not use cluster ops. Defaults to False. + imgs (Optional[List[str]], optional): The list of images to be processed. Defaults to None. *args: Additional positional arguments to be passed to the execution method. **kwargs: Additional keyword arguments to be passed to the execution method. @@ -2510,21 +2527,20 @@ class Agent: """ if not isinstance(task, str): - task = any_to_str(task) - - if scheduled_run_date: - while datetime.now() < scheduled_run_date: - time.sleep( - 1 - ) # Sleep for a short period to avoid busy waiting + task = format_data_structure(task) try: - output = self._run( - task=task, - img=img, - *args, - **kwargs, - ) + if exists(imgs): + output = self.run_multiple_images( + task=task, imgs=imgs, *args, **kwargs + ) + else: + output = self._run( + task=task, + img=img, + *args, + **kwargs, + ) return output @@ -2781,7 +2797,7 @@ class Agent: ) # tool_response = format_data_structure(tool_response) - print(f"Multiple MCP Tool Response: {tool_response}") + # print(f"Multiple MCP Tool Response: {tool_response}") else: raise AgentMCPConnectionError( "mcp_url must be either a string URL or MCPConnection object" @@ -2888,3 +2904,58 @@ class Agent: def list_output_types(self): return OutputType + + def run_multiple_images( + self, task: str, imgs: List[str], *args, **kwargs + ): + """ + Run the agent with multiple images. + + Args: + task (str): The task to be performed on each image. + imgs (List[str]): List of image paths or URLs to process. + *args: Additional positional arguments to pass to the agent's run method. + **kwargs: Additional keyword arguments to pass to the agent's run method. + + Returns: + List[Any]: A list of outputs generated for each image in the same order as the input images. + + Examples: + >>> agent = Agent() + >>> outputs = agent.run_multiple_images( + ... task="Describe what you see in this image", + ... imgs=["image1.jpg", "image2.png", "image3.jpeg"] + ... ) + >>> print(f"Processed {len(outputs)} images") + Processed 3 images + + Raises: + Exception: If an error occurs while processing any of the images. + """ + outputs = [] + for img in imgs: + output = self.run(task=task, img=img, *args, **kwargs) + outputs.append(output) + + # Combine the outputs into a single string + if self.summarize_multiple_images is True: + output = "\n".join(outputs) + + prompt = f""" + You have already analyzed {len(outputs)} images and provided detailed descriptions for each one. + Now, based on your previous analysis of these images, create a comprehensive report that: + + 1. Synthesizes the key findings across all images + 2. Identifies common themes, patterns, or relationships between the images + 3. Provides an overall summary that captures the most important insights + 4. Highlights any notable differences or contrasts between the images + + Here are your previous analyses of the images: + {output} + + Please create a well-structured report that brings together your insights from all {len(outputs)} images. + """ + + outputs = self.run(task=prompt, *args, **kwargs) + + return outputs diff --git a/swarms/tools/base_tool.py b/swarms/tools/base_tool.py index f8662fa2..806ee3d1 100644 --- a/swarms/tools/base_tool.py +++ b/swarms/tools/base_tool.py @@ -2258,14 +2258,14 @@ class BaseTool(BaseModel): except json.JSONDecodeError as e: self._log_if_verbose( "error", - f"Failed to parse JSON from API response: {e}. Response: '{api_response[:100]}...'" + f"Failed to parse JSON from API response: {e}. Response: '{api_response[:100]}...'", ) return [] if not isinstance(api_response, dict): self._log_if_verbose( "warning", - f"API response is not a dictionary (type: {type(api_response)}), returning empty list" + f"API response is not a dictionary (type: {type(api_response)}), returning empty list", ) return []