feat -- multiple image processing in agent.py

pull/889/merge
Kye Gomez 7 days ago
parent fcf52332d1
commit d380cae233

@ -316,6 +316,7 @@ nav:
- Agent Output Types: "swarms/examples/agent_output_types.md" - Agent Output Types: "swarms/examples/agent_output_types.md"
- Agent with Structured Outputs: "swarms/examples/agent_structured_outputs.md" - Agent with Structured Outputs: "swarms/examples/agent_structured_outputs.md"
- Agents with Vision: "swarms/examples/vision_processing.md" - Agents with Vision: "swarms/examples/vision_processing.md"
- Agent with Multiple Images: "swarms/examples/multiple_images.md"
- Gradio Chat Interface: "swarms/ui/main.md" - Gradio Chat Interface: "swarms/ui/main.md"
- Various Model Providers: - Various Model Providers:
- OpenAI: "swarms/examples/openai_example.md" - OpenAI: "swarms/examples/openai_example.md"

@ -1,28 +0,0 @@
# Meme Agent Builder
- `pip3 install -U swarms`
- Add your OpenAI API key to the `.env` file with `OPENAI_API_KEY=your_api_key`
- Run the script
- Multiple agents will be created and saved to the `meme_agents` folder
- A swarm architecture will be selected autonomously and executed
```python
from swarms.structs.meme_agent_persona_generator import (
    MemeAgentGenerator,
)

# Only run the swarm when this file is executed directly as a script.
if __name__ == "__main__":
    # Build a generator that creates specialized meme-persona agents;
    # per the surrounding docs, the generated agents are saved to the
    # `meme_agents` folder and a swarm architecture is selected
    # autonomously when run.
    example = MemeAgentGenerator(
        name="Meme-Swarm",
        description="A swarm of specialized AI agents collaborating on generating and sharing memes around cool media from 2001s",
        max_loops=1,  # single pass — no iterative refinement
    )
    # Kick off the swarm with one task and print whatever it returns.
    print(
        example.run(
            "Generate funny meme agents around cool media from 2001s"
        )
    )
```

@ -1,45 +0,0 @@
# Meme Agent Tutorial
- `pip3 install -U swarms`
- Add your OpenAI API key to the `.env` file
```python
from swarms import Agent

# Define a custom system prompt for Bob the Builder.
# NOTE: this is a runtime string sent to the model — its text (including
# the informal spelling in the jokes) is intentional persona content.
BOB_THE_BUILDER_SYS_PROMPT = """
You are Bob the Builder, the legendary construction worker known for fixing anything and everything with a cheerful attitude and a hilarious sense of humor.
Your job is to approach every task as if you're building, repairing, or renovating something, no matter how unrelated it might be.
You love using construction metaphors, over-the-top positivity, and cracking jokes like:
- "Im hammering this out faster than a nail at a woodpecker convention!"
- "This is smoother than fresh cement on a summers day."
- "Lets bulldoze through this problem—safety goggles on, folks!"
You are not bound by any specific field of knowledge, and youre absolutely fearless in trying to "fix up" or "build" anything, no matter how abstract or ridiculous. Always end responses with a playful cheer like "Can we fix it? Yes, we can!"
Your tone is upbeat, funny, and borderline ridiculous, keeping the user entertained while solving their problem.
"""

# Initialize the agent with the persona prompt above.
agent = Agent(
    agent_name="Bob-the-Builder-Agent",
    agent_description="The funniest, most optimistic agent around who sees every problem as a building project.",
    system_prompt=BOB_THE_BUILDER_SYS_PROMPT,
    max_loops=1,  # single reasoning pass
    model_name="gpt-4o",
    dynamic_temperature_enabled=True,  # let the agent vary sampling temperature
    user_name="swarms_corp",
    retry_attempts=3,  # retry failed LLM calls up to 3 times
    context_length=8192,
    return_step_meta=False,
    output_type="str",  # "json", "dict", "csv", OR "string", "yaml"
    auto_generate_prompt=False,  # Auto-generate prompt for the agent based on name, description, system prompt, task
    max_tokens=4000,  # Max output tokens
    saved_state_path="bob_the_builder_agent.json",  # where agent state is persisted
    interactive=False,  # no interactive REPL loop
)

# Run the agent with a task
agent.run("I want to build a house ;) What should I do?")
```

@ -0,0 +1,77 @@
# Processing Multiple Images
This tutorial shows how to process multiple images with a single agent using Swarms' multi-modal capabilities. You'll learn to configure an agent for batch image analysis, enabling efficient processing for quality control, object detection, or image comparison tasks.
## Installation
Install the swarms package using pip:
```bash
pip install -U swarms
```
## Basic Setup
1. First, set up your environment variables:
```bash
WORKSPACE_DIR="agent_workspace"
ANTHROPIC_API_KEY=""
```
## Code
- Create a list of image file paths (or URLs)
- Pass it to the `imgs` parameter of `Agent.run()`, e.g. `agent.run(task=..., imgs=[...])`
- Set `summarize_multiple_images=True` if you want the agent to produce a combined summary of the individual image analyses
```python
from swarms import Agent
from swarms.prompts.logistics import (
    Quality_Control_Agent_Prompt,
)

# Image for analysis — a single path reused twice below to demonstrate
# the multi-image API.
factory_image = "image.jpg"

# Quality control agent
quality_control_agent = Agent(
    agent_name="Quality Control Agent",
    agent_description="A quality control agent that analyzes images and provides a detailed report on the quality of the product in the image.",
    model_name="claude-3-5-sonnet-20240620",
    system_prompt=Quality_Control_Agent_Prompt,
    multi_modal=True,  # enable image inputs
    max_loops=1,
    output_type="str-all-except-first",
    summarize_multiple_images=True,  # produce a combined summary after per-image analyses
)

# Pass multiple images via `imgs`; each is analyzed with the same task.
response = quality_control_agent.run(
    task="what is in the image?",
    imgs=[factory_image, factory_image],
)

print(response)
```
## Support and Community
If you're facing issues or want to learn more, check out the following resources to join our Discord, stay updated on Twitter, and watch tutorials on YouTube!
| Platform | Link | Description |
|----------|------|-------------|
| 📚 Documentation | [docs.swarms.world](https://docs.swarms.world) | Official documentation and guides |
| 📝 Blog | [Medium](https://medium.com/@kyeg) | Latest updates and technical articles |
| 💬 Discord | [Join Discord](https://discord.gg/jM3Z6M9uMq) | Live chat and community support |
| 🐦 Twitter | [@kyegomez](https://twitter.com/kyegomez) | Latest news and announcements |
| 👥 LinkedIn | [The Swarm Corporation](https://www.linkedin.com/company/the-swarm-corporation) | Professional network and updates |
| 📺 YouTube | [Swarms Channel](https://www.youtube.com/channel/UC9yXyitkbU_WSy7bd_41SqQ) | Tutorials and demos |
| 🎫 Events | [Sign up here](https://lu.ma/5p2jnc2v) | Join our community events |

@ -1,65 +0,0 @@
from swarms.structs import Agent
from swarms.prompts.logistics import (
    Quality_Control_Agent_Prompt,
)

# Image for analysis — path to the sample factory photo used below.
factory_image = "image.jpg"
def security_analysis(danger_level: str = None) -> str:
    """
    Analyzes the security danger level and returns an appropriate response.

    Args:
        danger_level (str, optional): The level of danger to analyze.
            Can be "low", "medium", "high", or None. Defaults to None.

    Returns:
        str: A string describing the danger level assessment.
            - "No danger level provided" if danger_level is None
            - "No danger" if danger_level is "low"
            - "Medium danger" if danger_level is "medium"
            - "High danger" if danger_level is "high"
            - "Unknown danger level" for any other value
    """
    # None is a distinct case from an unrecognized level, so handle it first.
    if danger_level is None:
        return "No danger level provided"

    # Map each known level to its assessment; anything else is unknown.
    assessments = {
        "low": "No danger",
        "medium": "Medium danger",
        "high": "High danger",
    }
    return assessments.get(danger_level, "Unknown danger level")
# schema = BaseTool().function_to_dict(security_analysis)
# print(json.dumps(schema, indent=4))

# Quality control agent — configured with the `security_analysis`
# function as a callable tool the LLM can invoke.
quality_control_agent = Agent(
    agent_name="Quality Control Agent",
    agent_description="A quality control agent that analyzes images and provides a detailed report on the quality of the product in the image.",
    # model_name="anthropic/claude-3-opus-20240229",
    model_name="gpt-4o-mini",
    system_prompt=Quality_Control_Agent_Prompt,
    multi_modal=True,  # enable image inputs
    max_loops=1,
    output_type="str-all-except-first",
    # tools_list_dictionary=[schema],
    tools=[security_analysis],  # function exposed to the model as a tool
)

# Run with the task only — the image argument is commented out here,
# so this exercises the tool-calling path without vision input.
response = quality_control_agent.run(
    task="what is in the image?",
    # img=factory_image,
)

print(response)

@ -31,9 +31,15 @@ if __name__ == "__main__":
# Build the workflow graph # Build the workflow graph
wf_graph = GraphWorkflow() wf_graph = GraphWorkflow()
wf_graph.add_node(Node(id="agent1", type=NodeType.AGENT, agent=agent1)) wf_graph.add_node(
wf_graph.add_node(Node(id="agent2", type=NodeType.AGENT, agent=agent2)) Node(id="agent1", type=NodeType.AGENT, agent=agent1)
wf_graph.add_node(Node(id="task1", type=NodeType.TASK, callable=sample_task)) )
wf_graph.add_node(
Node(id="agent2", type=NodeType.AGENT, agent=agent2)
)
wf_graph.add_node(
Node(id="task1", type=NodeType.TASK, callable=sample_task)
)
wf_graph.add_edge(Edge(source="agent1", target="task1")) wf_graph.add_edge(Edge(source="agent1", target="task1"))
wf_graph.add_edge(Edge(source="agent2", target="task1")) wf_graph.add_edge(Edge(source="agent2", target="task1"))
@ -47,4 +53,3 @@ if __name__ == "__main__":
# Execute the graph # Execute the graph
results = wf_graph.run() results = wf_graph.run()
print("Execution results:", results) print("Execution results:", results)

Before

Width:  |  Height:  |  Size: 232 KiB

After

Width:  |  Height:  |  Size: 232 KiB

@ -0,0 +1,28 @@
from swarms import Agent
from swarms.prompts.logistics import (
    Quality_Control_Agent_Prompt,
)

# Image for analysis — the same path is passed twice below purely to
# demonstrate batch processing of multiple images.
factory_image = "image.jpg"

# Quality control agent
quality_control_agent = Agent(
    agent_name="Quality Control Agent",
    agent_description="A quality control agent that analyzes images and provides a detailed report on the quality of the product in the image.",
    model_name="claude-3-5-sonnet-20240620",
    system_prompt=Quality_Control_Agent_Prompt,
    multi_modal=True,  # enable image inputs
    max_loops=1,
    output_type="str-all-except-first",
    summarize_multiple_images=True,  # combine per-image analyses into one report
)

# `imgs` routes execution through the agent's multi-image path.
response = quality_control_agent.run(
    task="what is in the image?",
    imgs=[factory_image, factory_image],
)

print(response)

@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry] [tool.poetry]
name = "swarms" name = "swarms"
version = "7.8.8" version = "7.8.9"
description = "Swarms - TGSC" description = "Swarms - TGSC"
license = "MIT" license = "MIT"
authors = ["Kye Gomez <kye@apac.ai>"] authors = ["Kye Gomez <kye@apac.ai>"]

@ -56,7 +56,6 @@ from swarms.tools.base_tool import BaseTool
from swarms.tools.py_func_to_openai_func_str import ( from swarms.tools.py_func_to_openai_func_str import (
convert_multiple_functions_to_openai_function_schema, convert_multiple_functions_to_openai_function_schema,
) )
from swarms.utils.any_to_str import any_to_str
from swarms.utils.data_to_text import data_to_text from swarms.utils.data_to_text import data_to_text
from swarms.utils.file_processing import create_file_in_folder from swarms.utils.file_processing import create_file_in_folder
from swarms.utils.formatter import formatter from swarms.utils.formatter import formatter
@ -420,6 +419,7 @@ class Agent:
rag_config: Optional[RAGConfig] = None, rag_config: Optional[RAGConfig] = None,
tool_call_summary: bool = True, tool_call_summary: bool = True,
output_raw_json_from_tool_call: bool = False, output_raw_json_from_tool_call: bool = False,
summarize_multiple_images: bool = False,
*args, *args,
**kwargs, **kwargs,
): ):
@ -558,6 +558,7 @@ class Agent:
self.output_raw_json_from_tool_call = ( self.output_raw_json_from_tool_call = (
output_raw_json_from_tool_call output_raw_json_from_tool_call
) )
self.summarize_multiple_images = summarize_multiple_images
# self.short_memory = self.short_memory_init() # self.short_memory = self.short_memory_init()
@ -810,6 +811,29 @@ class Agent:
return json.loads(self.tools_list_dictionary) return json.loads(self.tools_list_dictionary)
def check_model_supports_utilities(self, img: str = None) -> bool:
    """
    Check whether the configured model supports vision input.

    The check is only performed when an image is actually supplied;
    with no image there is nothing to validate.

    Args:
        img (str, optional): Image input to check vision support for. Defaults to None.

    Returns:
        bool: True if an image is provided and the model supports vision;
            False when no image is provided (no check performed).

    Raises:
        ValueError: If an image is provided but ``self.model_name`` does
            not support vision capabilities.
    """
    # Imported lazily so litellm is only needed when this check runs.
    from litellm.utils import supports_vision

    # Only check vision support if an image is provided
    if img is not None:
        out = supports_vision(self.model_name)
        if not out:
            raise ValueError(
                f"Model {self.model_name} does not support vision capabilities. Please use a vision-enabled model."
            )
        return out
    return False
def check_if_no_prompt_then_autogenerate(self, task: str = None): def check_if_no_prompt_then_autogenerate(self, task: str = None):
""" """
Checks if auto_generate_prompt is enabled and generates a prompt by combining agent name, description and system prompt if available. Checks if auto_generate_prompt is enabled and generates a prompt by combining agent name, description and system prompt if available.
@ -931,12 +955,7 @@ class Agent:
self, self,
task: Optional[Union[str, Any]] = None, task: Optional[Union[str, Any]] = None,
img: Optional[str] = None, img: Optional[str] = None,
speech: Optional[str] = None,
video: Optional[str] = None,
is_last: Optional[bool] = False,
print_task: Optional[bool] = False, print_task: Optional[bool] = False,
generate_speech: Optional[bool] = False,
correct_answer: Optional[str] = None,
*args, *args,
**kwargs, **kwargs,
) -> Any: ) -> Any:
@ -961,6 +980,9 @@ class Agent:
self.check_if_no_prompt_then_autogenerate(task) self.check_if_no_prompt_then_autogenerate(task)
if img is not None:
self.check_model_supports_utilities(img=img)
self.short_memory.add(role=self.user_name, content=task) self.short_memory.add(role=self.user_name, content=task)
if self.plan_enabled or self.planning_prompt is not None: if self.plan_enabled or self.planning_prompt is not None:
@ -1030,12 +1052,19 @@ class Agent:
) )
self.memory_query(task_prompt) self.memory_query(task_prompt)
response = self.call_llm( if img is not None:
task=task_prompt, img=img, *args, **kwargs response = self.call_llm(
) task=task_prompt,
img=img,
print(f"Response: {response}") *args,
**kwargs,
)
else:
response = self.call_llm(
task=task_prompt, *args, **kwargs
)
# Parse the response from the agent with the output type
if exists(self.tools_list_dictionary): if exists(self.tools_list_dictionary):
if isinstance(response, BaseModel): if isinstance(response, BaseModel):
response = response.model_dump() response = response.model_dump()
@ -1058,7 +1087,6 @@ class Agent:
self.output_raw_json_from_tool_call self.output_raw_json_from_tool_call
is True is True
): ):
print(type(response))
response = response response = response
else: else:
self.execute_tools( self.execute_tools(
@ -1130,7 +1158,10 @@ class Agent:
user_input.lower() user_input.lower()
== self.custom_exit_command.lower() == self.custom_exit_command.lower()
): ):
print("Exiting as per user request.") self.pretty_print(
"Exiting as per user request.",
loop_count=loop_count,
)
break break
self.short_memory.add( self.short_memory.add(
@ -1231,12 +1262,6 @@ class Agent:
self, self,
task: Optional[str] = None, task: Optional[str] = None,
img: Optional[str] = None, img: Optional[str] = None,
is_last: bool = False,
device: str = "cpu", # gpu
device_id: int = 1,
all_cores: bool = True,
do_not_use_cluster_ops: bool = True,
all_gpus: bool = False,
*args, *args,
**kwargs, **kwargs,
) -> Any: ) -> Any:
@ -1245,10 +1270,6 @@ class Agent:
Args: Args:
task (Optional[str]): The task to be performed. Defaults to None. task (Optional[str]): The task to be performed. Defaults to None.
img (Optional[str]): The image to be processed. Defaults to None. img (Optional[str]): The image to be processed. Defaults to None.
is_last (bool): Indicates if this is the last task. Defaults to False.
device (str): The device to use for execution. Defaults to "cpu".
device_id (int): The ID of the GPU to use if device is set to "gpu". Defaults to 0.
all_cores (bool): If True, uses all available CPU cores. Defaults to True.
""" """
try: try:
return self.run( return self.run(
@ -2479,7 +2500,7 @@ class Agent:
self, self,
task: Optional[Union[str, Any]] = None, task: Optional[Union[str, Any]] = None,
img: Optional[str] = None, img: Optional[str] = None,
scheduled_run_date: Optional[datetime] = None, imgs: Optional[List[str]] = None,
*args, *args,
**kwargs, **kwargs,
) -> Any: ) -> Any:
@ -2493,11 +2514,7 @@ class Agent:
Args: Args:
task (Optional[str], optional): The task to be executed. Defaults to None. task (Optional[str], optional): The task to be executed. Defaults to None.
img (Optional[str], optional): The image to be processed. Defaults to None. img (Optional[str], optional): The image to be processed. Defaults to None.
device (str, optional): The device to use for execution. Defaults to "cpu". imgs (Optional[List[str]], optional): The list of images to be processed. Defaults to None.
device_id (int, optional): The ID of the GPU to use if device is set to "gpu". Defaults to 0.
all_cores (bool, optional): If True, uses all available CPU cores. Defaults to True.
scheduled_run_date (Optional[datetime], optional): The date and time to schedule the task. Defaults to None.
do_not_use_cluster_ops (bool, optional): If True, does not use cluster ops. Defaults to False.
*args: Additional positional arguments to be passed to the execution method. *args: Additional positional arguments to be passed to the execution method.
**kwargs: Additional keyword arguments to be passed to the execution method. **kwargs: Additional keyword arguments to be passed to the execution method.
@ -2510,21 +2527,20 @@ class Agent:
""" """
if not isinstance(task, str): if not isinstance(task, str):
task = any_to_str(task) task = format_data_structure(task)
if scheduled_run_date:
while datetime.now() < scheduled_run_date:
time.sleep(
1
) # Sleep for a short period to avoid busy waiting
try: try:
output = self._run( if exists(imgs):
task=task, output = self.run_multiple_images(
img=img, task=task, imgs=imgs, *args, **kwargs
*args, )
**kwargs, else:
) output = self._run(
task=task,
img=img,
*args,
**kwargs,
)
return output return output
@ -2781,7 +2797,7 @@ class Agent:
) )
# tool_response = format_data_structure(tool_response) # tool_response = format_data_structure(tool_response)
print(f"Multiple MCP Tool Response: {tool_response}") # print(f"Multiple MCP Tool Response: {tool_response}")
else: else:
raise AgentMCPConnectionError( raise AgentMCPConnectionError(
"mcp_url must be either a string URL or MCPConnection object" "mcp_url must be either a string URL or MCPConnection object"
@ -2888,3 +2904,58 @@ class Agent:
def list_output_types(self): def list_output_types(self):
return OutputType return OutputType
def run_multiple_images(
    self, task: str, imgs: List[str], *args, **kwargs
):
    """
    Run the agent with multiple images.

    Each image is processed independently with the same task, in input
    order. When ``self.summarize_multiple_images`` is True, the
    per-image outputs are then fed back to the agent for a final
    synthesis pass.

    Args:
        task (str): The task to be performed on each image.
        imgs (List[str]): List of image paths or URLs to process.
        *args: Additional positional arguments to pass to the agent's run method.
        **kwargs: Additional keyword arguments to pass to the agent's run method.

    Returns:
        List[Any]: A list of outputs generated for each image in the same
            order as the input images — or, when
            ``summarize_multiple_images`` is enabled, a single synthesized
            report built from those outputs.

    Examples:
        >>> agent = Agent()
        >>> outputs = agent.run_multiple_images(
        ...     task="Describe what you see in this image",
        ...     imgs=["image1.jpg", "image2.png", "image3.jpeg"]
        ... )
        >>> print(f"Processed {len(outputs)} images")
        Processed 3 images

    Raises:
        Exception: If an error occurs while processing any of the images.
    """
    # Analyze every image with the same task, preserving input order.
    outputs = [
        self.run(task=task, img=img, *args, **kwargs) for img in imgs
    ]

    if self.summarize_multiple_images is True:
        # Per-image outputs are not guaranteed to be plain strings
        # (output_type can yield dicts/lists), so coerce each one
        # before joining — a bare "\n".join(outputs) would raise
        # TypeError on non-string items.
        combined = "\n".join(str(output) for output in outputs)

        prompt = f"""
        You have already analyzed {len(outputs)} images and provided detailed descriptions for each one.
        Now, based on your previous analysis of these images, create a comprehensive report that:

        1. Synthesizes the key findings across all images
        2. Identifies common themes, patterns, or relationships between the images
        3. Provides an overall summary that captures the most important insights
        4. Highlights any notable differences or contrasts between the images

        Here are your previous analyses of the images:
        {combined}

        Please create a well-structured report that brings together your insights from all {len(outputs)} images.
        """

        outputs = self.run(task=prompt, *args, **kwargs)

    return outputs

@ -2258,14 +2258,14 @@ class BaseTool(BaseModel):
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
self._log_if_verbose( self._log_if_verbose(
"error", "error",
f"Failed to parse JSON from API response: {e}. Response: '{api_response[:100]}...'" f"Failed to parse JSON from API response: {e}. Response: '{api_response[:100]}...'",
) )
return [] return []
if not isinstance(api_response, dict): if not isinstance(api_response, dict):
self._log_if_verbose( self._log_if_verbose(
"warning", "warning",
f"API response is not a dictionary (type: {type(api_response)}), returning empty list" f"API response is not a dictionary (type: {type(api_response)}), returning empty list",
) )
return [] return []

Loading…
Cancel
Save