update to swarms spec

pull/1000/head
Filip Michalsky 1 month ago
parent 2d7dfca4a4
commit 4ec84a9289

@ -19,7 +19,6 @@ from dotenv import load_dotenv
from loguru import logger from loguru import logger
from swarms import Agent from swarms import Agent
from swarms.tools.base_tool import BaseTool
from stagehand import Stagehand, StagehandConfig from stagehand import Stagehand, StagehandConfig
load_dotenv() load_dotenv()
@ -81,222 +80,283 @@ class BrowserState:
browser_state = BrowserState() browser_state = BrowserState()
class NavigateTool(BaseTool): def navigate_browser(url: str) -> str:
"""Tool for navigating to URLs in the browser.""" """
Navigate to a URL in the browser.
def __init__(self):
super().__init__( Args:
name="navigate_browser", url (str): The URL to navigate to. Should be a valid URL starting with http:// or https://.
description="Navigate to a URL in the browser. Input should be a valid URL starting with http:// or https://", If no protocol is provided, https:// will be added automatically.
verbose=True,
) Returns:
str: Success message with the URL navigated to, or error message if navigation fails
def run(self, url: str) -> str:
"""Navigate to the specified URL.""" Raises:
return asyncio.run(self._async_run(url)) RuntimeError: If browser initialization fails
Exception: If navigation to the URL fails
async def _async_run(self, url: str) -> str:
try: Example:
await browser_state.init_browser() >>> result = navigate_browser("https://example.com")
page = await browser_state.get_page() >>> print(result)
"Successfully navigated to https://example.com"
# Ensure URL has protocol
if not url.startswith(("http://", "https://")): >>> result = navigate_browser("google.com")
url = f"https://{url}" >>> print(result)
"Successfully navigated to https://google.com"
await page.goto(url) """
return f"Successfully navigated to {url}" return asyncio.run(_navigate_browser_async(url))
except Exception as e:
logger.error(f"Navigation error: {str(e)}")
return f"Failed to navigate to {url}: {str(e)}" async def _navigate_browser_async(url: str) -> str:
"""Async implementation of navigate_browser."""
try:
class ActTool(BaseTool): await browser_state.init_browser()
"""Tool for performing actions on web pages.""" page = await browser_state.get_page()
def __init__(self): # Ensure URL has protocol
super().__init__( if not url.startswith(("http://", "https://")):
name="browser_act", url = f"https://{url}"
description=(
"Perform an action on the current web page using natural language. " await page.goto(url)
"Examples: 'click the submit button', 'type hello@example.com in the email field', " return f"Successfully navigated to {url}"
"'scroll down', 'press Enter'" except Exception as e:
), logger.error(f"Navigation error: {str(e)}")
verbose=True, return f"Failed to navigate to {url}: {str(e)}"
)
def run(self, action: str) -> str: def browser_act(action: str) -> str:
"""Perform the specified action.""" """
return asyncio.run(self._async_run(action)) Perform an action on the current web page using natural language.
async def _async_run(self, action: str) -> str: Args:
try: action (str): Natural language description of the action to perform.
await browser_state.init_browser() Examples: 'click the submit button', 'type hello@example.com in the email field',
page = await browser_state.get_page() 'scroll down', 'press Enter', 'select option from dropdown'
result = await page.act(action) Returns:
return f"Action performed: {action}. Result: {result}" str: JSON formatted string with action result and status information
except Exception as e:
logger.error(f"Action error: {str(e)}") Raises:
return f"Failed to perform action '{action}': {str(e)}" RuntimeError: If browser is not initialized or page is not available
Exception: If the action cannot be performed on the current page
class ExtractTool(BaseTool): Example:
"""Tool for extracting data from web pages.""" >>> result = browser_act("click the submit button")
>>> print(result)
def __init__(self): "Action performed: click the submit button. Result: clicked successfully"
super().__init__(
name="browser_extract", >>> result = browser_act("type hello@example.com in the email field")
description=( >>> print(result)
"Extract information from the current web page using natural language. " "Action performed: type hello@example.com in the email field. Result: text entered"
"Examples: 'extract all email addresses', 'get the main article text', " """
"'find all product prices', 'extract the page title and meta description'" return asyncio.run(_browser_act_async(action))
),
verbose=True,
) async def _browser_act_async(action: str) -> str:
"""Async implementation of browser_act."""
def run(self, query: str) -> str: try:
"""Extract information based on the query.""" await browser_state.init_browser()
return asyncio.run(self._async_run(query)) page = await browser_state.get_page()
async def _async_run(self, query: str) -> str: result = await page.act(action)
try: return f"Action performed: {action}. Result: {result}"
await browser_state.init_browser() except Exception as e:
page = await browser_state.get_page() logger.error(f"Action error: {str(e)}")
return f"Failed to perform action '{action}': {str(e)}"
extracted = await page.extract(query)
# Convert to JSON string for agent consumption def browser_extract(query: str) -> str:
if isinstance(extracted, (dict, list)): """
return json.dumps(extracted, indent=2) Extract information from the current web page using natural language.
else:
return str(extracted) Args:
except Exception as e: query (str): Natural language description of what information to extract.
logger.error(f"Extraction error: {str(e)}") Examples: 'extract all email addresses', 'get the main article text',
return f"Failed to extract '{query}': {str(e)}" 'find all product prices', 'extract the page title and meta description'
Returns:
class ObserveTool(BaseTool): str: JSON formatted string containing the extracted information, or error message if extraction fails
"""Tool for observing elements on web pages."""
Raises:
def __init__(self): RuntimeError: If browser is not initialized or page is not available
super().__init__( Exception: If extraction fails due to page content or parsing issues
name="browser_observe",
description=( Example:
"Observe and find elements on the current web page using natural language. " >>> result = browser_extract("extract all email addresses")
"Returns information about elements including their selectors. " >>> print(result)
"Examples: 'find the search box', 'locate the submit button', " '["contact@example.com", "support@example.com"]'
"'find all navigation links'"
), >>> result = browser_extract("get the main article text")
verbose=True, >>> print(result)
) '{"title": "Article Title", "content": "Article content..."}'
"""
def run(self, query: str) -> str: return asyncio.run(_browser_extract_async(query))
"""Observe elements based on the query."""
return asyncio.run(self._async_run(query))
async def _browser_extract_async(query: str) -> str:
async def _async_run(self, query: str) -> str: """Async implementation of browser_extract."""
try: try:
await browser_state.init_browser() await browser_state.init_browser()
page = await browser_state.get_page() page = await browser_state.get_page()
observations = await page.observe(query) extracted = await page.extract(query)
# Format observations for readability # Convert to JSON string for agent consumption
result = [] if isinstance(extracted, (dict, list)):
for obs in observations: return json.dumps(extracted, indent=2)
result.append( else:
{ return str(extracted)
"description": obs.description, except Exception as e:
"selector": obs.selector, logger.error(f"Extraction error: {str(e)}")
"method": obs.method, return f"Failed to extract '{query}': {str(e)}"
}
)
def browser_observe(query: str) -> str:
return json.dumps(result, indent=2) """
except Exception as e: Observe and find elements on the current web page using natural language.
logger.error(f"Observation error: {str(e)}")
return f"Failed to observe '{query}': {str(e)}" Args:
query (str): Natural language description of elements to find.
Examples: 'find the search box', 'locate the submit button',
class ScreenshotTool(BaseTool): 'find all navigation links', 'observe form elements'
"""Tool for taking screenshots of the current page."""
Returns:
def __init__(self): str: JSON formatted string containing information about found elements including
super().__init__( their descriptions, selectors, and interaction methods
name="browser_screenshot",
description="Take a screenshot of the current web page. Optionally provide a filename.", Raises:
verbose=True, RuntimeError: If browser is not initialized or page is not available
) Exception: If observation fails due to page structure or element detection issues
def run(self, filename: str = "screenshot.png") -> str: Example:
"""Take a screenshot.""" >>> result = browser_observe("find the search box")
return asyncio.run(self._async_run(filename)) >>> print(result)
'[{"description": "Search input field", "selector": "#search", "method": "input"}]'
async def _async_run(self, filename: str) -> str:
try: >>> result = browser_observe("locate the submit button")
await browser_state.init_browser() >>> print(result)
page = await browser_state.get_page() '[{"description": "Submit button", "selector": "button[type=submit]", "method": "click"}]'
"""
# Ensure .png extension return asyncio.run(_browser_observe_async(query))
if not filename.endswith(".png"):
filename += ".png"
async def _browser_observe_async(query: str) -> str:
# Get the underlying Playwright page """Async implementation of browser_observe."""
playwright_page = page.page try:
await playwright_page.screenshot(path=filename) await browser_state.init_browser()
page = await browser_state.get_page()
return f"Screenshot saved to {filename}"
except Exception as e: observations = await page.observe(query)
logger.error(f"Screenshot error: {str(e)}")
return f"Failed to take screenshot: {str(e)}" # Format observations for readability
result = []
for obs in observations:
class CloseBrowserTool(BaseTool): result.append(
"""Tool for closing the browser.""" {
"description": obs.description,
def __init__(self): "selector": obs.selector,
super().__init__( "method": obs.method,
name="close_browser", }
description="Close the browser when done with automation tasks", )
verbose=True,
)
def run(self, *args) -> str:
"""Close the browser."""
return asyncio.run(self._async_run())
async def _async_run(self) -> str: return json.dumps(result, indent=2)
try: except Exception as e:
await browser_state.close() logger.error(f"Observation error: {str(e)}")
return "Browser closed successfully" return f"Failed to observe '{query}': {str(e)}"
except Exception as e:
logger.error(f"Close browser error: {str(e)}")
return f"Failed to close browser: {str(e)}" def browser_screenshot(filename: str = "screenshot.png") -> str:
"""
Take a screenshot of the current web page.
Args:
filename (str, optional): The filename to save the screenshot to.
Defaults to "screenshot.png".
.png extension will be added automatically if not provided.
Returns:
str: Success message with the filename where screenshot was saved,
or error message if screenshot fails
Raises:
RuntimeError: If browser is not initialized or page is not available
Exception: If screenshot capture or file saving fails
Example:
>>> result = browser_screenshot()
>>> print(result)
"Screenshot saved to screenshot.png"
>>> result = browser_screenshot("page_capture.png")
>>> print(result)
"Screenshot saved to page_capture.png"
"""
return asyncio.run(_browser_screenshot_async(filename))
async def _browser_screenshot_async(filename: str) -> str:
    """
    Coroutine behind browser_screenshot: save the current page as a PNG.

    Args:
        filename (str): Destination path for the image. ".png" is appended
            when the name does not already end with it.

    Returns:
        str: "Screenshot saved to <filename>" on success, otherwise
            "Failed to take screenshot: <error>". Any exception is logged
            and reported through the return value instead of propagating.
    """
    try:
        await browser_state.init_browser()
        current_page = await browser_state.get_page()

        # Normalise the target so it always carries a .png suffix.
        target = filename if filename.endswith(".png") else f"{filename}.png"

        # The screenshot call lives on the underlying Playwright page.
        await current_page.page.screenshot(path=target)
    except Exception as exc:
        logger.error(f"Screenshot error: {str(exc)}")
        return f"Failed to take screenshot: {str(exc)}"
    return f"Screenshot saved to {target}"
def close_browser() -> str:
    """
    Close the browser when done with automation tasks.

    Synchronous wrapper that runs the _close_browser_async coroutine to
    completion with asyncio.run and hands back its status message.

    Returns:
        str: "Browser closed successfully" when the browser state closes
            cleanly, or "Failed to close browser: <error>" when closing
            fails. Close errors are reported through this return value,
            not raised.

    Raises:
        RuntimeError: Raised by asyncio.run if this function is called
            from a thread that already has a running event loop.

    Example:
        >>> close_browser()
        'Browser closed successfully'
    """
    return asyncio.run(_close_browser_async())
async def _close_browser_async() -> str:
    """
    Coroutine behind close_browser: close the module-level browser state.

    Returns:
        str: "Browser closed successfully" when browser_state.close()
            completes, otherwise "Failed to close browser: <error>". The
            exception is logged and not re-raised.
    """
    try:
        await browser_state.close()
    except Exception as exc:
        logger.error(f"Close browser error: {str(exc)}")
        return f"Failed to close browser: {str(exc)}"
    else:
        return "Browser closed successfully"
# Example usage # Example usage
if __name__ == "__main__": if __name__ == "__main__":
# Create browser automation tools
navigate_tool = NavigateTool()
act_tool = ActTool()
extract_tool = ExtractTool()
observe_tool = ObserveTool()
screenshot_tool = ScreenshotTool()
close_browser_tool = CloseBrowserTool()
# Create a Swarms agent with browser tools # Create a Swarms agent with browser tools
browser_agent = Agent( browser_agent = Agent(
agent_name="BrowserAutomationAgent", agent_name="BrowserAutomationAgent",
model_name="gpt-4o-mini", model_name="gpt-4o-mini",
max_loops=1, max_loops=1,
tools=[ tools=[
navigate_tool, navigate_browser,
act_tool, browser_act,
extract_tool, browser_extract,
observe_tool, browser_observe,
screenshot_tool, browser_screenshot,
close_browser_tool, close_browser,
], ],
system_prompt="""You are a web browser automation specialist. You can: system_prompt="""You are a web browser automation specialist. You can:
1. Navigate to websites using the navigate_browser tool 1. Navigate to websites using the navigate_browser tool

@ -55,12 +55,10 @@ class TestStagehandIntegrationStructure:
content = f.read() content = f.read()
# Check for required imports # Check for required imports
assert ( assert "from swarms import Agent" in content
"from swarms.tools.base_tool import BaseTool" in content assert "def navigate_browser" in content
) assert "def browser_act" in content
assert "class NavigateTool" in content assert "def browser_extract" in content
assert "class ActTool" in content
assert "class ExtractTool" in content
def test_mcp_agent_imports(self): def test_mcp_agent_imports(self):
"""Test that MCP agent has correct imports.""" """Test that MCP agent has correct imports."""
@ -194,7 +192,7 @@ class TestSwarmsPatternsCompliance:
assert "return" in content assert "return" in content
def test_tools_pattern(self): def test_tools_pattern(self):
"""Test that tools follow Swarms BaseTool pattern.""" """Test that tools follow Swarms function-based pattern."""
# Read the tools agent file # Read the tools agent file
with open( with open(
@ -202,11 +200,11 @@ class TestSwarmsPatternsCompliance:
) as f: ) as f:
content = f.read() content = f.read()
# Check tool pattern # Check function-based tool pattern
assert "class NavigateTool(BaseTool):" in content assert "def navigate_browser(url: str) -> str:" in content
assert "def run(self," in content assert "def browser_act(action: str) -> str:" in content
assert "name=" in content assert "def browser_extract(query: str) -> str:" in content
assert "description=" in content assert "def browser_observe(query: str) -> str:" in content
def test_mcp_integration_pattern(self): def test_mcp_integration_pattern(self):
"""Test MCP integration follows Swarms pattern.""" """Test MCP integration follows Swarms pattern."""

Loading…
Cancel
Save