parent 12109e3857
commit b584f4d4f5
@@ -0,0 +1,258 @@
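"""Search helper: fetch Google Custom Search results, scrape each result page
with Playwright, and summarize the extracted content with Gemini."""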
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import json
import os
from typing import List, Dict, Optional
from dotenv import load_dotenv
import google.generativeai as genai
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
import html2text
from concurrent.futures import ThreadPoolExecutor, as_completed
from playwright.sync_api import sync_playwright
import time
from tenacity import retry, stop_after_attempt, wait_exponential

console = Console()
load_dotenv()
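# Expected .env entries (names taken from the os.getenv() calls below; values are placeholders):
#   GOOGLE_API_KEY=...   # Google Custom Search API key
#   GOOGLE_CX=...        # Programmable Search Engine ID, passed as "cx"
#   GEMINI_API_KEY=...   # Gemini API key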

class WebsiteChecker:
    def __init__(self):
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        self.google_cx = os.getenv("GOOGLE_CX")
        self.gemini_api_key = os.getenv("GEMINI_API_KEY")
        self.outputs_dir = "outputs"
        os.makedirs(self.outputs_dir, exist_ok=True)

        # Initialize html2text
        self.html_converter = html2text.HTML2Text()
        self.html_converter.ignore_links = True
        self.html_converter.ignore_images = True
        self.html_converter.ignore_emphasis = True

        # Configure retry settings
        self.max_retries = 3
        self.max_threads = 10  # Concurrent threads
        self.timeout = 15  # Seconds

    async def fetch_search_results(self, query: str) -> List[Dict]:
        """Fetch top 10 search results using Google Custom Search API"""
        async with aiohttp.ClientSession() as session:
            url = "https://www.googleapis.com/customsearch/v1"
            params = {
                "key": self.google_api_key,
                "cx": self.google_cx,
                "q": query,
                "num": 10  # Fetch top 10 results
            }

            try:
                async with session.get(url, params=params) as response:
                    if response.status == 200:
                        data = await response.json()
                        results = []
                        for item in data.get("items", []):
                            # Skip direct links to documents (.pdf, .doc, .docx)
                            if "link" in item and not any(x in item["link"].lower() for x in [".pdf", ".doc", ".docx"]):
                                results.append({
                                    "title": item.get("title", ""),
                                    "link": item["link"],
                                    "snippet": item.get("snippet", "")
                                })
                        return results[:10]  # Ensure we only take top 10
                    else:
                        console.print(f"[red]Error: {response.status} - {await response.text()}[/red]")
                        return []
            except Exception as e:
                console.print(f"[red]Error fetching search results: {str(e)}[/red]")
                return []

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def extract_content_with_retry(self, url: str) -> Optional[Dict]:
        """Extract content from a URL with retry mechanism"""
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                context = browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )

                page = context.new_page()
                page.set_default_timeout(25000)  # 25 second default timeout

                page.goto(url)
                page.wait_for_load_state('networkidle', timeout=20000)

                # Extract content
                content = page.content()
                soup = BeautifulSoup(content, 'lxml')

                # Clean up content
                for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                    element.decompose()

                # Get main content
                main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': ['content', 'main']})
                if not main_content:
                    main_content = soup.find('body')

                # Convert to markdown-like text
                clean_text = self.html_converter.handle(str(main_content))

                browser.close()

                return {
                    "url": url,
                    "title": soup.title.string if soup.title else "No title",
                    "content": clean_text.strip()
                }

        except Exception as e:
            console.print(f"[yellow]Warning: Failed to extract from {url}: {str(e)}[/yellow]")
            return None

    def process_url(self, url: str) -> Optional[Dict]:
        """Process a single URL with progress tracking"""
        try:
            return self.extract_content_with_retry(url)
        except Exception as e:
            console.print(f"[red]Failed to process {url}: {str(e)}[/red]")
            return None

    async def process_urls_concurrent(self, urls: List[str]) -> List[Dict]:
        """Process multiple URLs concurrently using ThreadPoolExecutor"""
        successful_results = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TimeRemainingColumn(),
        ) as progress:
            task = progress.add_task("Processing websites...", total=len(urls))

            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
                future_to_url = {executor.submit(self.process_url, url): url for url in urls}

                for future in as_completed(future_to_url):
                    url = future_to_url[future]
                    try:
                        result = future.result()
                        if result:
                            successful_results.append(result)
                    except Exception as e:
                        console.print(f"[red]Error processing {url}: {str(e)}[/red]")
                    finally:
                        progress.advance(task)

        return successful_results

    async def summarize_with_gemini(self, extracted_data: List[Dict], query: str) -> str:
        """Generate summary using Gemini API"""
        genai.configure(api_key=self.gemini_api_key)

        # Format content for summarization; limit each source to 2000 characters
        formatted_content = "# Source Materials:\n\n"
        for i, item in enumerate(extracted_data, 1):
            formatted_content += f"""
### Source {i}: {item['title']}
URL: {item['url']}

{item['content'][:2000]}

---
"""

        prompt = f"""
Analyze and summarize the following content about: "{query}"

Create a detailed summary with these sections:
1. Key Findings (2-3 paragraphs)
2. Important Details (bullet points)
3. Sources (numbered list)

Focus on accuracy, clarity, and completeness.
Present conflicting information if found.
Use proper markdown formatting.

Content to analyze:
{formatted_content}
"""

        model = genai.GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            generation_config={
                "temperature": 0.7,
                "top_p": 0.8,
                "top_k": 40,
                "max_output_tokens": 4096,
            }
        )

        response = await asyncio.to_thread(
            lambda: model.generate_content(prompt).text
        )

        return response

    async def search(self, query: str) -> str:
        """Main search function with timing"""
        start_time = time.time()

        console.print(f"\n[bold cyan]Searching for: {query}[/bold cyan]\n")

        # Fetch search results
        search_results = await self.fetch_search_results(query)
        if not search_results:
            return "No search results found."

        # Extract URLs
        urls = [result["link"] for result in search_results]

        # Process URLs concurrently
        extracted_data = await self.process_urls_concurrent(urls)

        # Generate summary
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
            task = progress.add_task("[cyan]Generating summary...", total=None)
            summary = await self.summarize_with_gemini(extracted_data, query)
            progress.update(task, completed=True)

        # Save results
        results = {
            "query": query,
            "search_results": search_results,
            "extracted_data": extracted_data,
            "summary": summary
        }

        with open(os.path.join(self.outputs_dir, "search_results.json"), "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        end_time = time.time()
        execution_time = end_time - start_time

        # Print results
        console.print("\n[bold green]====== Search Summary ======[/bold green]\n")
        console.print(summary)
        console.print("\n[bold green]========================[/bold green]")
        console.print(f"\n[bold cyan]Execution time: {execution_time:.2f} seconds[/bold cyan]\n")

        return summary

def search(query: str) -> str:
    """Synchronous wrapper for the async search function"""
    checker = WebsiteChecker()
    return asyncio.run(checker.search(query))


# search_tool_schema = functions_to_openai_tools([search])
# # tools = functions_to_openai_tools([search, get_weather])

# # Print the generated schemas
# print(json.dumps(tools, indent=2))
# if __name__ == "__main__":
#     query = input("Enter your search query: ")
#     result = search(query)

# search("who won elections 2024 us")