parent 12109e3857
commit b584f4d4f5
@@ -0,0 +1,258 @@
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import json
import os
from typing import List, Dict, Optional
from dotenv import load_dotenv
import google.generativeai as genai
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
import html2text
from concurrent.futures import ThreadPoolExecutor, as_completed
from playwright.sync_api import sync_playwright
import time
from tenacity import retry, stop_after_attempt, wait_exponential

console = Console()
load_dotenv()
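
# A minimal sketch of the .env file this script assumes. The variable names come
# from the os.getenv() calls in WebsiteChecker.__init__ below; the values shown
# are placeholders, not real credentials:
#
#   GOOGLE_API_KEY=<Google Custom Search API key>
#   GOOGLE_CX=<Programmable Search Engine ID, used as the "cx" parameter>
#   GEMINI_API_KEY=<Gemini API key>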


class WebsiteChecker:
    def __init__(self):
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        self.google_cx = os.getenv("GOOGLE_CX")
        self.gemini_api_key = os.getenv("GEMINI_API_KEY")
        self.outputs_dir = "outputs"
        os.makedirs(self.outputs_dir, exist_ok=True)

        # Initialize html2text
        self.html_converter = html2text.HTML2Text()
        self.html_converter.ignore_links = True
        self.html_converter.ignore_images = True
        self.html_converter.ignore_emphasis = True

        # Configure retry settings
        self.max_retries = 3
        self.max_threads = 10  # Concurrent threads
        self.timeout = 15  # Seconds

    async def fetch_search_results(self, query: str) -> List[Dict]:
        """Fetch top 10 search results using Google Custom Search API"""
        async with aiohttp.ClientSession() as session:
            url = "https://www.googleapis.com/customsearch/v1"
            params = {
                "key": self.google_api_key,
                "cx": self.google_cx,
                "q": query,
                "num": 10  # Fetch top 10 results
            }

            try:
                async with session.get(url, params=params) as response:
                    if response.status == 200:
                        data = await response.json()
                        results = []
                        for item in data.get("items", []):
                            if "link" in item and not any(x in item["link"].lower() for x in [".pdf", ".doc", ".docx"]):
                                results.append({
                                    "title": item.get("title", ""),
                                    "link": item["link"],
                                    "snippet": item.get("snippet", "")
                                })
                        return results[:10]  # Ensure we only take top 10
                    else:
                        console.print(f"[red]Error: {response.status} - {await response.text()}[/red]")
                        return []
            except Exception as e:
                console.print(f"[red]Error fetching search results: {str(e)}[/red]")
                return []
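
    # Each search result dict has the shape {"title": ..., "link": ..., "snippet": ...};
    # links pointing to .pdf/.doc/.docx files are filtered out before extraction.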

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def extract_content_with_retry(self, url: str) -> Optional[Dict]:
        """Extract content from a URL with retry mechanism"""
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                context = browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                )

                page = context.new_page()
                page.set_default_timeout(25000)  # 25 second default timeout

                page.goto(url)
                page.wait_for_load_state('networkidle', timeout=20000)

                # Extract content
                content = page.content()
                soup = BeautifulSoup(content, 'lxml')

                # Clean up content
                for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                    element.decompose()

                # Get main content
                main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': ['content', 'main']})
                if not main_content:
                    main_content = soup.find('body')

                # Convert to markdown-like text
                clean_text = self.html_converter.handle(str(main_content))

                browser.close()

                return {
                    "url": url,
                    "title": soup.title.string if soup.title else "No title",
                    "content": clean_text.strip()
                }

        except Exception as e:
            console.print(f"[yellow]Warning: Failed to extract from {url}: {str(e)}[/yellow]")
            return None

    def process_url(self, url: str) -> Optional[Dict]:
        """Process a single URL with progress tracking"""
        try:
            return self.extract_content_with_retry(url)
        except Exception as e:
            console.print(f"[red]Failed to process {url}: {str(e)}[/red]")
            return None

    async def process_urls_concurrent(self, urls: List[str]) -> List[Dict]:
        """Process multiple URLs concurrently using ThreadPoolExecutor"""
        successful_results = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TimeRemainingColumn(),
        ) as progress:
            task = progress.add_task("Processing websites...", total=len(urls))

            with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
                future_to_url = {executor.submit(self.process_url, url): url for url in urls}

                for future in as_completed(future_to_url):
                    url = future_to_url[future]
                    try:
                        result = future.result()
                        if result:
                            successful_results.append(result)
                    except Exception as e:
                        console.print(f"[red]Error processing {url}: {str(e)}[/red]")
                    finally:
                        progress.advance(task)

        return successful_results

    async def summarize_with_gemini(self, extracted_data: List[Dict], query: str) -> str:
        """Generate summary using Gemini API"""
        genai.configure(api_key=self.gemini_api_key)

        # Format content for summarization, truncating each source to 2000 characters
        formatted_content = "# Source Materials:\n\n"
        for i, item in enumerate(extracted_data, 1):
            formatted_content += f"""
### Source {i}: {item['title']}
URL: {item['url']}

{item['content'][:2000]}

---
"""

        prompt = f"""
Analyze and summarize the following content about: "{query}"

Create a detailed summary with these sections:
1. Key Findings (2-3 paragraphs)
2. Important Details (bullet points)
3. Sources (numbered list)

Focus on accuracy, clarity, and completeness.
Present conflicting information if found.
Use proper markdown formatting.

Content to analyze:
{formatted_content}
"""

        model = genai.GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            generation_config={
                "temperature": 0.7,
                "top_p": 0.8,
                "top_k": 40,
                "max_output_tokens": 4096,
            }
        )

        response = await asyncio.to_thread(
            lambda: model.generate_content(prompt).text
        )

        return response

    async def search(self, query: str) -> str:
        """Main search function with timing"""
        start_time = time.time()

        console.print(f"\n[bold cyan]Searching for: {query}[/bold cyan]\n")

        # Fetch search results
        search_results = await self.fetch_search_results(query)
        if not search_results:
            return "No search results found."

        # Extract URLs
        urls = [result["link"] for result in search_results]

        # Process URLs concurrently
        extracted_data = await self.process_urls_concurrent(urls)

        # Generate summary
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}")) as progress:
            task = progress.add_task("[cyan]Generating summary...", total=None)
            summary = await self.summarize_with_gemini(extracted_data, query)
            progress.update(task, completed=True)

        # Save results
        results = {
            "query": query,
            "search_results": search_results,
            "extracted_data": extracted_data,
            "summary": summary
        }

        with open(os.path.join(self.outputs_dir, "search_results.json"), "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        end_time = time.time()
        execution_time = end_time - start_time

        # Print results
        console.print("\n[bold green]====== Search Summary ======[/bold green]\n")
        console.print(summary)
        console.print("\n[bold green]========================[/bold green]")
        console.print(f"\n[bold cyan]Execution time: {execution_time:.2f} seconds[/bold cyan]\n")

        return summary


def search(query: str) -> str:
    """Synchronous wrapper for the async search function"""
    checker = WebsiteChecker()
    return asyncio.run(checker.search(query))
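

# Example usage (a minimal sketch; left commented out, like the examples below,
# because it issues real Google Custom Search and Gemini API calls and needs the
# .env values described at the top of this file; the query string is illustrative):
#
#   print(search("example query"))                       # sync entry point
#   # await WebsiteChecker().search("example query")     # from async code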

# search_tool_schema = functions_to_openai_tools([search])
# # tools = functions_to_openai_tools([search, get_weather])

# # Print the generated schemas
# print(json.dumps(tools, indent=2))

# if __name__ == "__main__":
#     query = input("Enter your search query: ")
#     result = search(query)

# search("who won elections 2024 us")