From 1e5f2a288e29e1d19ddb2c20550120aaf2b816c0 Mon Sep 17 00:00:00 2001
From: Pavan Kumar <66913595+ascender1729@users.noreply.github.com>
Date: Wed, 4 Jun 2025 17:26:31 +0000
Subject: [PATCH] Update news aggregator summarizer to summarize all headlines

---
 examples/news_aggregator_summarizer.py | 76 ++++++++++++++++++++------
 1 file changed, 59 insertions(+), 17 deletions(-)

diff --git a/examples/news_aggregator_summarizer.py b/examples/news_aggregator_summarizer.py
index 864e7481..ce55e956 100644
--- a/examples/news_aggregator_summarizer.py
+++ b/examples/news_aggregator_summarizer.py
@@ -1,33 +1,74 @@
-import requests
-from bs4 import BeautifulSoup
+"""Simple news aggregator and summarizer example.
+
+This script fetches the top Hacker News headlines and generates a short
+summary for each article. Results are printed to the console
+and also written to ``news_summaries.txt``.
+"""
+
+import httpx
+import re
+from html.parser import HTMLParser
 from swarms import Agent
 from swarms.prompts.summaries_prompts import SUMMARIZE_PROMPT
 
 
 def fetch_hackernews_headlines(limit: int = 5):
-    """Fetch top headlines from Hacker News."""
-    resp = requests.get("https://news.ycombinator.com")
-    resp.raise_for_status()
-    soup = BeautifulSoup(resp.text, "html.parser")
+    """Fetch top headlines from Hacker News using its public API."""
+    try:
+        ids = httpx.get(
+            "https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10
+        ).json()
+    except Exception:
+        return []
     headlines = []
-    for item in soup.select("tr.athing")[:limit]:
-        link = item.select_one("span.titleline a")
-        if link:
-            headlines.append({"title": link.get_text(), "url": link["href"]})
+    for story_id in ids[:limit]:
+        try:
+            item = httpx.get(
+                f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json",
+                timeout=10,
+            ).json()
+        except Exception:
+            continue
+        headlines.append({"title": item.get("title", "No title"), "url": item.get("url", "")})
     return headlines
 
 
+class _ParagraphExtractor(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.in_p = False
+        self.text_parts = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "p":
+            self.in_p = True
+
+    def handle_endtag(self, tag):
+        if tag == "p":
+            self.in_p = False
+
+    def handle_data(self, data):
+        if self.in_p:
+            self.text_parts.append(data.strip())
+
+
+def _extract_paragraph_text(html: str) -> str:
+    parser = _ParagraphExtractor()
+    parser.feed(html)
+    parser.close()
+    return " ".join(t for t in parser.text_parts if t)
+
+
 def fetch_article_content(url: str) -> str:
-    """Pull text content from an article URL."""
+    """Retrieve article content from a URL using httpx."""
     try:
-        res = requests.get(url, timeout=10)
+        res = httpx.get(url, timeout=10)
         res.raise_for_status()
     except Exception:
         return ""
-    soup = BeautifulSoup(res.text, "html.parser")
-    for tag in soup(["script", "style", "nav", "header", "footer"]):
-        tag.decompose()
-    text = " ".join(p.get_text() for p in soup.find_all("p"))
+    text = _extract_paragraph_text(res.text)
+    if not text:
+        text = re.sub("<[^>]+>", " ", res.text)
     return text.strip()
 
 
@@ -36,6 +77,7 @@ summarizer = Agent(
     system_prompt="You summarize news articles succinctly.",
     max_loops=1,
     model_name="gpt-4o-mini",
+    output_type="final",
 )
 
 
@@ -51,7 +93,7 @@ if __name__ == "__main__":
         print(f"{idx}. {headline['title']}")
 
     summaries = []
-    for article in headlines[:2]:
+    for article in headlines:
         content = fetch_article_content(article["url"])
         summary = summarize_article(content)
         summaries.append({"title": article["title"], "summary": summary})