From 1e5f2a288e29e1d19ddb2c20550120aaf2b816c0 Mon Sep 17 00:00:00 2001
From: Pavan Kumar <66913595+ascender1729@users.noreply.github.com>
Date: Wed, 4 Jun 2025 17:26:31 +0000
Subject: [PATCH] Update news aggregator summarizer to summarize all headlines

---
 examples/news_aggregator_summarizer.py | 76 ++++++++++++++++++++------
 1 file changed, 59 insertions(+), 17 deletions(-)

diff --git a/examples/news_aggregator_summarizer.py b/examples/news_aggregator_summarizer.py
index 864e7481..ce55e956 100644
--- a/examples/news_aggregator_summarizer.py
+++ b/examples/news_aggregator_summarizer.py
@@ -1,33 +1,74 @@
-import requests
-from bs4 import BeautifulSoup
+"""Simple news aggregator and summarizer example.
+
+This script fetches the top Hacker News headlines and generates a short
+summary for each article. Results are printed to the console
+and also written to ``news_summaries.txt``.
+"""
+
+import httpx
+import re
+from html.parser import HTMLParser
 from swarms import Agent
 from swarms.prompts.summaries_prompts import SUMMARIZE_PROMPT
 
 
 def fetch_hackernews_headlines(limit: int = 5):
-    """Fetch top headlines from Hacker News."""
-    resp = requests.get("https://news.ycombinator.com")
-    resp.raise_for_status()
-    soup = BeautifulSoup(resp.text, "html.parser")
+    """Fetch top headlines from Hacker News using its public API."""
+    try:
+        ids = httpx.get(
+            "https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10
+        ).json()
+    except Exception:
+        return []
     headlines = []
-    for item in soup.select("tr.athing")[:limit]:
-        link = item.select_one("span.titleline a")
-        if link:
-            headlines.append({"title": link.get_text(), "url": link["href"]})
+    for story_id in ids[:limit]:
+        try:
+            item = httpx.get(
+                f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json",
+                timeout=10,
+            ).json()
+        except Exception:
+            continue
+        headlines.append({"title": item.get("title", "No title"), "url": item.get("url", "")})
     return headlines
 
 
+class _ParagraphExtractor(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.in_p = False
+        self.text_parts = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "p":
+            self.in_p = True
+
+    def handle_endtag(self, tag):
+        if tag == "p":
+            self.in_p = False
+
+    def handle_data(self, data):
+        if self.in_p:
+            self.text_parts.append(data.strip())
+
+
+def _extract_paragraph_text(html: str) -> str:
+    parser = _ParagraphExtractor()
+    parser.feed(html)
+    parser.close()
+    return " ".join(t for t in parser.text_parts if t)
+
+
 def fetch_article_content(url: str) -> str:
-    """Pull text content from an article URL."""
+    """Retrieve article content from a URL using httpx."""
     try:
-        res = requests.get(url, timeout=10)
+        res = httpx.get(url, timeout=10)
         res.raise_for_status()
     except Exception:
         return ""
-    soup = BeautifulSoup(res.text, "html.parser")
-    for tag in soup(["script", "style", "nav", "header", "footer"]):
-        tag.decompose()
-    text = " ".join(p.get_text() for p in soup.find_all("p"))
+    text = _extract_paragraph_text(res.text)
+    if not text:
+        text = re.sub("<[^>]+>", " ", res.text)
     return text.strip()
 
 
@@ -36,6 +77,7 @@ summarizer = Agent(
     system_prompt="You summarize news articles succinctly.",
     max_loops=1,
     model_name="gpt-4o-mini",
+    output_type="final",
 )
 
 
@@ -51,7 +93,7 @@ if __name__ == "__main__":
         print(f"{idx}. {headline['title']}")
 
     summaries = []
-    for article in headlines[:2]:
+    for article in headlines:
         content = fetch_article_content(article["url"])
         summary = summarize_article(content)
         summaries.append({"title": article["title"], "summary": summary})