Update news aggregator summarizer to summarize all headlines

pull/863/head
Pavan Kumar 4 days ago
parent 95d3672ce2
commit 1e5f2a288e

@ -1,33 +1,74 @@
"""Simple news aggregator and summarizer example.

This script fetches the top Hacker News headlines and generates a short
summary for each article. Results are printed to the console and also
written to ``news_summaries.txt``.
"""

import re
from html.parser import HTMLParser

import httpx

from swarms import Agent
from swarms.prompts.summaries_prompts import SUMMARIZE_PROMPT
def fetch_hackernews_headlines(limit: int = 5) -> list[dict[str, str]]:
    """Fetch top headlines from Hacker News using its public API.

    Args:
        limit: Maximum number of stories to return.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts. An empty list is
        returned if the story-id listing cannot be fetched; individual
        story failures are skipped.
    """
    try:
        ids = httpx.get(
            "https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10
        ).json()
    except Exception:
        # Best effort: connectivity problems simply yield no headlines.
        return []

    headlines = []
    for story_id in ids[:limit]:
        try:
            item = httpx.get(
                f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json",
                timeout=10,
            ).json()
        except Exception:
            # Skip stories that fail to load rather than aborting the run.
            continue
        # The API returns null for deleted/dead items; guard before .get().
        if not isinstance(item, dict):
            continue
        headlines.append(
            {"title": item.get("title", "No title"), "url": item.get("url", "")}
        )
    return headlines
class _ParagraphExtractor(HTMLParser):
def __init__(self):
super().__init__()
self.in_p = False
self.text_parts = []
def handle_starttag(self, tag, attrs):
if tag == "p":
self.in_p = True
def handle_endtag(self, tag):
if tag == "p":
self.in_p = False
def handle_data(self, data):
if self.in_p:
self.text_parts.append(data.strip())
def _extract_paragraph_text(html: str) -> str:
    """Return the space-joined text of all ``<p>`` elements in *html*."""
    extractor = _ParagraphExtractor()
    extractor.feed(html)
    extractor.close()
    fragments = [part for part in extractor.text_parts if part]
    return " ".join(fragments)
def fetch_article_content(url: str) -> str:
    """Retrieve readable article text from a URL using httpx.

    Args:
        url: Link to the article; may be empty (e.g. "Ask HN" self-posts
            have no external URL).

    Returns:
        Paragraph text extracted from the page, falling back to a crude
        tag-stripped rendering of the raw HTML when no ``<p>`` text is
        found. Empty string on an empty ``url`` or any download failure.
    """
    # Self-posts carry no external URL; nothing to fetch.
    if not url:
        return ""
    try:
        # httpx does not follow redirects by default (unlike requests, which
        # the pre-migration code used) — enable it so redirecting article
        # links still resolve instead of failing raise_for_status().
        res = httpx.get(url, timeout=10, follow_redirects=True)
        res.raise_for_status()
    except Exception:
        # Best effort: an unreachable article simply produces no content.
        return ""
    text = _extract_paragraph_text(res.text)
    if not text:
        # Fallback: strip all tags from the raw HTML.
        text = re.sub(r"<[^>]+>", " ", res.text)
    return text.strip()
@ -36,6 +77,7 @@ summarizer = Agent(
system_prompt="You summarize news articles succinctly.",
max_loops=1,
model_name="gpt-4o-mini",
output_type="final",
)
@ -51,7 +93,7 @@ if __name__ == "__main__":
print(f"{idx}. {headline['title']}")
summaries = []
for article in headlines[:2]:
for article in headlines:
content = fetch_article_content(article["url"])
summary = summarize_article(content)
summaries.append({"title": article["title"], "summary": summary})

Loading…
Cancel
Save