@@ -1,33 +1,74 @@
-import requests
-from bs4 import BeautifulSoup
+"""Simple news aggregator and summarizer example.
+
+This script fetches the top Hacker News headlines and generates short
+summaries for each fetched article. Results are printed to the console
+and also written to ``news_summaries.txt``.
+"""
+
+import httpx
+import re
+from html.parser import HTMLParser
 
 from swarms import Agent
 from swarms.prompts.summaries_prompts import SUMMARIZE_PROMPT
 
 
 def fetch_hackernews_headlines(limit: int = 5):
-    """Fetch top headlines from Hacker News."""
-    resp = requests.get("https://news.ycombinator.com")
-    resp.raise_for_status()
-    soup = BeautifulSoup(resp.text, "html.parser")
+    """Fetch top headlines from Hacker News using its public API."""
+    try:
+        ids = httpx.get(
+            "https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10
+        ).json()
+    except Exception:
+        return []
     headlines = []
-    for item in soup.select("tr.athing")[:limit]:
-        link = item.select_one("span.titleline a")
-        if link:
-            headlines.append({"title": link.get_text(), "url": link["href"]})
+    for story_id in ids[:limit]:
+        try:
+            item = httpx.get(
+                f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json",
+                timeout=10,
+            ).json()
+        except Exception:
+            continue
+        headlines.append({"title": item.get("title", "No title"), "url": item.get("url", "")})
     return headlines
 
 
+class _ParagraphExtractor(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.in_p = False
+        self.text_parts = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "p":
+            self.in_p = True
+
+    def handle_endtag(self, tag):
+        if tag == "p":
+            self.in_p = False
+
+    def handle_data(self, data):
+        if self.in_p:
+            self.text_parts.append(data.strip())
+
+
+def _extract_paragraph_text(html: str) -> str:
+    parser = _ParagraphExtractor()
+    parser.feed(html)
+    parser.close()
+    return " ".join(t for t in parser.text_parts if t)
+
+
 def fetch_article_content(url: str) -> str:
-    """Pull text content from an article URL."""
+    """Retrieve article content from a URL using httpx."""
     try:
-        res = requests.get(url, timeout=10)
+        res = httpx.get(url, timeout=10)
         res.raise_for_status()
     except Exception:
         return ""
-    soup = BeautifulSoup(res.text, "html.parser")
-    for tag in soup(["script", "style", "nav", "header", "footer"]):
-        tag.decompose()
-    text = " ".join(p.get_text() for p in soup.find_all("p"))
+    text = _extract_paragraph_text(res.text)
+    if not text:
+        # Fall back to stripping all tags when no <p> text was found.
+        text = re.sub("<[^>]+>", " ", res.text)
     return text.strip()
@@ -36,6 +77,7 @@ summarizer = Agent(
     system_prompt="You summarize news articles succinctly.",
     max_loops=1,
     model_name="gpt-4o-mini",
+    output_type="final",
 )
@@ -51,7 +93,7 @@ if __name__ == "__main__":
         print(f"{idx}. {headline['title']}")
 
     summaries = []
-    for article in headlines[:2]:
+    for article in headlines:
         content = fetch_article_content(article["url"])
         summary = summarize_article(content)
         summaries.append({"title": article["title"], "summary": summary})