Merge pull request #863 from ascender1729/news_aggregator_summarizer

Solve issue #862: News Aggregator Summarizer
6 months ago · 47890b8ac0
parent c634ef72d5 1e5f2a288e
commit 47890b8ac0
1 changed file with 107 additions and 0 deletions
--- a/examples/news_aggregator_summarizer.py
+++ b/examples/news_aggregator_summarizer.py
@@ -0,0 +1,107 @@
 """Simple news aggregator and summarizer example.
 This script fetches the top Hacker News headlines and generates a short
 summary for each fetched article.  Results are printed to the console
 and also written to ``news_summaries.txt``.
 """
 import httpx
 import re
 from html.parser import HTMLParser
 from swarms import Agent
 from swarms.prompts.summaries_prompts import SUMMARIZE_PROMPT
 def fetch_hackernews_headlines(limit: int = 5):
    """Fetch top headlines from Hacker News using its public API.

    Args:
        limit: Maximum number of top-story ids to look up.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts, one per story that
        could be retrieved.  ``title`` falls back to ``"No title"`` and
        ``url`` to an empty string when the item does not provide them.
        An empty list is returned when the list of story ids cannot be
        fetched or is not a JSON array.
    """
    base_url = "https://hacker-news.firebaseio.com/v0"
    try:
        response = httpx.get(f"{base_url}/topstories.json", timeout=10)
        response.raise_for_status()
        ids = response.json()
    except (httpx.HTTPError, ValueError):
        # Transport failure, error status, or a body that is not valid JSON.
        return []
    if not isinstance(ids, list):
        # Anything other than a JSON array (e.g. ``null``) cannot be sliced.
        return []
    headlines = []
    for story_id in ids[:limit]:
        try:
            response = httpx.get(
                f"{base_url}/item/{story_id}.json",
                timeout=10,
            )
            response.raise_for_status()
            item = response.json()
        except (httpx.HTTPError, ValueError):
            # Best effort: one failing story must not discard the others.
            continue
        if not isinstance(item, dict):
            # A JSON ``null`` body decodes to ``None``; calling ``.get`` on
            # it would raise, so skip such entries.
            continue
        headlines.append(
            {
                "title": item.get("title") or "No title",
                "url": item.get("url") or "",
            }
        )
    return headlines
 class _ParagraphExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.in_p = False
        self.text_parts = []
    def handle_starttag(self, tag, attrs):
        if tag == "p":
            self.in_p = True
    def handle_endtag(self, tag):
        if tag == "p":
            self.in_p = False
    def handle_data(self, data):
        if self.in_p:
            self.text_parts.append(data.strip())
 def _extract_paragraph_text(html: str) -> str:
    """Return the non-empty ``<p>`` text fragments of ``html`` joined by spaces."""
    extractor = _ParagraphExtractor()
    extractor.feed(html)
    # close() flushes any text still buffered by the parser.
    extractor.close()
    fragments = filter(None, extractor.text_parts)
    return " ".join(fragments)
 def fetch_article_content(url: str) -> str:
    """Retrieve article content from a URL using httpx.

    Args:
        url: Address of the article.  An empty value yields an empty result
            without issuing a request.

    Returns:
        The text of the page's ``<p>`` elements; if none is found, the whole
        response body with tags replaced by spaces.  An empty string is
        returned when the page cannot be fetched.
    """
    if not url:
        # Nothing to fetch, so skip the request entirely.
        return ""
    try:
        # httpx does not follow redirects unless asked to; without this a
        # redirecting URL would fail the status check below.
        res = httpx.get(url, timeout=10, follow_redirects=True)
        res.raise_for_status()
    except Exception:
        # Deliberately broad: the URL is arbitrary external input and this
        # helper is best effort, so any failure means "no content".
        return ""
    text = _extract_paragraph_text(res.text)
    if not text:
        # No paragraph text found: fall back to crude tag stripping.
        text = re.sub(r"<[^>]+>", " ", res.text)
    return text.strip()
 # Module-level agent used by ``summarize_article``; it is created once when
 # the module is loaded.
 summarizer = Agent(
    model_name="gpt-4o-mini",
    agent_name="News-Summarizer",
    system_prompt="You summarize news articles succinctly.",
    output_type="final",
    max_loops=1,
 )
 def summarize_article(text: str) -> str:
    """Summarize article text with the module-level ``summarizer`` agent.

    Args:
        text: Article content to summarize.

    Returns:
        The agent's output for ``SUMMARIZE_PROMPT`` followed by ``text``, or
        a fixed placeholder when ``text`` is empty or only whitespace.
    """
    if not text or not text.strip():
        # With no article content the model could only invent a summary, so
        # skip the call and say so explicitly.
        return "No content available to summarize."
    prompt = f"{SUMMARIZE_PROMPT}\n\n{text}"
    return summarizer.run(prompt)
 def main() -> None:
    """Print the top headlines, summarize each article and save the results.

    Summaries are printed to the console and written to
    ``news_summaries.txt``.  When no headlines could be fetched, a message is
    printed and the output file is left untouched.
    """
    headlines = fetch_hackernews_headlines()
    if not headlines:
        print("No headlines could be fetched.")
        return
    print("Top Headlines:\n")
    for idx, headline in enumerate(headlines, 1):
        print(f"{idx}. {headline['title']}")
    summaries = []
    for article in headlines:
        content = fetch_article_content(article["url"])
        summary = summarize_article(content)
        summaries.append({"title": article["title"], "summary": summary})
    print("\nArticle Summaries:\n")
    for s in summaries:
        print(f"{s['title']}\n{s['summary']}\n")
    # Explicit UTF-8: titles and summaries may contain characters that the
    # platform's default encoding cannot represent.
    with open("news_summaries.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{s['title']}\n{s['summary']}\n\n" for s in summaries)
 if __name__ == "__main__":
    main()