"""Simple news aggregator and summarizer example.

This script fetches the top Hacker News headlines and generates short
summaries for the first two articles. Results are printed to the console
and also written to ``news_summaries.txt``.
"""

import re
from html.parser import HTMLParser
from typing import Dict, List

import httpx

from swarms import Agent
from swarms.prompts.summaries_prompts import SUMMARIZE_PROMPT

HN_API_BASE = "https://hacker-news.firebaseio.com/v0"
REQUEST_TIMEOUT = 10  # passed as httpx ``timeout`` (seconds)
SUMMARY_LIMIT = 2  # the module docstring promises summaries for two articles
OUTPUT_PATH = "news_summaries.txt"

# Errors treated as "this fetch failed, carry on": transport/status errors,
# malformed URLs, and undecodable JSON (``JSONDecodeError`` is a ``ValueError``).
_FETCH_ERRORS = (httpx.HTTPError, httpx.InvalidURL, ValueError)

# Crude tag stripper used only when no ``<p>`` text could be extracted.
_TAG_RE = re.compile(r"<[^>]+>")


def fetch_hackernews_headlines(limit: int = 5) -> List[Dict[str, str]]:
    """Fetch top headlines from Hacker News using its public API.

    Args:
        limit: Maximum number of top stories to look up. Values below
            zero are treated as zero.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts, in the order the
        API lists the stories. Stories whose lookup fails, or whose payload
        is not a JSON object, are skipped. An empty list is returned when
        the story index itself cannot be fetched.
    """
    try:
        res = httpx.get(f"{HN_API_BASE}/topstories.json", timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        ids = res.json()
    except _FETCH_ERRORS:
        return []
    # The endpoint may answer with JSON ``null`` or another non-list value.
    if not isinstance(ids, list):
        return []

    headlines = []
    for story_id in ids[: max(limit, 0)]:
        try:
            res = httpx.get(
                f"{HN_API_BASE}/item/{story_id}.json",
                timeout=REQUEST_TIMEOUT,
            )
            res.raise_for_status()
            item = res.json()
        except _FETCH_ERRORS:
            continue
        # A ``null`` payload decodes to ``None``; calling ``.get`` on it
        # would raise ``AttributeError``.
        if not isinstance(item, dict):
            continue
        headlines.append(
            {
                "title": item.get("title") or "No title",
                "url": item.get("url") or "",
            }
        )
    return headlines


class _ParagraphExtractor(HTMLParser):
    """Collect the text that appears inside ``<p>`` elements."""

    def __init__(self):
        super().__init__()
        self.in_p = False  # True while the parser is between <p> and </p>
        self.text_parts = []  # stripped text chunks, in document order

    def handle_starttag(self, tag, attrs):
        if tag == "p":
            self.in_p = True

    def handle_endtag(self, tag):
        if tag == "p":
            self.in_p = False

    def handle_data(self, data):
        if self.in_p:
            self.text_parts.append(data.strip())


def _extract_paragraph_text(html: str) -> str:
    """Return the text of all ``<p>`` elements in *html*, space-joined."""
    parser = _ParagraphExtractor()
    parser.feed(html)
    parser.close()
    # Whitespace-only chunks were stripped to "" and are dropped here.
    return " ".join(t for t in parser.text_parts if t)


def fetch_article_content(url: str) -> str:
    """Retrieve article content from a URL using httpx.

    Args:
        url: Address of the article. May be empty, in which case no
            request is made.

    Returns:
        The paragraph text of the page, or the page with all tags stripped
        when it contains no ``<p>`` text. Returns ``""`` when *url* is
        empty or the request fails.
    """
    if not url:
        return ""
    try:
        # httpx does not follow redirects unless asked, and
        # ``raise_for_status`` rejects 3xx responses, so without this
        # every redirected article would come back empty.
        res = httpx.get(url, timeout=REQUEST_TIMEOUT, follow_redirects=True)
        res.raise_for_status()
    except _FETCH_ERRORS:
        return ""
    text = _extract_paragraph_text(res.text)
    if not text:
        text = _TAG_RE.sub(" ", res.text)
    return text.strip()


summarizer = Agent(
    agent_name="News-Summarizer",
    system_prompt="You summarize news articles succinctly.",
    max_loops=1,
    model_name="gpt-4o-mini",
    output_type="final",
)


def summarize_article(text: str) -> str:
    """Run the module-level ``summarizer`` agent on *text*.

    The prompt sent to the agent is ``SUMMARIZE_PROMPT``, a blank line,
    then *text*. The agent's result is returned unchanged.
    """
    prompt = f"{SUMMARIZE_PROMPT}\n\n{text}"
    return summarizer.run(prompt)


def main() -> None:
    """Print the top headlines, summarize the first few, and save them."""
    headlines = fetch_hackernews_headlines()
    print("Top Headlines:\n")
    if not headlines:
        print("No headlines could be retrieved.")
        return
    for idx, headline in enumerate(headlines, 1):
        print(f"{idx}. {headline['title']}")

    summaries = []
    for article in headlines[:SUMMARY_LIMIT]:
        content = fetch_article_content(article["url"])
        if content:
            summary = summarize_article(content)
        else:
            # Do not ask the model to summarize nothing.
            summary = "Article content could not be retrieved."
        summaries.append({"title": article["title"], "summary": summary})

    print("\nArticle Summaries:\n")
    for s in summaries:
        print(f"{s['title']}\n{s['summary']}\n")

    # Explicit encoding: titles and summaries may contain non-ASCII text.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        f.writelines(f"{s['title']}\n{s['summary']}\n\n" for s in summaries)


if __name__ == "__main__":
    main()