Update news aggregator summarizer to summarize all headlines

pull/863/head
Pavan Kumar 4 days ago
parent 95d3672ce2
commit 1e5f2a288e

@ -1,33 +1,74 @@
"""Simple news aggregator and summarizer example.

This script fetches the top Hacker News headlines and generates a short
summary for each article. Results are printed to the console and also
written to ``news_summaries.txt``.
"""

import re
from html.parser import HTMLParser

import httpx

from swarms import Agent
from swarms.prompts.summaries_prompts import SUMMARIZE_PROMPT
def fetch_hackernews_headlines(limit: int = 5) -> list[dict[str, str]]:
    """Fetch top headlines from Hacker News using its public API.

    Args:
        limit: Maximum number of stories to return.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts. An empty list is
        returned if the story-id listing cannot be fetched; individual
        story failures are skipped.
    """
    try:
        ids = httpx.get(
            "https://hacker-news.firebaseio.com/v0/topstories.json", timeout=10
        ).json()
    except Exception:
        # Best effort: connectivity problems simply yield no headlines.
        return []

    headlines = []
    for story_id in ids[:limit]:
        try:
            item = httpx.get(
                f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json",
                timeout=10,
            ).json()
        except Exception:
            # Skip stories that fail to load rather than aborting the run.
            continue
        # The API returns null for deleted/dead items; guard before .get().
        if not isinstance(item, dict):
            continue
        headlines.append(
            {"title": item.get("title", "No title"), "url": item.get("url", "")}
        )
    return headlines
class _ParagraphExtractor(HTMLParser):
def __init__(self):
super().__init__()
self.in_p = False
self.text_parts = []
def handle_starttag(self, tag, attrs):
if tag == "p":
self.in_p = True
def handle_endtag(self, tag):
if tag == "p":
self.in_p = False
def handle_data(self, data):
if self.in_p:
self.text_parts.append(data.strip())
def _extract_paragraph_text(html: str) -> str:
    """Return the space-joined text of all ``<p>`` elements in *html*."""
    extractor = _ParagraphExtractor()
    extractor.feed(html)
    extractor.close()
    fragments = [part for part in extractor.text_parts if part]
    return " ".join(fragments)
def fetch_article_content(url: str) -> str:
    """Retrieve readable article text from a URL using httpx.

    Args:
        url: Link to the article; may be empty (e.g. "Ask HN" self-posts
            have no external URL).

    Returns:
        Paragraph text extracted from the page, falling back to a crude
        tag-stripped rendering of the raw HTML when no ``<p>`` text is
        found. Empty string on an empty ``url`` or any download failure.
    """
    # Self-posts carry no external URL; nothing to fetch.
    if not url:
        return ""
    try:
        # httpx does not follow redirects by default (unlike requests, which
        # the pre-migration code used) — enable it so redirecting article
        # links still resolve instead of failing raise_for_status().
        res = httpx.get(url, timeout=10, follow_redirects=True)
        res.raise_for_status()
    except Exception:
        # Best effort: an unreachable article simply produces no content.
        return ""
    text = _extract_paragraph_text(res.text)
    if not text:
        # Fallback: strip all tags from the raw HTML.
        text = re.sub(r"<[^>]+>", " ", res.text)
    return text.strip()
@ -36,6 +77,7 @@ summarizer = Agent(
system_prompt="You summarize news articles succinctly.",
max_loops=1,
model_name="gpt-4o-mini",
output_type="final",
)
@ -51,7 +93,7 @@ if __name__ == "__main__":
print(f"{idx}. {headline['title']}")
summaries = []
for article in headlines[:2]:
for article in headlines:
content = fetch_article_content(article["url"])
summary = summarize_article(content)
summaries.append({"title": article["title"], "summary": summary})

Loading…
Cancel
Save