"""
|
|
|
|
Paper Swarm
|
|
|
|
1. Scrape https://huggingface.co/papers for all papers, by search for all links on the paper with a /papers/, then clicks, gets the header, and then the abstract.
|
|
|
|
and various links and then adds them to a txt file for each paper on https://huggingface.co/papers
|
|
|
|
|
|
|
|
2. Feed prompts iteratively into Anthropic for summarizations + value score on impact, reliability, and novel, and other paper ranking mechanisms
|
|
|
|
|
|
|
|
3. Store papers in a database with metadata. Agents can use retrieval
|
|
|
|
|
|
|
|
4. Discord Bot // Twitter Bot
|
|
|
|
"""

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class Paper:
    def __init__(self, title, date, authors, abstract):
        self.title = title
        self.date = date
        self.authors = authors
        self.abstract = abstract


class Scraper:
    def __init__(self, url):
        self.url = url

    def get_paper_links(self):
        response = requests.get(self.url)
        soup = BeautifulSoup(response.text, "html.parser")
        links = [
            a["href"] for a in soup.find_all("a", href=True) if "/papers/" in a["href"]
        ]
        # A paper may be linked more than once on the listing page, so dedupe
        # while preserving order.
        return list(dict.fromkeys(links))

    def get_paper_details(self, link):
        # Links are site-absolute paths like /papers/<id>; naive concatenation
        # with self.url would yield .../papers/papers/<id>, so join properly.
        response = requests.get(urljoin(self.url, link))
        soup = BeautifulSoup(response.text, "html.parser")
        title_tag = soup.find("h1")
        title = title_tag.text.strip() if title_tag else "Untitled"
        date_tag = soup.find("time")
        date = date_tag.text if date_tag else "Unknown"
        authors = [author.text for author in soup.find_all("span", class_="author")]
        abstract_tag = soup.find("div", class_="abstract")
        abstract = abstract_tag.text.strip() if abstract_tag else "Abstract not found"
        return Paper(title, date, authors, abstract)


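# --- Step 2 sketch: Anthropic summarization + value scoring -----------------
# A minimal sketch, not yet wired into the pipeline below. It assumes the
# official `anthropic` package and an ANTHROPIC_API_KEY in the environment;
# the model name and the 1-10 rubric in the prompt are placeholder choices,
# not settled design decisions.
def summarize_and_score(paper):
    import anthropic  # imported here so the scraper runs without the package

    client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the env
    prompt = (
        "Summarize this paper in three sentences, then rate it 1-10 on "
        "impact, reliability, and novelty, one line per score.\n\n"
        f"Title: {paper.title}\n\nAbstract: {paper.abstract}"
    )
    message = client.messages.create(
        model="claude-3-5-sonnet-20240620",  # placeholder model name
        max_tokens=512,
        messages=[{"role": "user", "content": prompt}],
    )
    return message.content[0].text

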
class FileWriter:
    def __init__(self, directory):
        self.directory = directory
        os.makedirs(self.directory, exist_ok=True)  # create output dir if missing

    def write_paper(self, paper):
        # Paper titles can contain characters that are illegal in filenames
        # (e.g. "/" or ":"), so map anything unsafe to "_".
        safe_title = "".join(
            c if c.isalnum() or c in " ._-" else "_" for c in paper.title
        )
        with open(os.path.join(self.directory, safe_title + ".txt"), "w") as f:
            f.write(f"{paper.title}\n")
            f.write(f"Published on {paper.date}\n")
            f.write("Authors:\n")
            for author in paper.authors:
                f.write(f"{author}\n")
            f.write("Abstract:\n")
            f.write(paper.abstract)


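# --- Step 3 sketch: store papers with metadata for retrieval ----------------
# A minimal sketch using the standard-library sqlite3 module; the schema and
# the LIKE-based keyword search are stand-ins for whatever database and
# retrieval mechanism (e.g. embeddings) the swarm ends up using.
import sqlite3


class PaperStore:
    def __init__(self, path="papers.db"):
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS papers ("
            "title TEXT, date TEXT, authors TEXT, abstract TEXT)"
        )

    def add(self, paper):
        self.conn.execute(
            "INSERT INTO papers VALUES (?, ?, ?, ?)",
            (paper.title, paper.date, ", ".join(paper.authors), paper.abstract),
        )
        self.conn.commit()

    def search(self, keyword):
        # Crude keyword retrieval agents could call; swap in embeddings later.
        cur = self.conn.execute(
            "SELECT title, abstract FROM papers WHERE abstract LIKE ?",
            (f"%{keyword}%",),
        )
        return cur.fetchall()

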
scraper = Scraper("https://huggingface.co/papers")
file_writer = FileWriter("papers")

links = scraper.get_paper_links()
for link in links:
    paper = scraper.get_paper_details(link)
    file_writer.write_paper(paper)
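

# --- Step 4 sketch: Discord bot ----------------------------------------------
# A minimal sketch assuming the discord.py package and a DISCORD_TOKEN in the
# environment; the !paper command name and the reuse of PaperStore are
# placeholder choices. The Twitter side is left out here.
def run_discord_bot():
    import discord  # imported here so the scraper runs without the package
    from discord.ext import commands

    intents = discord.Intents.default()
    intents.message_content = True  # required to read command text
    bot = commands.Bot(command_prefix="!", intents=intents)

    @bot.command(name="paper")
    async def paper(ctx, *, keyword):
        # Answers queries like: !paper diffusion
        results = PaperStore().search(keyword)
        if results:
            title, abstract = results[0]
            await ctx.send(f"**{title}**\n{abstract[:1500]}")
        else:
            await ctx.send("No matching papers found.")

    bot.run(os.environ["DISCORD_TOKEN"])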