parent 9e2fda0a62
commit 4480ead6aa
@@ -0,0 +1,66 @@
"""
Paper Swarm

1. Scrape https://huggingface.co/papers for all papers by searching the page for every link containing /papers/, then follow each link, grab the header and the abstract,
   along with the various links, and add them to a txt file for each paper on https://huggingface.co/papers.

2. Feed prompts iteratively into Anthropic for summarization plus a value score on impact, reliability, novelty, and other paper-ranking mechanisms
   (a rough sketch of this step appears at the end of this file).

3. Store papers in a database with metadata so agents can use retrieval
   (a rough sketch of this step appears at the end of this file).

4. Discord Bot // Twitter Bot
"""

import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin


class Paper:
    """Container for one scraped paper and its metadata."""

    def __init__(self, title, date, authors, abstract):
        self.title = title
        self.date = date
        self.authors = authors
        self.abstract = abstract


class Scraper:
    """Scrapes paper links and paper details from https://huggingface.co/papers."""

    def __init__(self, url):
        self.url = url

    def get_paper_links(self):
        # Collect every link on the index page that points at a /papers/ URL.
        response = requests.get(self.url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True) if '/papers/' in a['href']]
        return links

    def get_paper_details(self, link):
        # The hrefs scraped above are absolute paths such as "/papers/<id>", so join
        # them against the site root; naive concatenation with self.url produces
        # /papers/papers/<id> and a 404 page (see the sample output file below).
        response = requests.get(urljoin(self.url, link))
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('h1').text.strip()
        date_tag = soup.find('time')
        date = date_tag.text if date_tag else 'Unknown'
        authors = [author.text for author in soup.find_all('span', class_='author')]
        abstract_tag = soup.find('div', class_='abstract')
        abstract = abstract_tag.text if abstract_tag else 'Abstract not found'
        return Paper(title, date, authors, abstract)


class FileWriter:
    """Writes one txt file per paper into the target directory."""

    def __init__(self, directory):
        self.directory = directory
        os.makedirs(self.directory, exist_ok=True)  # make sure the target directory exists

    def write_paper(self, paper):
        # Note: titles containing path separators will still produce a bad path here.
        with open(os.path.join(self.directory, paper.title + '.txt'), 'w') as f:
            f.write(f"h1: {paper.title}\n")
            f.write(f"Published on {paper.date}\n")
            f.write("Authors:\n")
            for author in paper.authors:
                f.write(f"{author}\n")
            f.write("Abstract\n")
            f.write(paper.abstract)


scraper = Scraper('https://huggingface.co/papers')
file_writer = FileWriter('images')

links = scraper.get_paper_links()
for link in links:
    paper = scraper.get_paper_details(link)
    file_writer.write_paper(paper)
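

# Step 2 from the module docstring is not implemented in this commit. Below is a
# minimal sketch of what the Anthropic summarization / scoring pass could look
# like, assuming the `anthropic` SDK is installed and ANTHROPIC_API_KEY is set;
# the model name and the scoring prompt are placeholders, not the author's choices.
def summarize_and_score(paper):
    import anthropic  # optional dependency, imported lazily for this sketch

    client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
    prompt = (
        "Summarize the following paper and rate it from 1-10 on impact, "
        "reliability, and novelty.\n\n"
        f"Title: {paper.title}\n\nAbstract: {paper.abstract}"
    )
    message = client.messages.create(
        model="claude-3-haiku-20240307",  # placeholder model name
        max_tokens=512,
        messages=[{"role": "user", "content": prompt}],
    )
    return message.content[0].text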
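

# Step 3 from the module docstring is likewise not implemented here. A minimal
# sketch of storing papers with their metadata in a local SQLite database for
# later retrieval; the database path, table name, and schema are assumptions.
def store_paper(db_path, paper):
    import sqlite3  # stdlib, imported here to keep the sketch self-contained

    # Create the table on first use, then upsert the paper keyed by title.
    conn = sqlite3.connect(db_path)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS papers ("
        "title TEXT PRIMARY KEY, date TEXT, authors TEXT, abstract TEXT)"
    )
    conn.execute(
        "INSERT OR REPLACE INTO papers (title, date, authors, abstract) "
        "VALUES (?, ?, ?, ?)",
        (paper.title, paper.date, ", ".join(paper.authors), paper.abstract),
    )
    conn.commit()
    conn.close()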
@@ -0,0 +1,5 @@
h1: 404
Published on Unknown
Authors:
Abstract
Abstract not found
@@ -1,2 +0,0 @@
# Agents
* Agents are the fundamental building blocks of a swarm, they are indivi