pull/393/head
Kye 11 months ago
parent ea98cbdae8
commit 2c9bad43fb

@ -19,9 +19,7 @@ agent = Agent(
# Task
task = Task(
agent=agent,
description=(
"Download https://www.coachcamel.com/"
),
description="Download https://www.coachcamel.com/",
)
# Swarm

@ -14,4 +14,3 @@ chroma.add(text)
# Search for similar text
similar_text = chroma.query(text)

@ -1,4 +1,3 @@
import os
from dotenv import load_dotenv

@ -12,6 +12,7 @@ api_key = os.environ.get("OPENAI_API_KEY")
llm = OpenAIChat(api_key=api_key)
@tool
def search_api(query: str) -> str:
"""Search API

@ -37,8 +37,6 @@ class MultiOnAgent(AbstractLLM):
self.max_steps = max_steps
self.starting_url = starting_url
def run(self, task: str, *args, **kwargs):
"""
Runs a browsing task.

@ -1,77 +0,0 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import IO
from pypdf import PdfReader
from swarms.utils.hash import str_to_hash
@dataclass
class TextArtifact:
    """Plain container holding a single string of text.

    Attributes:
        text (str): The text carried by this artifact.
    """

    text: str
@dataclass
class PDFLoader:
    """Load PDF files and return their text as ``TextArtifact`` objects.

    Args:
        tokenizer (str): Tokenizer identifier forwarded to the chunker.
        max_tokens (int): Maximum tokens per chunk, forwarded to the chunker.

    Attributes:
        tokenizer (str): Value supplied at construction.
        max_tokens (int): Value supplied at construction.
        chunker: ``PdfChunker`` built in ``__post_init__``; it is not used
            by any method of this class.

    Methods:
        load(source, password=None, *args, **kwargs):
            Read one PDF and return one artifact per page.
        load_collection(sources, password=None, *args, **kwargs):
            Read several PDFs and return their artifacts keyed by a hash
            of each source.
    """

    tokenizer: str
    max_tokens: int

    def __post_init__(self):
        """Build the chunker from the dataclass fields."""
        # NOTE(review): PdfChunker is not imported in the visible source.
        # Confirm it is in scope; otherwise constructing a PDFLoader raises
        # NameError here.
        self.chunker = PdfChunker(
            tokenizer=self.tokenizer, max_tokens=self.max_tokens
        )

    def load(
        self,
        source: str | IO | Path,
        password: str | None = None,
        *args,
        **kwargs,
    ) -> list[TextArtifact]:
        """Read a single PDF and return one ``TextArtifact`` per page.

        Args:
            source: Path or stream handed to ``PdfReader``.
            password: Optional password forwarded to ``PdfReader``.
            *args: Accepted for interface compatibility; ignored.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            The page texts, in the order ``PdfReader`` yields the pages.
        """
        return self._load_pdf(source, password)

    def load_collection(
        self,
        sources: list[str | IO | Path],
        password: str | None = None,
        *args,
        **kwargs,
    ) -> dict[str, list[TextArtifact]]:
        """Read several PDFs, all opened with the same ``password``.

        Args:
            sources: Paths or streams, each handed to ``PdfReader``.
            password: Optional password forwarded for every source.
            *args: Accepted for interface compatibility; ignored.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            Mapping from ``str_to_hash(str(source))`` to that source's
            page artifacts. Sources whose ``str()`` forms are equal share
            a key, so the later one replaces the earlier one.
        """
        return {
            str_to_hash(str(s)): self._load_pdf(s, password)
            for s in sources
        }

    def _load_pdf(
        self, stream: str | IO | Path, password: str | None
    ) -> list[TextArtifact]:
        """Open ``stream`` with ``PdfReader`` and extract text page by page."""
        # strict=True is passed straight through to pypdf's reader.
        reader = PdfReader(stream, strict=True, password=password)
        return [
            TextArtifact(text=p.extract_text()) for p in reader.pages
        ]
Loading…
Cancel
Save