parent
ea98cbdae8
commit
2c9bad43fb
@ -1,77 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import IO
|
|
||||||
|
|
||||||
from pypdf import PdfReader
|
|
||||||
|
|
||||||
from swarms.utils.hash import str_to_hash
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class TextArtifact:
    """A piece of text extracted from a document.

    Attributes:
        text (str): The extracted text.
    """

    text: str
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class PDFLoader:
    """
    A class for loading PDF files and extracting text artifacts.

    Each page of a PDF becomes one ``TextArtifact``. The chunker built from
    ``tokenizer`` and ``max_tokens`` is stored on the instance but is not
    applied by any of the load methods in this class.

    Args:
        tokenizer (str): The tokenizer passed to the chunker.
        max_tokens (int): The maximum number of tokens per chunk, passed
            to the chunker.

    Methods:
        load(source, password=None, *args, **kwargs):
            Load a single PDF file and extract text artifacts.

        load_collection(sources, password=None, *args, **kwargs):
            Load a collection of PDF files and extract text artifacts.

    Private Methods:
        _load_pdf(stream, password=None):
            Load a PDF file and extract text artifacts.

    Attributes:
        tokenizer (str): The tokenizer passed to the chunker.
        max_tokens (int): The maximum number of tokens per chunk.
        chunker: A ``PdfChunker`` built from the two settings above, or
            ``None`` when the chunker class cannot be imported.
    """

    tokenizer: str
    max_tokens: int

    def __post_init__(self):
        """Build the chunker once the dataclass fields are set.

        ``PdfChunker`` was referenced here without ever being imported, so
        constructing a loader raised ``NameError``. It is now resolved
        lazily; if it cannot be imported the loader is still usable and
        ``chunker`` is ``None``.
        """
        try:
            # NOTE(review): module path is assumed from the package layout,
            # not shown in this file -- confirm before merging.
            from swarms.chunkers.pdf_chunker import PdfChunker
        except ImportError:
            # Page extraction below does not use the chunker, so a missing
            # chunker must not prevent the loader from being constructed.
            self.chunker = None
            return

        self.chunker = PdfChunker(
            tokenizer=self.tokenizer, max_tokens=self.max_tokens
        )

    def load(
        self,
        source: str | IO | Path,
        password: str | None = None,
        *args,
        **kwargs,
    ) -> list[TextArtifact]:
        """Load a single PDF and return one artifact per page.

        Args:
            source: Path or open stream handed to ``PdfReader``.
            password: Password forwarded to ``PdfReader``; ``None`` if the
                document is not protected.
            *args: Accepted for interface compatibility; ignored.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            list[TextArtifact]: The extracted text of each page, in page
            order.
        """
        return self._load_pdf(source, password)

    def load_collection(
        self,
        sources: list[str | IO | Path],
        password: str | None = None,
        *args,
        **kwargs,
    ) -> dict[str, list[TextArtifact]]:
        """Load several PDFs, all opened with the same password.

        Args:
            sources: Paths or open streams, each handed to ``PdfReader``.
            password: Password forwarded to ``PdfReader`` for every source.
            *args: Accepted for interface compatibility; ignored.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            dict[str, list[TextArtifact]]: Per-source page artifacts, keyed
            by ``str_to_hash(str(source))``. Sources whose string forms are
            equal share a key, so the later one overwrites the earlier.
        """
        return {
            str_to_hash(str(s)): self._load_pdf(s, password)
            for s in sources
        }

    def _load_pdf(
        self, stream: str | IO | Path, password: str | None
    ) -> list[TextArtifact]:
        """Read a PDF with ``PdfReader`` and wrap each page's text.

        Args:
            stream: Path or open stream handed to ``PdfReader``.
            password: Password forwarded to ``PdfReader``.

        Returns:
            list[TextArtifact]: One artifact per entry of ``reader.pages``,
            holding that page's ``extract_text()`` result.
        """
        reader = PdfReader(stream, strict=True, password=password)
        return [
            TextArtifact(text=p.extract_text()) for p in reader.pages
        ]
|
|
Loading…
Reference in new issue