You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
swarms/swarms/chunkers/pdf.py

20 lines
430 B

from swarms.chunkers.base import BaseChunker
from swarms.chunkers.chunk_seperator import ChunkSeparator
class PdfChunker(BaseChunker):
DEFAULT_SEPARATORS = [
ChunkSeparator("\n\n"),
ChunkSeparator(". "),
ChunkSeparator("! "),
ChunkSeparator("? "),
ChunkSeparator(" "),
]
# # Example
# pdf = "swarmdeck.pdf"
# chunker = PdfChunker()
# chunks = chunker.chunk(pdf)
# print(chunks)