You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
swarms/swarms/utils/pdf_to_text.py

51 lines
1.2 KiB

import sys
try:
import pypdf
except ImportError:
print(
"pypdf not installed. Please install it using: pip install"
" pypdf"
)
sys.exit(1)
def pdf_to_text(pdf_path):
"""
Converts a PDF file to a string of text.
Args:
pdf_path (str): The path to the PDF file to be converted.
Returns:
str: The text extracted from the PDF.
Raises:
FileNotFoundError: If the PDF file is not found at the specified path.
Exception: If there is an error in reading the PDF file.
"""
try:
# Open the PDF file
with open(pdf_path, "rb") as file:
pdf_reader = pypdf.PdfReader(file)
text = ""
# Iterate through each page and extract text
for page in pdf_reader.pages:
text += page.extract_text() + "\n"
return text
except FileNotFoundError:
raise FileNotFoundError(
f"The file at {pdf_path} was not found."
)
except Exception as e:
raise Exception(
f"An error occurred while reading the PDF file: {e}"
)
# Example usage
# text = pdf_to_text("test.pdf")
# print(text)