You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
54 lines
1.3 KiB
54 lines
1.3 KiB
from swarms.utils.try_except_wrapper import try_except_wrapper
|
|
|
|
# pypdf is needed for PDF parsing. If it cannot be imported, install it with
# pip using the interpreter that is running this module, then import it again.
try:
    import pypdf
except ImportError:
    import subprocess
    import sys

    # check_call raises CalledProcessError if the pip command exits non-zero,
    # so a failed install surfaces here instead of at the retry import below.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "pypdf"]
    )
    import pypdf
|
|
|
|
|
|
@try_except_wrapper
def pdf_to_text(pdf_path: str) -> str:
    """
    Converts a PDF file to a string of text.

    The pages of the document are visited in order; the text extracted from
    each page is added to the result followed by a newline character.

    NOTE(review): this function is decorated with ``try_except_wrapper``,
    which is defined outside this file. Whether the exceptions listed below
    actually reach the caller depends on that wrapper -- confirm.

    Args:
        pdf_path (str): The path to the PDF file to be converted.

    Returns:
        str: The text extracted from the PDF, with a newline after each page.

    Raises:
        FileNotFoundError: If the PDF file is not found at the specified path.
        Exception: If there is any other error in reading the PDF file. The
            original error is attached as the exception's ``__cause__``.
    """
    try:
        # Open the PDF file in binary mode and hand the handle to the reader
        with open(pdf_path, "rb") as file:
            pdf_reader = pypdf.PdfReader(file)

            # Pages are read while the file handle is still open. A single
            # join builds the result without repeated string concatenation.
            return "".join(
                page.extract_text() + "\n" for page in pdf_reader.pages
            )
    except FileNotFoundError as e:
        # Same exception type and message as before, now explicitly chained
        raise FileNotFoundError(
            f"The file at {pdf_path} was not found."
        ) from e
    except Exception as e:
        # Keep the generic Exception type callers may rely on, but chain the
        # underlying error so its traceback is not lost.
        raise Exception(
            f"An error occurred while reading the PDF file: {e}"
        ) from e
|
|
|
|
|
|
# Example usage
|
|
# text = pdf_to_text("test.pdf")
|
|
# print(text)
|