diff --git a/docs/swarms/utils/pdf_to_text.md b/docs/swarms/utils/pdf_to_text.md index aecde1a9..246ac931 100644 --- a/docs/swarms/utils/pdf_to_text.md +++ b/docs/swarms/utils/pdf_to_text.md @@ -1,7 +1,8 @@ # pdf_to_text -## Introduction -The function `pdf_to_text` is a Python utility for converting a PDF file into a string of text content. It leverages the `PyPDF2` library, an excellent Python library for processing PDF files. The function takes in a PDF file's path and reads its content, subsequently returning the extracted textual data. +## Introduction + +The function `pdf_to_text` is a Python utility for converting a PDF file into a string of text content. It leverages the `pypdf` library, an excellent Python library for processing PDF files. The function takes in a PDF file's path and reads its content, subsequently returning the extracted textual data. This function can be very useful when you want to extract textual information from PDF files automatically. For instance, when processing a large number of documents, performing textual analysis, or when you're dealing with text data that is only available in PDF format. @@ -34,14 +35,14 @@ def pdf_to_text(pdf_path: str) -> str: ## Function Description -`pdf_to_text` utilises the `PdfReader` function from the `PyPDF2` library to read the PDF file. If the PDF file does not exist at the specified path or there was an error while reading the file, appropriate exceptions will be raised. +`pdf_to_text` utilises the `PdfReader` class from the `pypdf` library to read the PDF file. If the PDF file does not exist at the specified path or there was an error while reading the file, appropriate exceptions will be raised. 
It then iterates through each page in the PDF and uses the `extract_text` function to extract the text content from each page. These contents are then concatenated into a single variable and returned as the result. ## Usage Examples -To use this function, you first need to install the `PyPDF2` library. It can be installed via pip: +To use this function, you first need to install the `pypdf` library. It can be installed via pip: ```python -!pip install pypdf2 +!pip install pypdf ``` Then, you should import the `pdf_to_text` function: @@ -68,4 +69,4 @@ print(text) - This function reads the text from the PDF. It does not handle images, graphical elements, or any non-text content. - If the PDF contains scanned images rather than textual data, the `extract_text` function may not be able to extract any text. In such cases, you would require OCR (Optical Character Recognition) tools to extract the text. - Be aware of the possibility that the output string might contain special characters or escape sequences because they were part of the PDF's content. You might need to clean the resulting text according to your requirements. -- The function uses the PyPDF2 library to facilitate the PDF reading and text extraction. For any issues related to PDF manipulation, consult the [PyPDF2 library documentation](https://pythonhosted.org/PyPDF2/). +- The function uses the pypdf library to facilitate the PDF reading and text extraction. For any issues related to PDF manipulation, consult the [pypdf library documentation](https://pypdf.readthedocs.io/). 
diff --git a/pyproject.toml b/pyproject.toml index 90ab06e6..58ca7f59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ datasets = "*" optimum = "1.15.0" diffusers = "*" toml = "*" -PyPDF2 = "3.0.1" +pypdf = "4.0.1" accelerate = "*" anthropic = "*" sentencepiece = "0.1.98" @@ -77,7 +77,6 @@ pinecone-client = "*" roboflow = "*" - [tool.poetry.group.lint.dependencies] ruff = ">=0.0.249,<0.1.7" types-toml = "^0.10.8.1" diff --git a/requirements.txt b/requirements.txt index e6496205..e582fa25 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ huggingface-hub google-generativeai==0.3.1 sentencepiece==0.1.98 requests_mock -PyPDF2==3.0.1 +pypdf==4.0.1 accelerate==0.22.0 chromadb tensorflow diff --git a/swarms/loaders/pdf_loader.py b/swarms/loaders/pdf_loader.py index f3db1448..17e0b465 100644 --- a/swarms/loaders/pdf_loader.py +++ b/swarms/loaders/pdf_loader.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from pathlib import Path from typing import IO, Dict, List, Optional -from PyPDF2 import PdfReader +from pypdf import PdfReader from swarms.utils.hash import str_to_hash diff --git a/swarms/utils/pdf_to_text.py b/swarms/utils/pdf_to_text.py index 6d589ad5..4877f3b1 100644 --- a/swarms/utils/pdf_to_text.py +++ b/swarms/utils/pdf_to_text.py @@ -1,11 +1,11 @@ import sys try: - import PyPDF2 + import pypdf except ImportError: print( - "PyPDF2 not installed. Please install it using: pip install" - " PyPDF2" + "pypdf not installed. 
Please install it using: pip install" + " pypdf" ) sys.exit(1) @@ -27,7 +27,7 @@ def pdf_to_text(pdf_path): try: # Open the PDF file with open(pdf_path, "rb") as file: - pdf_reader = PyPDF2.PdfReader(file) + pdf_reader = pypdf.PdfReader(file) text = "" # Iterate through each page and extract text diff --git a/tests/utils/test_pdf_to_text.py b/tests/utils/test_pdf_to_text.py index 57e3b33f..888ed764 100644 --- a/tests/utils/test_pdf_to_text.py +++ b/tests/utils/test_pdf_to_text.py @@ -1,12 +1,12 @@ import pytest -import PyPDF2 +import pypdf from swarms.utils import pdf_to_text @pytest.fixture def pdf_file(tmpdir): - pdf_writer = PyPDF2.PdfWriter() - pdf_page = PyPDF2.pdf.PageObject.createBlankPage(None, 200, 200) + pdf_writer = pypdf.PdfWriter() + pdf_page = pypdf.pdf.PageObject.createBlankPage(None, 200, 200) pdf_writer.add_page(pdf_page) pdf_file = tmpdir.join("temp.pdf") with open(pdf_file, "wb") as output: