ascender1729/pavan/detect-binary-extensions-and-handle-errors

Handle binary file loading
pull/885/head
Pavan Kumar 1 month ago committed by GitHub
commit 088442b498
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -1,10 +1,24 @@
import csv import csv
import json import json
import os import os
import base64
from swarms.utils.pdf_to_text import pdf_to_text from swarms.utils.pdf_to_text import pdf_to_text
# File extensions that are read in binary mode and handed back as
# base64 text rather than being decoded as UTF-8.
BINARY_EXTENSIONS = {
    # still images
    ".png", ".jpg", ".jpeg", ".gif", ".bmp",
    # audio and video
    ".wav", ".mp3", ".mp4",
}
def csv_to_text(file: str) -> str: def csv_to_text(file: str) -> str:
""" """
Converts a CSV file to text format. Converts a CSV file to text format.
@ -18,6 +32,7 @@ def csv_to_text(file: str) -> str:
Raises: Raises:
FileNotFoundError: If the file does not exist. FileNotFoundError: If the file does not exist.
IOError: If there is an error reading the file. IOError: If there is an error reading the file.
ValueError: If the file extension is unsupported.
""" """
with open(file) as file: with open(file) as file:
@ -93,6 +108,7 @@ def md_to_text(file: str) -> str:
def data_to_text(file: str) -> str: def data_to_text(file: str) -> str:
""" """
Converts the given data file to text format. Converts the given data file to text format.
Binary files are returned as base64 encoded strings.
Args: Args:
file (str): The path to the data file. file (str): The path to the data file.
@ -126,14 +142,23 @@ def data_to_text(file: str) -> str:
return pdf_to_text(file) return pdf_to_text(file)
elif ext == ".md": elif ext == ".md":
return md_to_text(file) return md_to_text(file)
elif ext in BINARY_EXTENSIONS:
try:
with open(file, "rb") as binary_file:
data = base64.b64encode(
binary_file.read()
).decode("utf-8")
return data
except Exception as e:
raise OSError(
f"Error reading binary file: {file}"
) from e
else: else:
# Check if the file is a binary file (like an image) try:
if ext in [".png", ".jpg", ".jpeg", ".gif", ".bmp"]: with open(file, "r", encoding="utf-8") as file_obj:
# Skip binary files data = file_obj.read()
return None
else:
with open(file) as file:
data = file.read()
return data return data
except UnicodeDecodeError:
raise ValueError(f"Unsupported file extension: {ext}")
except Exception as e: except Exception as e:
raise OSError(f"Error reading file: {file}") from e raise OSError(f"Error reading file: {file}") from e

@ -0,0 +1,17 @@
import base64
from swarms.utils.data_to_text import data_to_text
def test_data_to_text_binary(tmp_path):
    """A ``.png`` file comes back as base64 text that decodes to its bytes."""
    payload = b"\x89PNG\r\n\x1a\n"
    png_file = tmp_path / "image.png"
    png_file.write_bytes(payload)

    result = data_to_text(str(png_file))

    # Round-trip through base64 to prove no bytes were lost or altered.
    assert base64.b64decode(result) == payload
def test_data_to_text_text(tmp_path):
    """A ``.txt`` file comes back as its text content, unchanged."""
    expected = "hello"
    txt_file = tmp_path / "file.txt"
    txt_file.write_text(expected)

    assert data_to_text(str(txt_file)) == expected
Loading…
Cancel
Save