From de4088283ed6838de7ddb45f39edabe5d4e206a9 Mon Sep 17 00:00:00 2001 From: Pavan Kumar <66913595+ascender1729@users.noreply.github.com> Date: Fri, 13 Jun 2025 21:29:03 +0530 Subject: [PATCH] Handle binary files in data_to_text --- swarms/utils/data_to_text.py | 39 ++++++++++++++++++++++++++------ tests/utils/test_data_to_text.py | 17 ++++++++++++++ 2 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 tests/utils/test_data_to_text.py diff --git a/swarms/utils/data_to_text.py b/swarms/utils/data_to_text.py index 562f8098..5edaf741 100644 --- a/swarms/utils/data_to_text.py +++ b/swarms/utils/data_to_text.py @@ -1,10 +1,24 @@ import csv import json import os +import base64 from swarms.utils.pdf_to_text import pdf_to_text +# Binary file extensions that should be opened in binary mode +BINARY_EXTENSIONS = { + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".wav", + ".mp3", + ".mp4", +} + + def csv_to_text(file: str) -> str: """ Converts a CSV file to text format. @@ -18,6 +32,7 @@ def csv_to_text(file: str) -> str: Raises: FileNotFoundError: If the file does not exist. IOError: If there is an error reading the file. + ValueError: If the file extension is unsupported. """ with open(file) as file: @@ -93,6 +108,7 @@ def md_to_text(file: str) -> str: def data_to_text(file: str) -> str: """ Converts the given data file to text format. + Binary files are returned as base64 encoded strings. Args: file (str): The path to the data file. @@ -126,14 +142,23 @@ def data_to_text(file: str) -> str: return pdf_to_text(file) elif ext == ".md": return md_to_text(file) + elif ext in BINARY_EXTENSIONS: + try: + with open(file, "rb") as binary_file: + data = base64.b64encode( + binary_file.read() + ).decode("utf-8") + return data + except Exception as e: + raise OSError( + f"Error reading binary file: {file}" + ) from e else: - # Check if the file is a binary file (like an image) - if ext in [".png", ".jpg", ".jpeg", ".gif", ".bmp"]: - # Skip binary files - return None - else: - with open(file) as file: - data = file.read() + try: + with open(file, "r", encoding="utf-8") as file_obj: + data = file_obj.read() return data + except UnicodeDecodeError: + raise ValueError(f"Unsupported file extension: {ext}") except Exception as e: raise OSError(f"Error reading file: {file}") from e diff --git a/tests/utils/test_data_to_text.py b/tests/utils/test_data_to_text.py new file mode 100644 index 00000000..9c1bc102 --- /dev/null +++ b/tests/utils/test_data_to_text.py @@ -0,0 +1,17 @@ +import base64 +from swarms.utils.data_to_text import data_to_text + + +def test_data_to_text_binary(tmp_path): + binary_path = tmp_path / "image.png" + binary_bytes = b"\x89PNG\r\n\x1a\n" + binary_path.write_bytes(binary_bytes) + encoded = data_to_text(str(binary_path)) + assert base64.b64decode(encoded) == binary_bytes + + +def test_data_to_text_text(tmp_path): + text_path = tmp_path / "file.txt" + text_content = "hello" + text_path.write_text(text_content) + assert data_to_text(str(text_path)) == text_content