From 4ece24851ff9dc48c0290100436a40ebcb243422 Mon Sep 17 00:00:00 2001 From: Kye Date: Wed, 13 Dec 2023 01:04:01 -0800 Subject: [PATCH] [GEMINI][FEAT][TESTS][DOCS] --- docs/swarms/models/gemini.md | 180 +++++++++++++++++++++++++++++ pyproject.toml | 2 +- swarms/models/gemini.py | 160 +++++++++++++++++++++++++ tests/models/test_gemini.py | 218 +++++++++++++++++++++++++++++++++++ 4 files changed, 559 insertions(+), 1 deletion(-) create mode 100644 docs/swarms/models/gemini.md create mode 100644 swarms/models/gemini.py create mode 100644 tests/models/test_gemini.py diff --git a/docs/swarms/models/gemini.md b/docs/swarms/models/gemini.md new file mode 100644 index 00000000..3c2c35b7 --- /dev/null +++ b/docs/swarms/models/gemini.md @@ -0,0 +1,180 @@ +## `Gemini` Documentation + +### Introduction + +The Gemini module is a versatile tool for leveraging the power of multimodal AI models to generate content. It allows users to combine textual and image inputs to generate creative and informative outputs. In this documentation, we will explore the Gemini module in detail, covering its purpose, architecture, methods, and usage examples. + +#### Purpose + +The Gemini module is designed to bridge the gap between text and image data, enabling users to harness the capabilities of multimodal AI models effectively. By providing both a textual task and an image as input, Gemini generates content that aligns with the specified task and incorporates the visual information from the image. + +### Installation + +Before using Gemini, ensure that you have the required dependencies installed. You can install them using the following commands: + +```bash +pip install swarms +pip install google-generativeai +pip install python-dotenv +``` + +### Class: Gemini + +#### Overview + +The `Gemini` class is the central component of the Gemini module. It inherits from the `BaseMultiModalModel` class and provides methods to interact with the Gemini AI model. 
Let's dive into its architecture and functionality. + +##### Class Constructor + +```python +class Gemini(BaseMultiModalModel): + def __init__( + self, + model_name: str = "gemini-pro", + gemini_api_key: str = get_gemini_api_key_env, + *args, + **kwargs, + ): +``` + +| Parameter | Type | Description | Default Value | +|---------------------|---------|------------------------------------------------------------------|--------------------| +| `model_name` | str | The name of the Gemini model. | "gemini-pro" | +| `gemini_api_key` | str | The Gemini API key. If not provided, it is fetched from the environment. | (None) | + +- `model_name`: Specifies the name of the Gemini model to use. By default, it is set to "gemini-pro," but you can specify a different model if needed. + +- `gemini_api_key`: This parameter allows you to provide your Gemini API key directly. If not provided, the constructor attempts to fetch it from the environment using the `get_gemini_api_key_env` helper function. + +##### Methods + +1. **run()** + + ```python + def run( + self, + task: str = None, + img: str = None, + *args, + **kwargs, + ) -> str: + ``` + + | Parameter | Type | Description | + |---------------|----------|--------------------------------------------| + | `task` | str | The textual task for content generation. | + | `img` | str | The path to the image to be processed. | + | `*args` | Variable | Additional positional arguments. | + | `**kwargs` | Variable | Additional keyword arguments. | + + - `task`: Specifies the textual task for content generation. It can be a sentence or a phrase that describes the desired content. + + - `img`: Provides the path to the image that will be processed along with the textual task. Gemini combines the visual information from the image with the textual task to generate content. + + - `*args` and `**kwargs`: Allow for additional, flexible arguments that can be passed to the underlying Gemini model. 
These arguments can vary based on the specific Gemini model being used. + + **Returns**: A string containing the generated content. + + **Examples**: + + ```python + from swarms.models import Gemini + + # Initialize the Gemini model + gemini = Gemini() + + # Generate content for a textual task with an image + generated_content = gemini.run( + task="Describe this image", + img="image.jpg", + ) + + # Print the generated content + print(generated_content) + ``` + + In this example, we initialize the Gemini model, provide a textual task, and specify an image for processing. The `run()` method generates content based on the input and returns the result. + +2. **process_img()** + + ```python + def process_img( + self, + img: str = None, + type: str = "image/png", + *args, + **kwargs, + ): + ``` + + | Parameter | Type | Description | Default Value | + |---------------|----------|------------------------------------------------------|----------------| + | `img` | str | The path to the image to be processed. | (None) | + | `type` | str | The MIME type of the image (e.g., "image/png"). | "image/png" | + | `*args` | Variable | Additional positional arguments. | - | + | `**kwargs` | Variable | Additional keyword arguments. | - | + + - `img`: Specifies the path to the image that will be processed. It's essential to provide a valid image path for image-based content generation. + + - `type`: Indicates the MIME type of the image. By default, it is set to "image/png," but you can change it based on the image format you're using. + + - `*args` and `**kwargs`: Allow for additional, flexible arguments that can be passed to the underlying Gemini model. These arguments can vary based on the specific Gemini model being used. + + **Raises**: ValueError if any of the following conditions are met: + - No image is provided. + - The image type is not specified. + - The Gemini API key is missing. 
+ + **Examples**: + + ```python + from swarms.models.gemini import Gemini + + # Initialize the Gemini model + gemini = Gemini() + + # Process an image on its own (optional; run() does this internally) + processed_image = gemini.process_img( + img="image.jpg", + type="image/jpeg", + ) + + # Generate content: pass the image path, not the processed image + generated_content = gemini.run( + task="Describe this image", + img="image.jpg", + ) + + # Print the generated content + print(generated_content) + ``` + + In this example, we demonstrate how to process an image using the `process_img()` method. Note that `run()` expects the path to the image and calls `process_img()` itself, so the image path (not the processed image) is passed to `run()`. + +#### Additional Information + +- Gemini is designed to work seamlessly with various multimodal AI models, making it a powerful tool for content generation tasks. + +- The module uses the `google.generativeai` package to access the underlying AI models. Ensure that you have this package installed to leverage the full capabilities of Gemini. + +- It's essential to provide a valid Gemini API key for authentication. You can either pass it directly during initialization or store it in the `GEMINI_API_KEY` environment variable. + +- Gemini's flexibility allows you to experiment with different Gemini models and tailor the content generation process to your specific needs. + +- Keep in mind that Gemini is designed to handle both textual and image inputs, making it a valuable asset for various applications, including natural language processing and computer vision tasks. + +- If you encounter any issues or have specific requirements, refer to the Gemini documentation for more details and advanced usage. + +### References and Resources + +- [Gemini GitHub Repository](https://github.com/swarms/gemini): Explore the Gemini repository for additional information, updates, and examples. 
import importlib
import mimetypes
import os
import subprocess as sp
from pathlib import Path
from typing import Callable, List, Optional, Union

from dotenv import load_dotenv

from swarms.models.base_multimodal_model import BaseMultiModalModel

try:
    import google.generativeai as genai
except ImportError as error:
    print(f"Error importing google.generativeai: {error}")
    print("Please install the google.generativeai package")
    print("pip install google-generativeai")
    sp.run(["pip", "install", "--upgrade", "google-generativeai"])
    # Retry the import after the install attempt; without this the name
    # `genai` stays unbound and every later use fails with a NameError.
    importlib.invalidate_caches()
    try:
        import google.generativeai as genai
    except ImportError:
        # Keep module import best-effort; Gemini.__init__ reports the
        # missing dependency with a clear message instead.
        genai = None


load_dotenv()


# Helpers
def get_gemini_api_key_env():
    """Return the Gemini API key stored in the environment.

    Raises:
        ValueError: If the ``GEMINI_API_KEY`` environment variable is not set.

    Returns:
        str: The value of ``GEMINI_API_KEY``.
    """
    key = os.getenv("GEMINI_API_KEY")
    if key is None:
        raise ValueError("Please provide a Gemini API key")
    return key


# Main class
class Gemini(BaseMultiModalModel):
    """Wrapper around a ``google.generativeai`` generative model.

    Args:
        model_name (str, optional): Model name. Defaults to "gemini-pro".
        gemini_api_key (str or callable, optional): The API key, or a
            zero-argument callable returning it. Defaults to
            ``get_gemini_api_key_env``, which reads ``GEMINI_API_KEY``
            when the instance is created.

    Methods:
        run: Generate content from a task and an optional image.
        process_img: Load an image file into the payload sent to the model.
        chat: Send a single message in a fresh chat session.

    Examples:
        >>> from swarms.models import Gemini
        >>> gemini = Gemini()
        >>> gemini.run(
        ...     task="A dog",
        ...     img="dog.png",
        ... )
    """

    def __init__(
        self,
        model_name: str = "gemini-pro",
        gemini_api_key: Union[
            str, Callable[[], str], None
        ] = get_gemini_api_key_env,
        *args,
        **kwargs,
    ):
        super().__init__(model_name, *args, **kwargs)
        if genai is None:
            raise ImportError(
                "google-generativeai is not installed; "
                "run `pip install google-generativeai`"
            )

        self.model_name = model_name

        # The default is the helper *function*, so call it here. Storing
        # it uncalled meant the key was never actually resolved.
        if callable(gemini_api_key):
            gemini_api_key = gemini_api_key()
        self.gemini_api_key = gemini_api_key

        # Hand the key to the SDK; previously it was stored but never used.
        # An explicit None is tolerated here so process_img() can report it.
        if self.gemini_api_key is not None:
            genai.configure(api_key=self.gemini_api_key)

        # Initialize the model
        self.model = genai.GenerativeModel(
            model_name, *args, **kwargs
        )

    def _load_image_parts(self, img: str) -> List[dict]:
        """Load ``img`` via process_img, guessing its MIME type from the name."""
        guessed, _ = mimetypes.guess_type(str(img))
        if guessed is None or not guessed.startswith("image/"):
            # Fall back to process_img's own default type.
            return self.process_img(img)
        return self.process_img(img, guessed)

    def run(
        self,
        task: Optional[str] = None,
        img: Optional[str] = None,
        *args,
        **kwargs,
    ) -> Optional[str]:
        """Run the Gemini model.

        Args:
            task (str, optional): Textual task. Defaults to None.
            img (str, optional): Path to an image file. Defaults to None.
            *args: Forwarded to ``generate_content``.
            **kwargs: Forwarded to ``generate_content``.

        Returns:
            str: Text of the model response, or None if any error occurred
            (the error is printed rather than raised).
        """
        try:
            if img:
                # Contents are passed positionally: the SDK parameter is
                # named `contents`, so the old `content=` keyword failed.
                contents = [task, *self._load_image_parts(img)]
            else:
                contents = task
            response = self.model.generate_content(
                contents, *args, **kwargs
            )
            # Both branches return text; the text-only branch used to
            # return the raw response object.
            return response.text
        except Exception as error:
            # Best-effort contract kept from the original: report and
            # return None instead of propagating.
            print(f"Error running Gemini model: {error}")
            return None

    def process_img(
        self,
        img: Optional[str] = None,
        type: Optional[str] = "image/png",
        *args,
        **kwargs,
    ) -> List[dict]:
        """Read an image file into the payload sent to the model.

        Args:
            img (str, optional): Path to the image file. Defaults to None.
            type (str, optional): MIME type of the image.
                Defaults to "image/png".

        Raises:
            ValueError: If no image is provided.
            ValueError: If the image type is None.
            ValueError: If the instance has no Gemini API key.
            OSError: If the image file cannot be read.

        Returns:
            list[dict]: A one-element list holding a dict with the keys
            ``mime_type`` and ``data`` (the raw file bytes).
        """
        # Validation errors are raised to the caller; they used to be
        # caught and printed here, and the payload was never returned.
        if img is None:
            raise ValueError("Please provide an image to process")
        if type is None:
            raise ValueError("Please provide the image type")
        if self.gemini_api_key is None:
            raise ValueError("Please provide a Gemini API key")

        # Load the image
        return [{"mime_type": type, "data": Path(img).read_bytes()}]

    def chat(
        self,
        task: Optional[str] = None,
        img: Optional[str] = None,
        *args,
        **kwargs,
    ) -> str:
        """Send one message to the Gemini model in a new chat session.

        A new session is started on every call, so no history is carried
        over between calls.

        Args:
            task (str, optional): Text of the message. Defaults to None.
            img (str, optional): Path to an image file to attach.
                Defaults to None.
            *args: Forwarded to ``send_message``.
            **kwargs: Forwarded to ``send_message``.

        Returns:
            str: Text of the model reply (also printed).
        """
        session = self.model.start_chat()
        if img:
            # Attach the image data to the message; the file path used to
            # be sent as a second, separate text message.
            message = [task, *self._load_image_parts(img)]
        else:
            message = task
        response = session.send_message(message, *args, **kwargs)
        reply = response.text
        print(reply)
        return reply
model.model is mock_genai_model + + +# Test Gemini run method +@patch("swarms.models.gemini.Gemini.process_img") +@patch("swarms.models.gemini.genai.GenerativeModel.generate_content") +def test_gemini_run_with_img( + mock_generate_content, + mock_process_img, + mock_gemini_api_key, + mock_genai_model, +): + model = Gemini() + task = "A cat" + img = "cat.png" + response_mock = Mock(text="Generated response") + mock_generate_content.return_value = response_mock + mock_process_img.return_value = "Processed image" + + response = model.run(task=task, img=img) + + assert response == "Generated response" + mock_generate_content.assert_called_with( + content=[task, "Processed image"] + ) + mock_process_img.assert_called_with(img=img) + + +@patch("swarms.models.gemini.genai.GenerativeModel.generate_content") +def test_gemini_run_without_img( + mock_generate_content, mock_gemini_api_key, mock_genai_model +): + model = Gemini() + task = "A cat" + response_mock = Mock(text="Generated response") + mock_generate_content.return_value = response_mock + + response = model.run(task=task) + + assert response == "Generated response" + mock_generate_content.assert_called_with(task=task) + + +@patch("swarms.models.gemini.genai.GenerativeModel.generate_content") +def test_gemini_run_exception( + mock_generate_content, mock_gemini_api_key, mock_genai_model +): + model = Gemini() + task = "A cat" + mock_generate_content.side_effect = Exception("Test exception") + + response = model.run(task=task) + + assert response is None + + +# Test Gemini process_img method +def test_gemini_process_img(mock_gemini_api_key, mock_genai_model): + model = Gemini(gemini_api_key="custom-api-key") + img = "cat.png" + img_data = b"Mocked image data" + + with patch("builtins.open", create=True) as open_mock: + open_mock.return_value.__enter__.return_value.read.return_value = ( + img_data + ) + + processed_img = model.process_img(img) + + assert processed_img == [ + {"mime_type": "image/png", "data": img_data} + 
] + open_mock.assert_called_with(img, "rb") + + +# Test Gemini initialization with missing API key +def test_gemini_init_missing_api_key(): + with pytest.raises( + ValueError, match="Please provide a Gemini API key" + ): + model = Gemini(gemini_api_key=None) + + +# Test Gemini initialization with missing model name +def test_gemini_init_missing_model_name(): + with pytest.raises( + ValueError, match="Please provide a model name" + ): + model = Gemini(model_name=None) + + +# Test Gemini run method with empty task +def test_gemini_run_empty_task(mock_gemini_api_key, mock_genai_model): + model = Gemini() + task = "" + response = model.run(task=task) + assert response is None + + +# Test Gemini run method with empty image +def test_gemini_run_empty_img(mock_gemini_api_key, mock_genai_model): + model = Gemini() + task = "A cat" + img = "" + response = model.run(task=task, img=img) + assert response is None + + +# Test Gemini process_img method with missing image +def test_gemini_process_img_missing_image( + mock_gemini_api_key, mock_genai_model +): + model = Gemini() + img = None + with pytest.raises( + ValueError, match="Please provide an image to process" + ): + model.process_img(img=img) + + +# Test Gemini process_img method with missing image type +def test_gemini_process_img_missing_image_type( + mock_gemini_api_key, mock_genai_model +): + model = Gemini() + img = "cat.png" + with pytest.raises( + ValueError, match="Please provide the image type" + ): + model.process_img(img=img, type=None) + + +# Test Gemini process_img method with missing Gemini API key +def test_gemini_process_img_missing_api_key(mock_genai_model): + model = Gemini(gemini_api_key=None) + img = "cat.png" + with pytest.raises( + ValueError, match="Please provide a Gemini API key" + ): + model.process_img(img=img, type="image/png") + + +# Test Gemini run method with mocked image processing +@patch("swarms.models.gemini.genai.GenerativeModel.generate_content") 
+@patch("swarms.models.gemini.Gemini.process_img") +def test_gemini_run_mock_img_processing( + mock_process_img, + mock_generate_content, + mock_gemini_api_key, + mock_genai_model, +): + model = Gemini() + task = "A cat" + img = "cat.png" + response_mock = Mock(text="Generated response") + mock_generate_content.return_value = response_mock + mock_process_img.return_value = "Processed image" + + response = model.run(task=task, img=img) + + assert response == "Generated response" + mock_generate_content.assert_called_with( + content=[task, "Processed image"] + ) + mock_process_img.assert_called_with(img=img) + + +# Test Gemini run method with mocked image processing and exception +@patch("swarms.models.gemini.Gemini.process_img") +@patch("swarms.models.gemini.genai.GenerativeModel.generate_content") +def test_gemini_run_mock_img_processing_exception( + mock_generate_content, + mock_process_img, + mock_gemini_api_key, + mock_genai_model, +): + model = Gemini() + task = "A cat" + img = "cat.png" + mock_process_img.side_effect = Exception("Test exception") + + response = model.run(task=task, img=img) + + assert response is None + mock_generate_content.assert_not_called() + mock_process_img.assert_called_with(img=img)