# swarms/examples/memory/pinecone.py

from typing import Optional

import pinecone
from attr import define, field

from swarms.memory.base_vectordb import BaseVectorDatabase
from swarms.utils import str_to_hash


@define
class PineconeDB(BaseVectorDatabase):
    """Vector storage driver backed by a Pinecone index.

    On construction the Pinecone client is initialised with the supplied
    credentials and a handle to ``index_name`` is stored on ``self.index``.

    NOTE(review): ``query``, ``load_entries`` and ``create_index`` read
    ``self.embedding_driver`` (``embed_string`` / ``dimensions``). That
    attribute is not declared in this class; presumably it is provided by
    ``BaseVectorDatabase`` or set by the caller -- confirm.

    Args:
        api_key (str): The API key for your Pinecone account.
        index_name (str): The name of the index to use.
        environment (str): The Pinecone environment, e.g. "us-west1-gcp".
        project_name (str, optional): The name of the project to use.
            Defaults to None.

    Attributes:
        index (pinecone.Index): Handle to ``index_name``. Not a constructor
            argument; it is created in ``__attrs_post_init__``.

    Methods:
        add(vector, vector_id=None, namespace=None, meta=None, **kwargs) -> str:
            Upserts a vector into the index and returns its id.
        load_entries(namespace=None) -> list[dict]:
            Loads up to 10,000 vectors from the index.
        query(query, count=None, namespace=None, include_vectors=False,
              include_metadata=True, **kwargs) -> list[dict]:
            Queries the index for vectors similar to the given query string.
        create_index(name, **kwargs) -> None:
            Creates a new index.

    Usage:
    >>> db = PineconeDB(
    ...     api_key="your-api-key",
    ...     index_name="your-index-name",
    ...     environment="us-west1-gcp",
    ...     project_name="your-project-name",
    ... )
    >>> # Upsert a vector (the id is derived from the vector when omitted):
    >>> vector_id = db.add(
    ...     vector=[0.1, 0.2, 0.3],
    ...     namespace="your-namespace",
    ...     meta={"text": "This is a test"},
    ... )
    >>> # Query the index:
    >>> matches = db.query(
    ...     query="This is a test",
    ...     count=10,
    ...     namespace="your-namespace",
    ... )
    >>> # Load all entries from the index:
    >>> entries = db.load_entries(namespace="your-namespace")
    """

    api_key: str = field(kw_only=True)
    index_name: str = field(kw_only=True)
    environment: str = field(kw_only=True)
    project_name: Optional[str] = field(default=None, kw_only=True)
    index: pinecone.Index = field(init=False)

    def __attrs_post_init__(self) -> None:
        """Initialise the Pinecone client and open the configured index."""
        pinecone.init(
            api_key=self.api_key,
            environment=self.environment,
            project_name=self.project_name,
        )

        self.index = pinecone.Index(self.index_name)

    def add(
        self,
        vector: list[float],
        vector_id: Optional[str] = None,
        namespace: Optional[str] = None,
        meta: Optional[dict] = None,
        **kwargs,
    ) -> str:
        """Upsert a single vector into the index.

        Args:
            vector (list[float]): The embedding to store.
            vector_id (Optional[str], optional): Id to store the vector under.
                When falsy, an id is derived by hashing ``str(vector)``.
            namespace (Optional[str], optional): Namespace to upsert into.
                Defaults to None.
            meta (Optional[dict], optional): Metadata stored alongside the
                vector. Defaults to None.
            **kwargs: Extra keyword arguments forwarded to ``index.upsert``;
                they override ``namespace`` on key collision.

        Returns:
            str: The id the vector was stored under.
        """
        vector_id = vector_id or str_to_hash(str(vector))

        params = {"namespace": namespace} | kwargs

        self.index.upsert([(vector_id, vector, meta)], **params)

        return vector_id

    def load_entries(self, namespace: Optional[str] = None) -> list[dict]:
        """Load all entries (up to 10,000) from the index.

        Args:
            namespace (Optional[str], optional): Namespace to read from.
                Defaults to None.

        Returns:
            list[dict]: One dict per match with the keys ``id``, ``vector``,
            ``meta`` and ``namespace``. Empty when there are no matches.
        """
        # This is a hacky way to query up to 10,000 values from Pinecone. Waiting on an official API for fetching
        # all values from a namespace:
        # https://community.pinecone.io/t/is-there-a-way-to-query-all-the-vectors-and-or-metadata-from-a-namespace/797/5

        results = self.index.query(
            self.embedding_driver.embed_string(""),
            top_k=10000,
            include_metadata=True,
            namespace=namespace,
        )

        # Collect every match instead of returning from inside the loop
        # (which only ever yielded the first one). The namespace is a
        # property of the response as a whole, not of each match.
        return [
            {
                "id": match["id"],
                "vector": match["values"],
                "meta": match["metadata"],
                "namespace": results["namespace"],
            }
            for match in results["matches"]
        ]

    def query(
        self,
        query: str,
        count: Optional[int] = None,
        namespace: Optional[str] = None,
        include_vectors: bool = False,
        # PineconeDBStorageDriver-specific params:
        include_metadata=True,
        **kwargs,
    ) -> list[dict]:
        """Query the index for vectors similar to the given query string.

        Args:
            query (str): Text to embed and search with.
            count (Optional[int], optional): Passed to Pinecone as ``top_k``.
                Defaults to None.
            namespace (Optional[str], optional): Namespace to search.
                Defaults to None.
            include_vectors (bool, optional): Passed to Pinecone as
                ``include_values``. Defaults to False.
            include_metadata (bool, optional): Whether to request metadata
                for each match. Defaults to True.
            **kwargs: Extra keyword arguments forwarded to ``index.query``;
                they override the parameters above on key collision.

        Returns:
            list[dict]: One dict per match with the keys ``id``, ``vector``,
            ``score``, ``meta`` and ``namespace``. Empty when there are no
            matches.
        """
        vector = self.embedding_driver.embed_string(query)

        params = {
            "top_k": count,
            "namespace": namespace,
            "include_values": include_vectors,
            "include_metadata": include_metadata,
        } | kwargs

        results = self.index.query(vector, **params)

        # Per-match fields come from the match itself (the original indexed
        # the response object and used the key "scores"); the namespace is
        # reported once on the response.
        return [
            {
                "id": match["id"],
                "vector": match["values"],
                "score": match["score"],
                "meta": match["metadata"],
                "namespace": results["namespace"],
            }
            for match in results["matches"]
        ]

    def create_index(self, name: str, **kwargs) -> None:
        """Create a new Pinecone index.

        The index dimension is taken from ``self.embedding_driver.dimensions``
        unless overridden through ``kwargs``.

        Args:
            name (str): Name of the index to create.
            **kwargs: Extra keyword arguments forwarded to
                ``pinecone.create_index``; they override ``name`` and
                ``dimension`` on key collision.
        """
        params = {
            "name": name,
            "dimension": self.embedding_driver.dimensions,
        } | kwargs

        pinecone.create_index(**params)
[5.4.8] 5 months ago			`from typing import Optional`

			`import pinecone`
			`from attr import define, field`

			`from swarms.memory.base_vectordb import BaseVectorDatabase`
			`from swarms.utils import str_to_hash`


			`@define`
			`class PineconeDB(BaseVectorDatabase):`
			`"""`
			`PineconeDB is a vector storage driver that uses Pinecone as the underlying storage engine.`

			`Pinecone is a vector database that allows you to store, search, and retrieve high-dimensional vectors with`
			`blazing speed and low latency. It is a managed service that is easy to use and scales effortlessly, so you can`
			`focus on building your applications instead of managing your infrastructure.`

			`Args:`
			`api_key (str): The API key for your Pinecone account.`
			`index_name (str): The name of the index to use.`
			`environment (str): The environment to use. Either "us-west1-gcp" or "us-east1-gcp".`
			`project_name (str, optional): The name of the project to use. Defaults to None.`
			`index (pinecone.Index, optional): The Pinecone index to use. Defaults to None.`

			`Methods:`
			`upsert_vector(vector: list[float], vector_id: Optional[str] = None, namespace: Optional[str] = None, meta: Optional[dict] = None, **kwargs) -> str:`
			`Upserts a vector into the index.`
			`load_entry(vector_id: str, namespace: Optional[str] = None) -> Optional[BaseVectorStore.Entry]:`
			`Loads a single vector from the index.`
			`load_entries(namespace: Optional[str] = None) -> list[BaseVectorStore.Entry]:`
			`Loads all vectors from the index.`
			`query(query: str, count: Optional[int] = None, namespace: Optional[str] = None, include_vectors: bool = False, include_metadata=True, **kwargs) -> list[BaseVectorStore.QueryResult]:`
			`Queries the index for vectors similar to the given query string.`
			`create_index(name: str, **kwargs) -> None:`
			`Creates a new index.`

			`Usage:`
			`>>> from swarms.memory.vector_stores.pinecone import PineconeDB`
			`>>> from swarms.utils.embeddings import USEEmbedding`
			`>>> from swarms.utils.hash import str_to_hash`
			`>>> from swarms.utils.dataframe import dataframe_to_hash`
			`>>> import pandas as pd`
			`>>>`
			`>>> # Create a new PineconeDB instance:`
			`>>> pv = PineconeDB(`
			`>>> api_key="your-api-key",`
			`>>> index_name="your-index-name",`
			`>>> environment="us-west1-gcp",`
			`>>> project_name="your-project-name"`
			`>>> )`
			`>>> # Create a new index:`
			`>>> pv.create_index("your-index-name")`
			`>>> # Create a new USEEmbedding instance:`
			`>>> use = USEEmbedding()`
			`>>> # Create a new dataframe:`
			`>>> df = pd.DataFrame({`
			`>>> "text": [`
			`>>> "This is a test",`
			`>>> "This is another test",`
			`>>> "This is a third test"`
			`>>> ]`
			`>>> })`
			`>>> # Embed the dataframe:`
			`>>> df["embedding"] = df["text"].apply(use.embed_string)`
			`>>> # Upsert the dataframe into the index:`
			`>>> pv.upsert_vector(`
			`>>> vector=df["embedding"].tolist(),`
			`>>> vector_id=dataframe_to_hash(df),`
			`>>> namespace="your-namespace"`
			`>>> )`
			`>>> # Query the index:`
			`>>> pv.query(`
			`>>> query="This is a test",`
			`>>> count=10,`
			`>>> namespace="your-namespace"`
			`>>> )`
			`>>> # Load a single entry from the index:`
			`>>> pv.load_entry(`
			`>>> vector_id=dataframe_to_hash(df),`
			`>>> namespace="your-namespace"`
			`>>> )`
			`>>> # Load all entries from the index:`
			`>>> pv.load_entries(`
			`>>> namespace="your-namespace"`
			`>>> )`


			`"""`

			`api_key: str = field(kw_only=True)`
			`index_name: str = field(kw_only=True)`
			`environment: str = field(kw_only=True)`
			`project_name: Optional[str] = field(default=None, kw_only=True)`
			`index: pinecone.Index = field(init=False)`

			`def __attrs_post_init__(self) -> None:`
			`"""Post init"""`
			`pinecone.init(`
			`api_key=self.api_key,`
			`environment=self.environment,`
			`project_name=self.project_name,`
			`)`

			`self.index = pinecone.Index(self.index_name)`

			`def add(`
			`self,`
			`vector: list[float],`
			`vector_id: Optional[str] = None,`
			`namespace: Optional[str] = None,`
			`meta: Optional[dict] = None,`
			`**kwargs,`
			`) -> str:`
			`"""Add a vector to the index.`

			`Args:`
			`vector (list[float]): _description_`
			`vector_id (Optional[str], optional): _description_. Defaults to None.`
			`namespace (Optional[str], optional): _description_. Defaults to None.`
			`meta (Optional[dict], optional): _description_. Defaults to None.`

			`Returns:`
			`str: _description_`
			`"""`
			`vector_id = vector_id if vector_id else str_to_hash(str(vector))`

			`params = {"namespace": namespace} \| kwargs`

			`self.index.upsert([(vector_id, vector, meta)], **params)`

			`return vector_id`

			`def load_entries(self, namespace: Optional[str] = None):`
			`"""Load all entries from the index.`

			`Args:`
			`namespace (Optional[str], optional): _description_. Defaults to None.`

			`Returns:`
			`_type_: _description_`
			`"""`
			`# This is a hacky way to query up to 10,000 values from Pinecone. Waiting on an official API for fetching`
			`# all values from a namespace:`
			`# https://community.pinecone.io/t/is-there-a-way-to-query-all-the-vectors-and-or-metadata-from-a-namespace/797/5`

			`results = self.index.query(`
			`self.embedding_driver.embed_string(""),`
			`top_k=10000,`
			`include_metadata=True,`
			`namespace=namespace,`
			`)`

			`for result in results["matches"]:`
			`entry = {`
			`"id": result["id"],`
			`"vector": result["values"],`
			`"meta": result["metadata"],`
			`"namespace": result["namespace"],`
			`}`
			`return entry`

			`def query(`
			`self,`
			`query: str,`
			`count: Optional[int] = None,`
			`namespace: Optional[str] = None,`
			`include_vectors: bool = False,`
			`# PineconeDBStorageDriver-specific params:`
			`include_metadata=True,`
			`**kwargs,`
			`):`
			`"""Query the index for vectors similar to the given query string.`

			`Args:`
			`query (str): _description_`
			`count (Optional[int], optional): _description_. Defaults to None.`
			`namespace (Optional[str], optional): _description_. Defaults to None.`
			`include_vectors (bool, optional): _description_. Defaults to False.`
			`include_metadata (bool, optional): _description_. Defaults to True.`

			`Returns:`
			`_type_: _description_`
			`"""`
			`vector = self.embedding_driver.embed_string(query)`

			`params = {`
			`"top_k": count,`
			`"namespace": namespace,`
			`"include_values": include_vectors,`
			`"include_metadata": include_metadata,`
			`} \| kwargs`

			`results = self.index.query(vector, **params)`

			`for r in results["matches"]:`
			`entry = {`
			`"id": results["id"],`
			`"vector": results["values"],`
			`"score": results["scores"],`
			`"meta": results["metadata"],`
			`"namespace": results["namespace"],`
			`}`
			`return entry`

			`def create_index(self, name: str, **kwargs) -> None:`
			`"""Create a new index.`

			`Args:`
			`name (str): _description_`
			`"""`
			`params = {`
			`"name": name,`
			`"dimension": self.embedding_driver.dimensions,`
			`} \| kwargs`

			`pinecone.create_index(**params)`