from typing import Any, Optional

import pinecone
from attr import define, field

from swarms.memory.base_vectordb import BaseVectorDatabase
from swarms.utils import str_to_hash


@define
class PineconeDB(BaseVectorDatabase):
    """
    PineconeDB is a vector storage driver that uses Pinecone as the underlying storage engine.

    Pinecone is a vector database that allows you to store, search, and retrieve high-dimensional
    vectors with blazing speed and low latency. It is a managed service that is easy to use and
    scales effortlessly, so you can focus on building your applications instead of managing your
    infrastructure.

    Args:
        api_key (str): The API key for your Pinecone account.
        index_name (str): The name of the index to use.
        environment (str): The environment to use. Either "us-west1-gcp" or "us-east1-gcp".
        project_name (str, optional): The name of the project to use. Defaults to None.
        embedding_driver (Any, optional): The embedding driver used to embed query strings.
            Must expose ``embed_string`` and ``dimensions``. Defaults to None.
        index (pinecone.Index): The Pinecone index handle. Created automatically after
            initialization.

    Methods:
        add(vector: list[float], vector_id: Optional[str] = None, namespace: Optional[str] = None, meta: Optional[dict] = None, **kwargs) -> str:
            Upserts a vector into the index and returns its id.
        load_entries(namespace: Optional[str] = None) -> list[dict]:
            Loads all vectors from the index.
        query(query: str, count: Optional[int] = None, namespace: Optional[str] = None, include_vectors: bool = False, include_metadata=True, **kwargs) -> list[dict]:
            Queries the index for vectors similar to the given query string.
        create_index(name: str, **kwargs) -> None:
            Creates a new index.

    Usage:
    >>> from swarms.memory.vector_stores.pinecone import PineconeDB
    >>> from swarms.utils.embeddings import USEEmbedding
    >>> from swarms.utils.hash import str_to_hash
    >>> from swarms.utils.dataframe import dataframe_to_hash
    >>> import pandas as pd
    >>>
    >>> # Create a new USEEmbedding instance:
    >>> use = USEEmbedding()
    >>> # Create a new PineconeDB instance:
    >>> pv = PineconeDB(
    >>>     api_key="your-api-key",
    >>>     index_name="your-index-name",
    >>>     environment="us-west1-gcp",
    >>>     project_name="your-project-name",
    >>>     embedding_driver=use,
    >>> )
    >>> # Create a new index:
    >>> pv.create_index("your-index-name")
    >>> # Create a new dataframe:
    >>> df = pd.DataFrame({
    >>>     "text": [
    >>>         "This is a test",
    >>>         "This is another test",
    >>>         "This is a third test"
    >>>     ]
    >>> })
    >>> # Embed the dataframe:
    >>> df["embedding"] = df["text"].apply(use.embed_string)
    >>> # Upsert the dataframe into the index:
    >>> pv.add(
    >>>     vector=df["embedding"].tolist(),
    >>>     vector_id=dataframe_to_hash(df),
    >>>     namespace="your-namespace"
    >>> )
    >>> # Query the index:
    >>> pv.query(
    >>>     query="This is a test",
    >>>     count=10,
    >>>     namespace="your-namespace"
    >>> )
    >>> # Load all entries from the index:
    >>> pv.load_entries(
    >>>     namespace="your-namespace"
    >>> )
    """

    api_key: str = field(kw_only=True)
    index_name: str = field(kw_only=True)
    environment: str = field(kw_only=True)
    project_name: Optional[str] = field(default=None, kw_only=True)
    embedding_driver: Optional[Any] = field(default=None, kw_only=True)
    index: pinecone.Index = field(init=False)

    def __attrs_post_init__(self) -> None:
        """Initialize the Pinecone client and bind the configured index."""
        pinecone.init(
            api_key=self.api_key,
            environment=self.environment,
            project_name=self.project_name,
        )
        self.index = pinecone.Index(self.index_name)

    def add(
        self,
        vector: list[float],
        vector_id: Optional[str] = None,
        namespace: Optional[str] = None,
        meta: Optional[dict] = None,
        **kwargs,
    ) -> str:
        """Add a vector to the index.

        Args:
            vector (list[float]): The vector to upsert.
            vector_id (Optional[str], optional): The id to store the vector under.
                Defaults to a hash of the vector.
            namespace (Optional[str], optional): The namespace to upsert into. Defaults to None.
            meta (Optional[dict], optional): Metadata to store alongside the vector. Defaults to None.

        Returns:
            str: The id of the upserted vector.
        """
        vector_id = vector_id if vector_id else str_to_hash(str(vector))
        params = {"namespace": namespace} | kwargs
        self.index.upsert([(vector_id, vector, meta)], **params)
        return vector_id

    def load_entries(self, namespace: Optional[str] = None):
        """Load all entries from the index.

        Args:
            namespace (Optional[str], optional): The namespace to load from. Defaults to None.

        Returns:
            list[dict]: One dict per match, with "id", "vector", "meta", and "namespace" keys.
        """
        # This is a hacky way to query up to 10,000 values from Pinecone. Waiting on an official API for fetching
        # all values from a namespace:
        # https://community.pinecone.io/t/is-there-a-way-to-query-all-the-vectors-and-or-metadata-from-a-namespace/797/5
        results = self.index.query(
            self.embedding_driver.embed_string(""),
            top_k=10000,
            include_metadata=True,
            namespace=namespace,
        )

        return [
            {
                "id": result["id"],
                "vector": result["values"],
                "meta": result["metadata"],
                "namespace": result["namespace"],
            }
            for result in results["matches"]
        ]

    def query(
        self,
        query: str,
        count: Optional[int] = None,
        namespace: Optional[str] = None,
        include_vectors: bool = False,
        # PineconeDBStorageDriver-specific params:
        include_metadata=True,
        **kwargs,
    ):
        """Query the index for vectors similar to the given query string.

        Args:
            query (str): The query string to embed and search for.
            count (Optional[int], optional): The number of results to return. Defaults to None.
            namespace (Optional[str], optional): The namespace to query. Defaults to None.
            include_vectors (bool, optional): Whether to include vector values in the results. Defaults to False.
            include_metadata (bool, optional): Whether to include metadata in the results. Defaults to True.

        Returns:
            list[dict]: One dict per match, with "id", "vector", "score", "meta", and "namespace" keys.
        """
        vector = self.embedding_driver.embed_string(query)
        params = {
            "top_k": count,
            "namespace": namespace,
            "include_values": include_vectors,
            "include_metadata": include_metadata,
        } | kwargs
        results = self.index.query(vector, **params)

        return [
            {
                "id": r["id"],
                "vector": r["values"],
                "score": r["score"],
                "meta": r["metadata"],
                "namespace": r["namespace"],
            }
            for r in results["matches"]
        ]

    def create_index(self, name: str, **kwargs) -> None:
        """Create a new index.

        Args:
            name (str): The name of the index to create.
        """
        params = {
            "name": name,
            "dimension": self.embedding_driver.dimensions,
        } | kwargs

        pinecone.create_index(**params)
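

# The block below is a minimal, hypothetical smoke test for this driver, not part of the swarms
# API. It assumes a valid Pinecone API key and an existing index; the placeholder credentials and
# the stub embedding driver (a stand-in for something like USEEmbedding) are illustrative only.
if __name__ == "__main__":

    class StubEmbeddingDriver:
        """Toy embedding driver exposing the two attributes PineconeDB relies on."""

        dimensions = 3

        def embed_string(self, text: str) -> list[float]:
            # Deterministic toy embedding; swap in a real model in practice.
            return [float(len(text)), float(text.count(" ")), 1.0]

    db = PineconeDB(
        api_key="your-api-key",  # placeholder
        index_name="your-index-name",  # placeholder
        environment="us-west1-gcp",
        embedding_driver=StubEmbeddingDriver(),
    )

    # Upsert one vector, then query it back and dump the namespace.
    vec_id = db.add(vector=[0.1, 0.2, 0.3], meta={"text": "This is a test"})
    print(db.query("This is a test", count=1))
    print(db.load_entries())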