pine cone vector db

Former-commit-id: a5dcc0f175
2 years ago · 95d5834666
parent 9faf2025f0
commit 95d5834666
2 changed files with 209 additions and 0 deletions
--- a/swarms/memory/vector_stores/pinecone.py
+++ b/swarms/memory/vector_stores/pinecone.py
@ -0,0 +1,197 @@
+from typing import Optional
+from swarms.memory.vector_stores.base import BaseVector
+import pinecone
+from attr import define, field
+from swarms.utils.hash import str_to_hash
+
+
+@define
+class PineconeVector(BaseVector):
+    """
+    PineconeVector is a vector storage driver that uses Pinecone as the underlying storage engine.
+
+    Pinecone is a vector database that allows you to store, search, and retrieve high-dimensional vectors with
+    blazing speed and low latency. It is a managed service that is easy to use and scales effortlessly, so you can
+    focus on building your applications instead of managing your infrastructure.
+
+    Args:
+        api_key (str): The API key for your Pinecone account.
+        index_name (str): The name of the index to use.
+        environment (str): The environment to use. Either "us-west1-gcp" or "us-east1-gcp".
+        project_name (str, optional): The name of the project to use. Defaults to None.
+        index (pinecone.Index, optional): The Pinecone index to use. Defaults to None.
+    
+    Methods:
+        upsert_vector(vector: list[float], vector_id: Optional[str] = None, namespace: Optional[str] = None, meta: Optional[dict] = None, **kwargs) -> str:
+            Upserts a vector into the index.
+        load_entry(vector_id: str, namespace: Optional[str] = None) -> Optional[BaseVector.Entry]:
+            Loads a single vector from the index.
+        load_entries(namespace: Optional[str] = None) -> list[BaseVector.Entry]:
+            Loads all vectors from the index.
+        query(query: str, count: Optional[int] = None, namespace: Optional[str] = None, include_vectors: bool = False, include_metadata=True, **kwargs) -> list[BaseVector.QueryResult]:
+            Queries the index for vectors similar to the given query string.
+        create_index(name: str, **kwargs) -> None:
+            Creates a new index.
+    
+    Usage:
+    >>> from swarms.memory.vector_stores.pinecone import PineconeVector
+    >>> from swarms.utils.embeddings import USEEmbedding
+    >>> from swarms.utils.hash import str_to_hash
+    >>> from swarms.utils.dataframe import dataframe_to_hash
+    >>> import pandas as pd
+    >>>
+    >>> # Create a new PineconeVector instance:
+    >>> pv = PineconeVector(
+    >>>     api_key="your-api-key",
+    >>>     index_name="your-index-name",
+    >>>     environment="us-west1-gcp",
+    >>>     project_name="your-project-name"
+    >>> )
+    >>> # Create a new index:
+    >>> pv.create_index("your-index-name")
+    >>> # Create a new USEEmbedding instance:
+    >>> use = USEEmbedding()
+    >>> # Create a new dataframe:
+    >>> df = pd.DataFrame({
+    >>>     "text": [
+    >>>         "This is a test",
+    >>>         "This is another test",
+    >>>         "This is a third test"
+    >>>     ]
+    >>> })
+    >>> # Embed the dataframe:
+    >>> df["embedding"] = df["text"].apply(use.embed_string)
+    >>> # Upsert the dataframe into the index:
+    >>> pv.upsert_vector(
+    >>>     vector=df["embedding"].tolist(),
+    >>>     vector_id=dataframe_to_hash(df),
+    >>>     namespace="your-namespace"
+    >>> )
+    >>> # Query the index:
+    >>> pv.query(
+    >>>     query="This is a test",
+    >>>     count=10,
+    >>>     namespace="your-namespace"
+    >>> )
+    >>> # Load a single entry from the index:
+    >>> pv.load_entry(
+    >>>     vector_id=dataframe_to_hash(df),
+    >>>     namespace="your-namespace"
+    >>> )
+    >>> # Load all entries from the index:
+    >>> pv.load_entries(
+    >>>     namespace="your-namespace"
+    >>> )
+    
+    
+    """
+    api_key: str = field(kw_only=True)
+    index_name: str = field(kw_only=True)
+    environment: str = field(kw_only=True)
+    project_name: Optional[str] = field(default=None, kw_only=True)
+    index: pinecone.Index = field(init=False)
+
+    def __attrs_post_init__(self) -> None:
+        pinecone.init(
+            api_key=self.api_key,
+            environment=self.environment,
+            project_name=self.project_name
+        )
+
+        self.index = pinecone.Index(self.index_name)
+
+    def upsert_vector(
+            self,
+            vector: list[float],
+            vector_id: Optional[str] = None,
+            namespace: Optional[str] = None,
+            meta: Optional[dict] = None,
+            **kwargs
+    ) -> str:
+        vector_id = vector_id if vector_id else str_to_hash(str(vector))
+
+        params = {
+            "namespace": namespace
+        } | kwargs
+
+        self.index.upsert([(vector_id, vector, meta)], **params)
+
+        return vector_id
+
+    def load_entry(self, vector_id: str, namespace: Optional[str] = None) -> Optional[BaseVector.Entry]:
+        result = self.index.fetch(ids=[vector_id], namespace=namespace).to_dict()
+        vectors = list(result["vectors"].values())
+
+        if len(vectors) > 0:
+            vector = vectors[0]
+
+            return BaseVector.Entry(
+                id=vector["id"],
+                meta=vector["metadata"],
+                vector=vector["values"],
+                namespace=result["namespace"]
+            )
+        else:
+            return None
+
+    def load_entries(self, namespace: Optional[str] = None) -> list[BaseVector.Entry]:
+        # This is a hacky way to query up to 10,000 values from Pinecone. Waiting on an official API for fetching
+        # all values from a namespace:
+        # https://community.pinecone.io/t/is-there-a-way-to-query-all-the-vectors-and-or-metadata-from-a-namespace/797/5
+
+        results = self.index.query(
+            self.embedding_driver.embed_string(""),
+            top_k=10000,
+            include_metadata=True,
+            namespace=namespace
+        )
+
+        return [
+            BaseVector.Entry(
+                id=r["id"],
+                vector=r["values"],
+                meta=r["metadata"],
+                namespace=results["namespace"]
+            )
+            for r in results["matches"]
+        ]
+
+    def query(
+            self,
+            query: str,
+            count: Optional[int] = None,
+            namespace: Optional[str] = None,
+            include_vectors: bool = False,
+            # PineconeVectorStorageDriver-specific params:
+            include_metadata=True,
+            **kwargs
+    ) -> list[BaseVector.QueryResult]:
+        vector = self.embedding_driver.embed_string(query)
+
+        params = {
+            "top_k": count if count else BaseVector.DEFAULT_QUERY_COUNT,
+            "namespace": namespace,
+            "include_values": include_vectors,
+            "include_metadata": include_metadata
+        } | kwargs
+
+        results = self.index.query(vector, **params)
+
+        return [
+            BaseVector.QueryResult(
+                id=r["id"],
+                vector=r["values"],
+                score=r["score"],
+                meta=r["metadata"],
+                namespace=results["namespace"]
+            )
+            for r in results["matches"]
+        ]
+
+    def create_index(self, name: str, **kwargs) -> None:
+        params = {
+            "name": name,
+            "dimension": self.embedding_driver.dimensions
+        } | kwargs
+
+        pinecone.create_index(**params)
--- a/swarms/utils/hash.py
+++ b/swarms/utils/hash.py
@ -0,0 +1,12 @@
+import pandas as pd
+import hashlib
+
+def dataframe_to_hash(dataframe: pd.DataFrame) -> str:
+    return hashlib.sha256(pd.util.hash_pandas_object(dataframe, index=True).values).hexdigest()
+
+def str_to_hash(text: str, hash_algorithm: str = "sha256") -> str:
+    m = hashlib.new(hash_algorithm)
+
+    m.update(text.encode())
+
+    return m.hexdigest()