from __future__ import annotations

import asyncio
from abc import ABC, abstractmethod
from functools import partial
from typing import Any, Literal, Sequence

from langchain.load.serializable import Serializable
from pydantic import Field


class Document(Serializable):
    """A piece of text together with the metadata that describes it."""

    page_content: str
    """The text content of the document."""
    metadata: dict = Field(default_factory=dict)
    """Free-form metadata about the page content, e.g. its source or its
    relationships to other documents. Defaults to a fresh empty dict.
    """
    type: Literal["Document"] = "Document"

    @classmethod
    def is_lc_serializable(cls) -> bool:
        """Report that this class is serializable (always ``True``)."""
        return True


class BaseDocumentTransformer(ABC):
    """Abstract interface for components that transform documents.

    A transformer receives a sequence of Documents and hands back a sequence
    of transformed Documents. Subclasses must implement
    ``transform_documents``; ``atransform_documents`` has a default
    implementation that runs the synchronous method in an executor.

    Example:
        .. code-block:: python

            class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                embeddings: Embeddings
                similarity_fn: Callable = cosine_similarity
                similarity_threshold: float = 0.95

                class Config:
                    arbitrary_types_allowed = True

                def transform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    stateful_documents = get_stateful_documents(documents)
                    embedded_documents = _get_embeddings_from_stateful_docs(
                        self.embeddings, stateful_documents
                    )
                    included_idxs = _filter_similar_embeddings(
                        embedded_documents, self.similarity_fn, self.similarity_threshold
                    )
                    return [stateful_documents[i] for i in sorted(included_idxs)]

                async def atransform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    raise NotImplementedError

    """  # noqa: E501

    @abstractmethod
    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform a sequence of documents.

        Args:
            documents: The Documents to transform.
            **kwargs: Extra keyword arguments for the concrete implementation.

        Returns:
            The transformed Documents.
        """

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform a sequence of documents without blocking the event loop.

        The synchronous ``transform_documents`` is submitted to the running
        loop's default executor and its result is awaited.

        Args:
            documents: The Documents to transform.
            **kwargs: Extra keyword arguments forwarded to
                ``transform_documents``.

        Returns:
            The transformed Documents.
        """
        loop = asyncio.get_running_loop()
        # run_in_executor only forwards positional arguments, so the keyword
        # arguments are bound up front with partial.
        bound_transform = partial(self.transform_documents, **kwargs)
        # Passing None selects the loop's default executor.
        return await loop.run_in_executor(None, bound_transform, documents)