core[patch]: Document embeddings namespace (#23132)

Document embeddings namespace
2024-11-06 03:20:49 +00:00 · 2024-06-19 10:11:16 -04:00 · 2024-06-19 10:11:16 -04:00 · fe4f10047b
commit fe4f10047b
parent a3bae56a48
2 changed files with 59 additions and 8 deletions
--- a/libs/core/langchain_core/embeddings/embeddings.py
+++ b/libs/core/langchain_core/embeddings/embeddings.py
@@ -6,7 +6,32 @@ from langchain_core.runnables.config import run_in_executor
 class Embeddings(ABC):
-    """Interface for embedding models."""
+    """An interface for embedding models.
+
+    This is an interface meant for implementing text embedding models.
+
+    Text embedding models are used to map text to a vector (a point in n-dimensional
+    space).
+
+    Texts that are similar will usually be mapped to points that are close to each
+    other in this space. The exact details of what's considered "similar" and how
+    "distance" is measured in this space are dependent on the specific embedding model.
+
+    This abstraction contains a method for embedding a list of documents and a method
+    for embedding a query text. The embedding of a query text is expected to be a single
+    vector, while the embedding of a list of documents is expected to be a list of
+    vectors.
+
+    Usually the query embedding is identical to the document embedding, but the
+    abstraction allows treating them independently.
+
+    In addition to the synchronous methods, this interface also provides asynchronous
+    versions of the methods.
+
+    By default, the asynchronous methods are implemented using the synchronous methods;
+    however, implementations may choose to override the asynchronous methods with
+    an async native implementation for performance reasons.
+    """
    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
--- a/libs/core/langchain_core/embeddings/fake.py
+++ b/libs/core/langchain_core/embeddings/fake.py
@@ -1,3 +1,5 @@
 """Module contains a few fake embedding models for testing purposes."""
+
+# Please do not add additional fake embedding model implementations here.
 import hashlib
 from typing import List
@@ -6,7 +8,21 @@ from langchain_core.pydantic_v1 import BaseModel
 class FakeEmbeddings(Embeddings, BaseModel):
-    """Fake embedding model."""
+    """Fake embedding model for unit testing purposes.
+
+    This embedding model creates embeddings by sampling from a normal distribution.
+
+    Do not use this outside of testing, as it is not a real embedding model.
+
+    Example:
+
+        .. code-block:: python
+
+            from langchain_core.embeddings import FakeEmbeddings
+
+            fake_embeddings = FakeEmbeddings(size=100)
+            fake_embeddings.embed_documents(["hello world", "foo bar"])
+    """
    size: int
    """The size of the embedding vector."""
@@ -24,9 +40,21 @@ class FakeEmbeddings(Embeddings, BaseModel):
 class DeterministicFakeEmbedding(Embeddings, BaseModel):
-    """
+    """Deterministic fake embedding model for unit testing purposes.
-    Fake embedding model that always returns
+
-    the same embedding vector for the same text.
+    This embedding model creates embeddings by sampling from a normal distribution
+    with a seed based on the hash of the text.
+
+    Do not use this outside of testing, as it is not a real embedding model.
+
+    Example:
+
+        .. code-block:: python
+
+            from langchain_core.embeddings import DeterministicFakeEmbedding
+
+            fake_embeddings = DeterministicFakeEmbedding(size=100)
+            fake_embeddings.embed_documents(["hello world", "foo bar"])
    """
    size: int
@@ -40,9 +68,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
        return list(np.random.normal(size=self.size))
    def _get_seed(self, text: str) -> int:
-        """
+        """Get a seed for the random generator, using the hash of the text."""
-        Get a seed for the random generator, using the hash of the text.
-        """
        return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8
    def embed_documents(self, texts: List[str]) -> List[List[float]]: