core[patch]: Document embeddings namespace (#23132)

Document embeddings namespace
3 months ago · fe4f10047b
parent a3bae56a48
commit fe4f10047b
2 changed files with 59 additions and 8 deletions
--- a/libs/core/langchain_core/embeddings/embeddings.py
+++ b/libs/core/langchain_core/embeddings/embeddings.py
@@ -6,7 +6,32 @@ from langchain_core.runnables.config import run_in_executor


 class Embeddings(ABC):
-    """Interface for embedding models."""
+    """An interface for embedding models.
+
+    This is an interface meant for implementing text embedding models.
+
+    Text embedding models are used to map text to a vector (a point in n-dimensional
+    space).
+
+    Texts that are similar will usually be mapped to points that are close to each
+    other in this space. The exact details of what's considered "similar" and how
+    "distance" is measured in this space are dependent on the specific embedding model.
+
+    This abstraction contains a method for embedding a list of documents and a method
+    for embedding a query text. The embedding of a query text is expected to be a single
+    vector, while the embedding of a list of documents is expected to be a list of
+    vectors.
+
+    Usually the query embedding is identical to the document embedding, but the
+    abstraction allows treating them independently.
+
+    In addition to the synchronous methods, this interface also provides asynchronous
+    versions of the methods.
+
+    By default, the asynchronous methods are implemented using the synchronous methods;
+    however, implementations may choose to override the asynchronous methods with
+    an async native implementation for performance reasons.
+    """

    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
--- a/libs/core/langchain_core/embeddings/fake.py
+++ b/libs/core/langchain_core/embeddings/fake.py
@@ -1,3 +1,5 @@
+"""Module contains a few fake embedding models for testing purposes."""
+# Please do not add additional fake embedding model implementations here.
 import hashlib
 from typing import List

@@ -6,7 +8,21 @@ from langchain_core.pydantic_v1 import BaseModel


 class FakeEmbeddings(Embeddings, BaseModel):
-    """Fake embedding model."""
+    """Fake embedding model for unit testing purposes.
+
+    This embedding model creates embeddings by sampling from a normal distribution.
+
+    Do not use this outside of testing, as it is not a real embedding model.
+
+    Example:
+
+        .. code-block:: python
+
+            from langchain_core.embeddings import FakeEmbeddings
+
+            fake_embeddings = FakeEmbeddings(size=100)
+            fake_embeddings.embed_documents(["hello world", "foo bar"])
+    """

    size: int
    """The size of the embedding vector."""
@@ -24,9 +40,21 @@ class FakeEmbeddings(Embeddings, BaseModel):


 class DeterministicFakeEmbedding(Embeddings, BaseModel):
-    """
-    Fake embedding model that always returns
-    the same embedding vector for the same text.
+    """Deterministic fake embedding model for unit testing purposes.
+
+    This embedding model creates embeddings by sampling from a normal distribution
+    with a seed based on the hash of the text.
+
+    Do not use this outside of testing, as it is not a real embedding model.
+
+    Example:
+
+        .. code-block:: python
+
+            from langchain_core.embeddings import DeterministicFakeEmbedding
+
+            fake_embeddings = DeterministicFakeEmbedding(size=100)
+            fake_embeddings.embed_documents(["hello world", "foo bar"])
    """

    size: int
@@ -40,9 +68,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
        return list(np.random.normal(size=self.size))

    def _get_seed(self, text: str) -> int:
-        """
-        Get a seed for the random generator, using the hash of the text.
-        """
+        """Get a seed for the random generator, using the hash of the text."""
        return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8

    def embed_documents(self, texts: List[str]) -> List[List[float]]: