From fe4f10047bc67a68ce2b5b03b2b8b499031e68ba Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev <eyurtsev@gmail.com>
Date: Wed, 19 Jun 2024 10:11:16 -0400
Subject: [PATCH] core[patch]: Document embeddings namespace (#23132)

Document embeddings namespace
---
 .../langchain_core/embeddings/embeddings.py   | 27 ++++++++++++-
 libs/core/langchain_core/embeddings/fake.py   | 40 +++++++++++++++----
 2 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/libs/core/langchain_core/embeddings/embeddings.py b/libs/core/langchain_core/embeddings/embeddings.py
index d48d4ecdf4..9ab9bcc407 100644
--- a/libs/core/langchain_core/embeddings/embeddings.py
+++ b/libs/core/langchain_core/embeddings/embeddings.py
@@ -6,7 +6,32 @@ from langchain_core.runnables.config import run_in_executor
 
 
 class Embeddings(ABC):
-    """Interface for embedding models."""
+    """An interface for embedding models.
+
+    This is an interface meant for implementing text embedding models.
+
+    Text embedding models are used to map text to a vector (a point in n-dimensional
+    space).
+
+    Texts that are similar will usually be mapped to points that are close to each
+    other in this space. The exact details of what's considered "similar" and how
+    "distance" is measured in this space are dependent on the specific embedding model.
+
+    This abstraction contains a method for embedding a list of documents and a method
+    for embedding a query text. The embedding of a query text is expected to be a single
+    vector, while the embedding of a list of documents is expected to be a list of
+    vectors.
+
+    Usually a query is embedded the same way as a document, but the
+    abstraction allows treating them independently.
+
+    In addition to the synchronous methods, this interface also provides asynchronous
+    versions of the methods.
+
+    By default, the asynchronous methods are implemented using the synchronous methods;
+    however, implementations may choose to override the asynchronous methods with
+    a native async implementation for performance reasons.
+    """
 
     @abstractmethod
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
diff --git a/libs/core/langchain_core/embeddings/fake.py b/libs/core/langchain_core/embeddings/fake.py
index 3a9f103d2a..ccf22996c7 100644
--- a/libs/core/langchain_core/embeddings/fake.py
+++ b/libs/core/langchain_core/embeddings/fake.py
@@ -1,3 +1,5 @@
+"""Module contains a few fake embedding models for testing purposes."""
+# Please do not add additional fake embedding model implementations here.
 import hashlib
 from typing import List
 
@@ -6,7 +8,21 @@ from langchain_core.pydantic_v1 import BaseModel
 
 
 class FakeEmbeddings(Embeddings, BaseModel):
-    """Fake embedding model."""
+    """Fake embedding model for unit testing purposes.
+
+    This embedding model creates embeddings by sampling from a normal distribution.
+
+    Do not use this outside of testing, as it is not a real embedding model.
+
+    Example:
+
+        .. code-block:: python
+
+            from langchain_core.embeddings import FakeEmbeddings
+
+            fake_embeddings = FakeEmbeddings(size=100)
+            fake_embeddings.embed_documents(["hello world", "foo bar"])
+    """
 
     size: int
     """The size of the embedding vector."""
@@ -24,9 +40,21 @@ class FakeEmbeddings(Embeddings, BaseModel):
 
 
 class DeterministicFakeEmbedding(Embeddings, BaseModel):
-    """
-    Fake embedding model that always returns
-    the same embedding vector for the same text.
+    """Deterministic fake embedding model for unit testing purposes.
+
+    This embedding model creates embeddings by sampling from a normal distribution
+    with a seed based on the hash of the text.
+
+    Do not use this outside of testing, as it is not a real embedding model.
+
+    Example:
+
+        .. code-block:: python
+
+            from langchain_core.embeddings import DeterministicFakeEmbedding
+
+            fake_embeddings = DeterministicFakeEmbedding(size=100)
+            fake_embeddings.embed_documents(["hello world", "foo bar"])
     """
 
     size: int
@@ -40,9 +68,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
         return list(np.random.normal(size=self.size))
 
     def _get_seed(self, text: str) -> int:
-        """
-        Get a seed for the random generator, using the hash of the text.
-        """
+        """Get a seed for the random generator, using the hash of the text."""
         return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8
 
     def embed_documents(self, texts: List[str]) -> List[List[float]]: