From fe4f10047bc67a68ce2b5b03b2b8b499031e68ba Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 19 Jun 2024 10:11:16 -0400 Subject: [PATCH] core[patch]: Document embeddings namespace (#23132) Document embeddings namespace --- .../langchain_core/embeddings/embeddings.py | 27 ++++++++++++- libs/core/langchain_core/embeddings/fake.py | 40 +++++++++++++++---- 2 files changed, 59 insertions(+), 8 deletions(-) diff --git a/libs/core/langchain_core/embeddings/embeddings.py b/libs/core/langchain_core/embeddings/embeddings.py index d48d4ecdf4..9ab9bcc407 100644 --- a/libs/core/langchain_core/embeddings/embeddings.py +++ b/libs/core/langchain_core/embeddings/embeddings.py @@ -6,7 +6,32 @@ from langchain_core.runnables.config import run_in_executor class Embeddings(ABC): - """Interface for embedding models.""" + """An interface for embedding models. + + This is an interface meant for implementing text embedding models. + + Text embedding models are used to map text to a vector (a point in n-dimensional + space). + + Texts that are similar will usually be mapped to points that are close to each + other in this space. The exact details of what's considered "similar" and how + "distance" is measured in this space are dependent on the specific embedding model. + + This abstraction contains a method for embedding a list of documents and a method + for embedding a query text. The embedding of a query text is expected to be a single + vector, while the embedding of a list of documents is expected to be a list of + vectors. + + Usually the query embedding is identical to the document embedding, but the + abstraction allows treating them independently. + + In addition to the synchronous methods, this interface also provides asynchronous + versions of the methods. 
+ + By default, the asynchronous methods are implemented using the synchronous methods; + however, implementations may choose to override the asynchronous methods with + a native async implementation for performance reasons. + """ @abstractmethod def embed_documents(self, texts: List[str]) -> List[List[float]]: diff --git a/libs/core/langchain_core/embeddings/fake.py b/libs/core/langchain_core/embeddings/fake.py index 3a9f103d2a..ccf22996c7 100644 --- a/libs/core/langchain_core/embeddings/fake.py +++ b/libs/core/langchain_core/embeddings/fake.py @@ -1,3 +1,5 @@ +"""Module contains a few fake embedding models for testing purposes.""" +# Please do not add additional fake embedding model implementations here. import hashlib from typing import List @@ -6,7 +8,21 @@ from langchain_core.pydantic_v1 import BaseModel class FakeEmbeddings(Embeddings, BaseModel): - """Fake embedding model.""" + """Fake embedding model for unit testing purposes. + + This embedding model creates embeddings by sampling from a normal distribution. + + Do not use this outside of testing, as it is not a real embedding model. + + Example: + + .. code-block:: python + + from langchain_core.embeddings import FakeEmbeddings + + fake_embeddings = FakeEmbeddings(size=100) + fake_embeddings.embed_documents(["hello world", "foo bar"]) + """ size: int """The size of the embedding vector.""" @@ -24,9 +40,21 @@ class FakeEmbeddings(Embeddings, BaseModel): class DeterministicFakeEmbedding(Embeddings, BaseModel): - """ - Fake embedding model that always returns - the same embedding vector for the same text. + """Deterministic fake embedding model for unit testing purposes. + + This embedding model creates embeddings by sampling from a normal distribution + with a seed based on the hash of the text. + + Do not use this outside of testing, as it is not a real embedding model. + + Example: + + ..
code-block:: python + + from langchain_core.embeddings import DeterministicFakeEmbedding + + fake_embeddings = DeterministicFakeEmbedding(size=100) + fake_embeddings.embed_documents(["hello world", "foo bar"]) """ size: int @@ -40,9 +68,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel): return list(np.random.normal(size=self.size)) def _get_seed(self, text: str) -> int: - """ - Get a seed for the random generator, using the hash of the text. - """ + """Get a seed for the random generator, using the hash of the text.""" return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8 def embed_documents(self, texts: List[str]) -> List[List[float]]: