core[patch]: Document embeddings namespace (#23132)

Document embeddings namespace
This commit is contained in:
Eugene Yurtsev 2024-06-19 10:11:16 -04:00 committed by GitHub
parent a3bae56a48
commit fe4f10047b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 59 additions and 8 deletions

View File

@ -6,7 +6,32 @@ from langchain_core.runnables.config import run_in_executor
class Embeddings(ABC):
"""Interface for embedding models."""
"""An interface for embedding models.
This is an interface meant for implementing text embedding models.
Text embedding models are used to map text to a vector (a point in n-dimensional
space).
Texts that are similar will usually be mapped to points that are close to each
other in this space. The exact details of what's considered "similar" and how
"distance" is measured in this space depend on the specific embedding model.
This abstraction contains a method for embedding a list of documents and a method
for embedding a query text. The embedding of a query text is expected to be a single
vector, while the embedding of a list of documents is expected to be a list of
vectors.
Usually a query is embedded the same way as a document, but the abstraction
allows the two to be treated independently.
In addition to the synchronous methods, this interface also provides asynchronous
versions of the methods.
By default, the asynchronous methods are implemented using the synchronous methods;
however, implementations may choose to override the asynchronous methods with
an async native implementation for performance reasons.
"""
@abstractmethod
def embed_documents(self, texts: List[str]) -> List[List[float]]:

View File

@ -1,3 +1,5 @@
"""Module contains a few fake embedding models for testing purposes."""
# Please do not add additional fake embedding model implementations here.
import hashlib
from typing import List
@ -6,7 +8,21 @@ from langchain_core.pydantic_v1 import BaseModel
class FakeEmbeddings(Embeddings, BaseModel):
"""Fake embedding model."""
"""Fake embedding model for unit testing purposes.
This embedding model creates embeddings by sampling from a normal distribution.
Do not use this outside of testing, as it is not a real embedding model.
Example:
.. code-block:: python
from langchain_core.embeddings import FakeEmbeddings
fake_embeddings = FakeEmbeddings(size=100)
fake_embeddings.embed_documents(["hello world", "foo bar"])
"""
size: int
"""The size of the embedding vector."""
@ -24,9 +40,21 @@ class FakeEmbeddings(Embeddings, BaseModel):
class DeterministicFakeEmbedding(Embeddings, BaseModel):
"""
Fake embedding model that always returns
the same embedding vector for the same text.
"""Deterministic fake embedding model for unit testing purposes.
This embedding model creates embeddings by sampling from a normal distribution
with a seed based on the hash of the text.
Do not use this outside of testing, as it is not a real embedding model.
Example:
.. code-block:: python
from langchain_core.embeddings import DeterministicFakeEmbedding
fake_embeddings = DeterministicFakeEmbedding(size=100)
fake_embeddings.embed_documents(["hello world", "foo bar"])
"""
size: int
@ -40,9 +68,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
return list(np.random.normal(size=self.size))
def _get_seed(self, text: str) -> int:
"""
Get a seed for the random generator, using the hash of the text.
"""
"""Get a seed for the random generator, using the hash of the text."""
return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8
def embed_documents(self, texts: List[str]) -> List[List[float]]: