mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
core[patch]: Deprecating beta upsert APIs in vectorstore (#25069)
This PR deprecates the beta upsert APIs in vectorstore. We'll introduce them in a V2 abstraction instead, to keep the existing vectorstore implementations lighter weight. The main problem with the existing APIs is that they make it harder to implement the correct behavior with respect to IDs, since an ID can be present both in the function signature and as an optional attribute on the document object. That said, VectorStores that pass the standard tests should have implemented the semantics properly! --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
ca9dcee940
commit
6dd9f053e3
@ -460,7 +460,45 @@ class ApertureDB(VectorStore):
|
||||
assert db.last_query_ok(), response
|
||||
return response[0]["FindDescriptorSet"]["entities"]
|
||||
|
||||
@override
def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
    """Add or update documents in the vectorstore.

    Args:
        documents: Documents to add to the vectorstore.
        kwargs: Additional keyword arguments.
            If kwargs contains ``ids`` and the documents also carry ids,
            the ids in the kwargs take precedence. A missing, ``None`` or
            empty ``ids`` value means the documents are forwarded unchanged.
            ``ids`` is always removed from kwargs before calling ``upsert``.

    Returns:
        The ``"succeeded"`` entry of the response returned by ``upsert``.

    Raises:
        ValueError: If the number of ids does not match the number of documents.
    """
    # Pop unconditionally so ``ids`` is never forwarded to ``upsert``.
    ids = kwargs.pop("ids", None)

    if ids:
        if len(ids) != len(documents):
            raise ValueError(
                "The number of ids must match the number of documents. "
                f"Got {len(ids)} ids and {len(documents)} documents."
            )
        # Build new Document objects rather than assigning to ``document.id``
        # so that the caller's documents are left untouched.
        documents_ = [
            Document(
                page_content=document.page_content,
                metadata=document.metadata,
                id=id_,
            )
            for id_, document in zip(ids, documents)
        ]
    else:
        # No usable ids were supplied: rely on whatever ids the documents carry.
        documents_ = documents

    # Delegate the actual write to this class's ``upsert`` implementation.
    return self.upsert(documents_, **kwargs)["succeeded"]
|
||||
|
||||
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
|
||||
"""Insert or update items
|
||||
|
||||
|
@ -29,30 +29,23 @@ from itertools import cycle
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
AsyncIterable,
|
||||
AsyncIterator,
|
||||
Callable,
|
||||
ClassVar,
|
||||
Collection,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.pydantic_v1 import Field, root_validator
|
||||
from langchain_core.retrievers import BaseRetriever
|
||||
from langchain_core.runnables.config import run_in_executor
|
||||
from langchain_core.utils.aiter import abatch_iterate
|
||||
from langchain_core.utils.iter import batch_iterate
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain_core.callbacks.manager import (
|
||||
@ -60,7 +53,6 @@ if TYPE_CHECKING:
|
||||
CallbackManagerForRetrieverRun,
|
||||
)
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -96,7 +88,7 @@ class VectorStore(ABC):
|
||||
ValueError: If the number of metadatas does not match the number of texts.
|
||||
ValueError: If the number of ids does not match the number of texts.
|
||||
"""
|
||||
if type(self).upsert != VectorStore.upsert:
|
||||
if type(self).add_documents != VectorStore.add_documents:
|
||||
# Import document in local scope to avoid circular imports
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -109,190 +101,19 @@ class VectorStore(ABC):
|
||||
if metadatas and len(metadatas) != len(texts_):
|
||||
raise ValueError(
|
||||
"The number of metadatas must match the number of texts."
|
||||
"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||
f"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||
)
|
||||
|
||||
if "ids" in kwargs:
|
||||
ids = kwargs.pop("ids")
|
||||
if ids and len(ids) != len(texts_):
|
||||
raise ValueError(
|
||||
"The number of ids must match the number of texts."
|
||||
"Got {len(ids)} ids and {len(texts_)} texts."
|
||||
)
|
||||
else:
|
||||
ids = None
|
||||
|
||||
metadatas_ = iter(metadatas) if metadatas else cycle([{}])
|
||||
ids_: Iterable[Union[str, None]] = ids if ids is not None else cycle([None])
|
||||
docs = [
|
||||
Document(page_content=text, metadata=metadata_, id=id_)
|
||||
for text, metadata_, id_ in zip(texts, metadatas_, ids_)
|
||||
Document(page_content=text, metadata=metadata_)
|
||||
for text, metadata_ in zip(texts, metadatas_)
|
||||
]
|
||||
upsert_response = self.upsert(docs, **kwargs)
|
||||
return upsert_response["succeeded"]
|
||||
|
||||
return self.add_documents(docs, **kwargs)
|
||||
raise NotImplementedError(
|
||||
f"`add_texts` has not been implemented for {self.__class__.__name__} "
|
||||
)
|
||||
|
||||
# Developer guidelines:
|
||||
# Do not override streaming_upsert!
|
||||
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||
def streaming_upsert(
|
||||
self, items: Iterable[Document], /, batch_size: int, **kwargs: Any
|
||||
) -> Iterator[UpsertResponse]:
|
||||
"""Upsert documents in a streaming fashion.
|
||||
|
||||
Args:
|
||||
items: Iterable of Documents to add to the vectorstore.
|
||||
batch_size: The size of each batch to upsert.
|
||||
kwargs: Additional keyword arguments.
|
||||
kwargs should only include parameters that are common to all
|
||||
documents. (e.g., timeout for indexing, retry policy, etc.)
|
||||
kwargs should not include ids to avoid ambiguous semantics.
|
||||
Instead, the ID should be provided as part of the Document object.
|
||||
|
||||
Yields:
|
||||
UpsertResponse: A response object that contains the list of IDs that were
|
||||
successfully added or updated in the vectorstore and the list of IDs that
|
||||
failed to be added or updated.
|
||||
|
||||
.. versionadded:: 0.2.11
|
||||
"""
|
||||
# The default implementation of this method breaks the input into
|
||||
# batches of size `batch_size` and calls the `upsert` method on each batch.
|
||||
# Subclasses can override this method to provide a more efficient
|
||||
# implementation.
|
||||
for item_batch in batch_iterate(batch_size, items):
|
||||
yield self.upsert(item_batch, **kwargs)
|
||||
|
||||
# Please note that we've added a new method `upsert` instead of re-using the
|
||||
# existing `add_documents` method.
|
||||
# This was done to resolve potential ambiguities around the behavior of **kwargs
|
||||
# in existing add_documents / add_texts methods which could include per document
|
||||
# information (e.g., the `ids` parameter).
|
||||
# Over time the `add_documents` could be denoted as legacy and deprecated
|
||||
# in favor of the `upsert` method.
|
||||
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
|
||||
"""Add or update documents in the vectorstore.
|
||||
|
||||
The upsert functionality should utilize the ID field of the Document object
|
||||
if it is provided. If the ID is not provided, the upsert method is free
|
||||
to generate an ID for the document.
|
||||
|
||||
When an ID is specified and the document already exists in the vectorstore,
|
||||
the upsert method should update the document with the new data. If the document
|
||||
does not exist, the upsert method should add the document to the vectorstore.
|
||||
|
||||
Args:
|
||||
items: Sequence of Documents to add to the vectorstore.
|
||||
kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
UpsertResponse: A response object that contains the list of IDs that were
|
||||
successfully added or updated in the vectorstore and the list of IDs that
|
||||
failed to be added or updated.
|
||||
|
||||
.. versionadded:: 0.2.11
|
||||
"""
|
||||
# Developer guidelines:
|
||||
#
|
||||
# Vectorstores implementations are free to extend `upsert` implementation
|
||||
# to take in additional data per document.
|
||||
#
|
||||
# This data **SHOULD NOT** be part of the **kwargs** parameter, instead
|
||||
# sub-classes can use a Union type on `documents` to include additional
|
||||
# supported formats for the input data stream.
|
||||
#
|
||||
# For example,
|
||||
#
|
||||
# .. code-block:: python
|
||||
# from typing import TypedDict
|
||||
#
|
||||
# class DocumentWithVector(TypedDict):
|
||||
# document: Document
|
||||
# vector: List[float]
|
||||
#
|
||||
# def upsert(
|
||||
# self,
|
||||
# documents: Union[Iterable[Document], Iterable[DocumentWithVector]],
|
||||
# /,
|
||||
# **kwargs
|
||||
# ) -> UpsertResponse:
|
||||
# \"\"\"Add or update documents in the vectorstore.\"\"\"
|
||||
# # Implementation should check if documents is an
|
||||
# # iterable of DocumentWithVector or Document
|
||||
# pass
|
||||
#
|
||||
# Implementations that override upsert should include a new doc-string
|
||||
# that explains the semantics of upsert and includes in code
|
||||
# examples of how to insert using the alternate data formats.
|
||||
|
||||
# The implementation does not delegate to the `add_texts` method or
|
||||
# the `add_documents` method by default since those implementations
|
||||
raise NotImplementedError(
|
||||
f"upsert has not been implemented for {self.__class__.__name__}"
|
||||
)
|
||||
|
||||
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||
async def astreaming_upsert(
|
||||
self,
|
||||
items: AsyncIterable[Document],
|
||||
/,
|
||||
batch_size: int,
|
||||
**kwargs: Any,
|
||||
) -> AsyncIterator[UpsertResponse]:
|
||||
"""Upsert documents in a streaming fashion. Async version of streaming_upsert.
|
||||
|
||||
Args:
|
||||
items: Iterable of Documents to add to the vectorstore.
|
||||
batch_size: The size of each batch to upsert.
|
||||
kwargs: Additional keyword arguments.
|
||||
kwargs should only include parameters that are common to all
|
||||
documents. (e.g., timeout for indexing, retry policy, etc.)
|
||||
kwargs should not include ids to avoid ambiguous semantics.
|
||||
Instead the ID should be provided as part of the Document object.
|
||||
|
||||
Yields:
|
||||
UpsertResponse: A response object that contains the list of IDs that were
|
||||
successfully added or updated in the vectorstore and the list of IDs that
|
||||
failed to be added or updated.
|
||||
|
||||
.. versionadded:: 0.2.11
|
||||
"""
|
||||
async for batch in abatch_iterate(batch_size, items):
|
||||
yield await self.aupsert(batch, **kwargs)
|
||||
|
||||
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||
async def aupsert(
|
||||
self, items: Sequence[Document], /, **kwargs: Any
|
||||
) -> UpsertResponse:
|
||||
"""Add or update documents in the vectorstore. Async version of upsert.
|
||||
|
||||
The upsert functionality should utilize the ID field of the Document object
|
||||
if it is provided. If the ID is not provided, the upsert method is free
|
||||
to generate an ID for the document.
|
||||
|
||||
When an ID is specified and the document already exists in the vectorstore,
|
||||
the upsert method should update the document with the new data. If the document
|
||||
does not exist, the upsert method should add the document to the vectorstore.
|
||||
|
||||
Args:
|
||||
items: Sequence of Documents to add to the vectorstore.
|
||||
kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
UpsertResponse: A response object that contains the list of IDs that were
|
||||
successfully added or updated in the vectorstore and the list of IDs that
|
||||
failed to be added or updated.
|
||||
|
||||
.. versionadded:: 0.2.11
|
||||
"""
|
||||
# Developer guidelines: See guidelines for the `upsert` method.
|
||||
# The implementation does not delegate to the `add_texts` method or
|
||||
# the `add_documents` method by default since those implementations
|
||||
return await run_in_executor(None, self.upsert, items, **kwargs)
|
||||
|
||||
@property
|
||||
def embeddings(self) -> Optional[Embeddings]:
|
||||
"""Access the query embedding object if available."""
|
||||
@ -407,7 +228,7 @@ class VectorStore(ABC):
|
||||
ValueError: If the number of metadatas does not match the number of texts.
|
||||
ValueError: If the number of ids does not match the number of texts.
|
||||
"""
|
||||
if type(self).aupsert != VectorStore.aupsert:
|
||||
if type(self).aadd_documents != VectorStore.aadd_documents:
|
||||
# Import document in local scope to avoid circular imports
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -420,27 +241,16 @@ class VectorStore(ABC):
|
||||
if metadatas and len(metadatas) != len(texts_):
|
||||
raise ValueError(
|
||||
"The number of metadatas must match the number of texts."
|
||||
"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||
f"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||
)
|
||||
|
||||
if "ids" in kwargs:
|
||||
ids = kwargs.pop("ids")
|
||||
if ids and len(ids) != len(texts_):
|
||||
raise ValueError(
|
||||
"The number of ids must match the number of texts."
|
||||
"Got {len(ids)} ids and {len(texts_)} texts."
|
||||
)
|
||||
else:
|
||||
ids = None
|
||||
|
||||
metadatas_ = iter(metadatas) if metadatas else cycle([{}])
|
||||
ids_: Iterable[Union[str, None]] = ids if ids is not None else cycle([None])
|
||||
|
||||
docs = [
|
||||
Document(page_content=text, metadata=metadata_, id=id_)
|
||||
for text, metadata_, id_ in zip(texts, metadatas_, ids_)
|
||||
Document(page_content=text, metadata=metadata_)
|
||||
for text, metadata_ in zip(texts, metadatas_)
|
||||
]
|
||||
upsert_response = await self.aupsert(docs, **kwargs)
|
||||
return upsert_response["succeeded"]
|
||||
|
||||
return await self.aadd_documents(docs, **kwargs)
|
||||
return await run_in_executor(None, self.add_texts, texts, metadatas, **kwargs)
|
||||
|
||||
def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
|
||||
@ -458,37 +268,22 @@ class VectorStore(ABC):
|
||||
Raises:
|
||||
ValueError: If the number of ids does not match the number of documents.
|
||||
"""
|
||||
if type(self).upsert != VectorStore.upsert:
|
||||
from langchain_core.documents import Document
|
||||
if type(self).add_texts != VectorStore.add_texts:
|
||||
if "ids" not in kwargs:
|
||||
ids = [doc.id for doc in documents]
|
||||
|
||||
if "ids" in kwargs:
|
||||
ids = kwargs.pop("ids")
|
||||
if ids and len(ids) != len(documents):
|
||||
raise ValueError(
|
||||
"The number of ids must match the number of documents. "
|
||||
"Got {len(ids)} ids and {len(documents)} documents."
|
||||
)
|
||||
# If there's at least one valid ID, we'll assume that IDs
|
||||
# should be used.
|
||||
if any(ids):
|
||||
kwargs["ids"] = ids
|
||||
|
||||
documents_ = []
|
||||
|
||||
for id_, document in zip(ids, documents):
|
||||
doc_with_id = Document(
|
||||
page_content=document.page_content,
|
||||
metadata=document.metadata,
|
||||
id=id_,
|
||||
)
|
||||
documents_.append(doc_with_id)
|
||||
else:
|
||||
documents_ = documents
|
||||
|
||||
# If upsert has been implemented, we can use it to add documents
|
||||
return self.upsert(documents_, **kwargs)["succeeded"]
|
||||
|
||||
# Code path that delegates to add_text for backwards compatibility
|
||||
# TODO: Handle the case where the user doesn't provide ids on the Collection
|
||||
texts = [doc.page_content for doc in documents]
|
||||
metadatas = [doc.metadata for doc in documents]
|
||||
return self.add_texts(texts, metadatas, **kwargs)
|
||||
texts = [doc.page_content for doc in documents]
|
||||
metadatas = [doc.metadata for doc in documents]
|
||||
return self.add_texts(texts, metadatas, **kwargs)
|
||||
raise NotImplementedError(
|
||||
f"`add_documents` and `add_texts` has not been implemented "
|
||||
f"for {self.__class__.__name__} "
|
||||
)
|
||||
|
||||
async def aadd_documents(
|
||||
self, documents: List[Document], **kwargs: Any
|
||||
@ -506,41 +301,21 @@ class VectorStore(ABC):
|
||||
Raises:
|
||||
ValueError: If the number of IDs does not match the number of documents.
|
||||
"""
|
||||
# If either upsert or aupsert has been implemented, we delegate to them!
|
||||
if (
|
||||
type(self).aupsert != VectorStore.aupsert
|
||||
or type(self).upsert != VectorStore.upsert
|
||||
):
|
||||
# If aupsert has been implemented, we can use it to add documents
|
||||
from langchain_core.documents import Document
|
||||
# If the async method has been overridden, we'll use that.
|
||||
if type(self).aadd_texts != VectorStore.aadd_texts:
|
||||
if "ids" not in kwargs:
|
||||
ids = [doc.id for doc in documents]
|
||||
|
||||
if "ids" in kwargs:
|
||||
ids = kwargs.pop("ids")
|
||||
if ids and len(ids) != len(documents):
|
||||
raise ValueError(
|
||||
"The number of ids must match the number of documents."
|
||||
"Got {len(ids)} ids and {len(documents)} documents."
|
||||
)
|
||||
# If there's at least one valid ID, we'll assume that IDs
|
||||
# should be used.
|
||||
if any(ids):
|
||||
kwargs["ids"] = ids
|
||||
|
||||
documents_ = []
|
||||
texts = [doc.page_content for doc in documents]
|
||||
metadatas = [doc.metadata for doc in documents]
|
||||
return await self.aadd_texts(texts, metadatas, **kwargs)
|
||||
|
||||
for id_, document in zip(ids, documents):
|
||||
doc_with_id = Document(
|
||||
page_content=document.page_content,
|
||||
metadata=document.metadata,
|
||||
id=id_,
|
||||
)
|
||||
documents_.append(doc_with_id)
|
||||
else:
|
||||
documents_ = documents
|
||||
|
||||
# The default implementation of aupsert delegates to upsert.
|
||||
upsert_response = await self.aupsert(documents_, **kwargs)
|
||||
return upsert_response["succeeded"]
|
||||
|
||||
texts = [doc.page_content for doc in documents]
|
||||
metadatas = [doc.metadata for doc in documents]
|
||||
return await self.aadd_texts(texts, metadatas, **kwargs)
|
||||
return await run_in_executor(None, self.add_documents, documents, **kwargs)
|
||||
|
||||
def search(self, query: str, search_type: str, **kwargs: Any) -> List[Document]:
|
||||
"""Return docs most similar to query using a specified search type.
|
||||
|
@ -8,12 +8,14 @@ from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
from langchain_core._api import deprecated
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.load import dumpd, load
|
||||
@ -56,43 +58,71 @@ class InMemoryVectorStore(VectorStore):
|
||||
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
|
||||
self.delete(ids)
|
||||
|
||||
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
|
||||
vectors = self.embedding.embed_documents([item.page_content for item in items])
|
||||
ids = []
|
||||
for item, vector in zip(items, vectors):
|
||||
doc_id = item.id if item.id else str(uuid.uuid4())
|
||||
ids.append(doc_id)
|
||||
self.store[doc_id] = {
|
||||
"id": doc_id,
|
||||
"vector": vector,
|
||||
"text": item.page_content,
|
||||
"metadata": item.metadata,
|
||||
}
|
||||
return {
|
||||
"succeeded": ids,
|
||||
"failed": [],
|
||||
}
|
||||
def add_documents(
|
||||
self,
|
||||
documents: List[Document],
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Add documents to the store."""
|
||||
texts = [doc.page_content for doc in documents]
|
||||
vectors = self.embedding.embed_documents(texts)
|
||||
|
||||
async def aupsert(
|
||||
self, items: Sequence[Document], /, **kwargs: Any
|
||||
) -> UpsertResponse:
|
||||
vectors = await self.embedding.aembed_documents(
|
||||
[item.page_content for item in items]
|
||||
if ids and len(ids) != len(texts):
|
||||
raise ValueError(
|
||||
f"ids must be the same length as texts. "
|
||||
f"Got {len(ids)} ids and {len(texts)} texts."
|
||||
)
|
||||
|
||||
id_iterator: Iterator[Optional[str]] = (
|
||||
iter(ids) if ids else iter(doc.id for doc in documents)
|
||||
)
|
||||
ids = []
|
||||
for item, vector in zip(items, vectors):
|
||||
doc_id = item.id if item.id else str(uuid.uuid4())
|
||||
ids.append(doc_id)
|
||||
self.store[doc_id] = {
|
||||
"id": doc_id,
|
||||
|
||||
ids_ = []
|
||||
|
||||
for doc, vector in zip(documents, vectors):
|
||||
doc_id = next(id_iterator)
|
||||
doc_id_ = doc_id if doc_id else str(uuid.uuid4())
|
||||
ids_.append(doc_id_)
|
||||
self.store[doc_id_] = {
|
||||
"id": doc_id_,
|
||||
"vector": vector,
|
||||
"text": item.page_content,
|
||||
"metadata": item.metadata,
|
||||
"text": doc.page_content,
|
||||
"metadata": doc.metadata,
|
||||
}
|
||||
return {
|
||||
"succeeded": ids,
|
||||
"failed": [],
|
||||
}
|
||||
|
||||
return ids_
|
||||
|
||||
async def aadd_documents(
    self, documents: List[Document], ids: Optional[List[str]] = None, **kwargs: Any
) -> List[str]:
    """Add documents to the store.

    Args:
        documents: Documents whose ``page_content`` is embedded and stored
            together with their ``metadata``.
        ids: Optional ids to store the documents under. When given (and
            non-empty) they take precedence over the ids on the documents.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The ids the documents were stored under, in input order. A document
        without a usable id gets a freshly generated UUID4 string.

    Raises:
        ValueError: If ``ids`` is non-empty and its length differs from the
            number of documents.
    """
    texts = [doc.page_content for doc in documents]

    # Validate before embedding so that a malformed call fails fast instead of
    # paying for an embedding request whose result would be thrown away.
    if ids and len(ids) != len(texts):
        raise ValueError(
            "ids must be the same length as texts. "
            f"Got {len(ids)} ids and {len(texts)} texts."
        )

    vectors = await self.embedding.aembed_documents(texts)

    # Explicit ids win; otherwise fall back to the id carried by each document.
    candidate_ids: List[Optional[str]] = (
        list(ids) if ids else [doc.id for doc in documents]
    )

    ids_: List[str] = []
    for doc, vector, candidate in zip(documents, vectors, candidate_ids):
        # A missing/empty id is replaced by a random UUID.
        doc_id_ = candidate if candidate else str(uuid.uuid4())
        ids_.append(doc_id_)
        self.store[doc_id_] = {
            "id": doc_id_,
            "vector": vector,
            "text": doc.page_content,
            "metadata": doc.metadata,
        }

    return ids_
|
||||
|
||||
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||
"""Get documents by their ids.
|
||||
@ -117,6 +147,62 @@ class InMemoryVectorStore(VectorStore):
|
||||
)
|
||||
return documents
|
||||
|
||||
@deprecated(
    alternative="VectorStore.add_documents",
    message=(
        "This was a beta API that was added in 0.2.11. "
        "It'll be removed in 0.3.0."
    ),
    since="0.2.29",
    removal="0.3.0",
)
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
    """Embed ``items`` and write them into the in-memory store.

    Each document is stored under its own ``id``; a document without a
    usable id is stored under a freshly generated UUID4 string. Writing to
    an id that is already present replaces the stored entry.

    Args:
        items: Documents to embed and store.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A mapping whose ``"succeeded"`` entry lists the ids that were
        written, in input order, and whose ``"failed"`` entry is empty.
    """
    contents = [doc.page_content for doc in items]
    embedded = self.embedding.embed_documents(contents)

    written = []
    for doc, embedding_vector in zip(items, embedded):
        # Fall back to a random UUID when the document has no id.
        key = doc.id or str(uuid.uuid4())
        written.append(key)
        self.store[key] = {
            "id": key,
            "vector": embedding_vector,
            "text": doc.page_content,
            "metadata": doc.metadata,
        }

    return {"succeeded": written, "failed": []}
|
||||
|
||||
@deprecated(
    alternative="VectorStore.aadd_documents",
    message=(
        "This was a beta API that was added in 0.2.11. "
        "It'll be removed in 0.3.0."
    ),
    since="0.2.29",
    removal="0.3.0",
)
async def aupsert(
    self, items: Sequence[Document], /, **kwargs: Any
) -> UpsertResponse:
    """Asynchronously embed ``items`` and write them into the in-memory store.

    Each document is stored under its own ``id``; a document without a
    usable id is stored under a freshly generated UUID4 string. Writing to
    an id that is already present replaces the stored entry.

    Args:
        items: Documents to embed and store.
        kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A mapping whose ``"succeeded"`` entry lists the ids that were
        written, in input order, and whose ``"failed"`` entry is empty.
    """
    contents = [doc.page_content for doc in items]
    embedded = await self.embedding.aembed_documents(contents)

    written = []
    for doc, embedding_vector in zip(items, embedded):
        # Fall back to a random UUID when the document has no id.
        key = doc.id or str(uuid.uuid4())
        written.append(key)
        self.store[key] = {
            "id": key,
            "vector": embedding_vector,
            "text": doc.page_content,
            "metadata": doc.metadata,
        }

    return {"succeeded": written, "failed": []}
|
||||
|
||||
async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||
"""Async get documents by their ids.
|
||||
|
||||
|
@ -1,69 +1,50 @@
|
||||
"""Set of tests that complement the standard tests for vectorstore.
|
||||
|
||||
These tests verify that the base abstraction does appropriate delegation to
|
||||
the relevant methods.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional, Sequence, Union
|
||||
|
||||
from typing_extensions import TypedDict
|
||||
from typing import Any, Dict, Iterable, List, Optional, Sequence
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
|
||||
def test_custom_upsert_type() -> None:
|
||||
"""Test that we can override the signature of the upsert method
|
||||
of the VectorStore class without creating typing issues by violating
|
||||
the Liskov Substitution Principle.
|
||||
"""
|
||||
|
||||
class ByVector(TypedDict):
|
||||
document: Document
|
||||
vector: List[float]
|
||||
|
||||
class CustomVectorStore(VectorStore):
|
||||
def upsert(
|
||||
# This unit test verifies that the signature of the upsert method
|
||||
# specifically the items parameter can be overridden without
|
||||
# violating the Liskov Substitution Principle (and getting
|
||||
# typing errors).
|
||||
self,
|
||||
items: Union[Sequence[Document], Sequence[ByVector]],
|
||||
/,
|
||||
**kwargs: Any,
|
||||
) -> UpsertResponse:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class CustomSyncVectorStore(VectorStore):
|
||||
"""A vectorstore that only implements the synchronous methods."""
|
||||
class CustomAddTextsVectorstore(VectorStore):
|
||||
"""A vectorstore that only implements add texts."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.store: Dict[str, Document] = {}
|
||||
|
||||
def upsert(
|
||||
def add_texts(
|
||||
self,
|
||||
items: Sequence[Document],
|
||||
/,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
# One of the kwargs should be `ids` which is a list of ids
|
||||
# associated with the texts.
|
||||
# This is not yet enforced in the type signature for backwards compatibility
|
||||
# with existing implementations.
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> UpsertResponse:
|
||||
ids = []
|
||||
for item in items:
|
||||
if item.id is None:
|
||||
new_item = item.copy()
|
||||
id_: str = str(uuid.uuid4())
|
||||
new_item.id = id_
|
||||
else:
|
||||
id_ = item.id
|
||||
new_item = item
|
||||
) -> List[str]:
|
||||
if not isinstance(texts, list):
|
||||
texts = list(texts)
|
||||
ids_iter = iter(ids or [])
|
||||
|
||||
self.store[id_] = new_item
|
||||
ids.append(id_)
|
||||
ids_ = []
|
||||
|
||||
return {
|
||||
"succeeded": ids,
|
||||
"failed": [],
|
||||
}
|
||||
metadatas_ = metadatas or [{} for _ in texts]
|
||||
|
||||
for text, metadata in zip(texts, metadatas_ or []):
|
||||
next_id = next(ids_iter, None)
|
||||
id_ = next_id or str(uuid.uuid4())
|
||||
self.store[id_] = Document(page_content=text, metadata=metadata, id=id_)
|
||||
ids_.append(id_)
|
||||
return ids_
|
||||
|
||||
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||
return [self.store[id] for id in ids if id in self.store]
|
||||
@ -74,8 +55,8 @@ class CustomSyncVectorStore(VectorStore):
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> CustomSyncVectorStore:
|
||||
vectorstore = CustomSyncVectorStore()
|
||||
) -> CustomAddTextsVectorstore:
|
||||
vectorstore = CustomAddTextsVectorstore()
|
||||
vectorstore.add_texts(texts, metadatas=metadatas, **kwargs)
|
||||
return vectorstore
|
||||
|
||||
@ -85,30 +66,38 @@ class CustomSyncVectorStore(VectorStore):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
def test_implement_upsert() -> None:
|
||||
def test_default_add_documents() -> None:
|
||||
"""Test that we can implement the upsert method of the CustomVectorStore
|
||||
class without violating the Liskov Substitution Principle.
|
||||
"""
|
||||
|
||||
store = CustomSyncVectorStore()
|
||||
store = CustomAddTextsVectorstore()
|
||||
|
||||
# Check upsert with id
|
||||
assert store.upsert([Document(id="1", page_content="hello")]) == {
|
||||
"succeeded": ["1"],
|
||||
"failed": [],
|
||||
}
|
||||
assert store.add_documents([Document(id="1", page_content="hello")]) == ["1"]
|
||||
|
||||
assert store.get_by_ids(["1"]) == [Document(id="1", page_content="hello")]
|
||||
|
||||
# Check upsert without id
|
||||
response = store.upsert([Document(page_content="world")])
|
||||
assert len(response["succeeded"]) == 1
|
||||
id_ = response["succeeded"][0]
|
||||
assert id_ is not None
|
||||
assert store.get_by_ids([id_]) == [Document(id=id_, page_content="world")]
|
||||
ids = store.add_documents([Document(page_content="world")])
|
||||
assert len(ids) == 1
|
||||
assert store.get_by_ids(ids) == [Document(id=ids[0], page_content="world")]
|
||||
|
||||
# Check that add_documents works
|
||||
assert store.add_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert store.add_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
def test_default_add_texts() -> None:
|
||||
store = CustomAddTextsVectorstore()
|
||||
# Check that default implementation of add_texts works
|
||||
assert store.add_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||
|
||||
assert store.get_by_ids(["3", "4"]) == [
|
||||
Document(id="3", page_content="hello"),
|
||||
Document(id="4", page_content="world"),
|
||||
@ -130,39 +119,37 @@ def test_implement_upsert() -> None:
|
||||
Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
|
||||
]
|
||||
|
||||
# Check that add_documents works
|
||||
assert store.add_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert store.add_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
async def test_aupsert_delegation_to_upsert() -> None:
|
||||
"""Test delegation to the synchronous upsert method in async execution
|
||||
if async methods are not implemented.
|
||||
"""
|
||||
store = CustomSyncVectorStore()
|
||||
async def test_default_aadd_documents() -> None:
|
||||
"""Test delegation to the synchronous method."""
|
||||
store = CustomAddTextsVectorstore()
|
||||
|
||||
# Check upsert with id
|
||||
assert await store.aupsert([Document(id="1", page_content="hello")]) == {
|
||||
"succeeded": ["1"],
|
||||
"failed": [],
|
||||
}
|
||||
assert await store.aadd_documents([Document(id="1", page_content="hello")]) == ["1"]
|
||||
|
||||
assert await store.aget_by_ids(["1"]) == [Document(id="1", page_content="hello")]
|
||||
|
||||
# Check upsert without id
|
||||
response = await store.aupsert([Document(page_content="world")])
|
||||
assert len(response["succeeded"]) == 1
|
||||
id_ = response["succeeded"][0]
|
||||
assert id_ is not None
|
||||
assert await store.aget_by_ids([id_]) == [Document(id=id_, page_content="world")]
|
||||
ids = await store.aadd_documents([Document(page_content="world")])
|
||||
assert len(ids) == 1
|
||||
assert await store.aget_by_ids(ids) == [Document(id=ids[0], page_content="world")]
|
||||
|
||||
# Check that add_documents works
|
||||
assert await store.aadd_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert await store.aadd_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
async def test_default_aadd_texts() -> None:
|
||||
"""Test delegation to the synchronous method."""
|
||||
store = CustomAddTextsVectorstore()
|
||||
# Check that default implementation of add_texts works
|
||||
assert await store.aadd_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||
|
||||
assert await store.aget_by_ids(["3", "4"]) == [
|
||||
Document(id="3", page_content="hello"),
|
||||
Document(id="4", page_content="world"),
|
||||
@ -183,12 +170,3 @@ async def test_aupsert_delegation_to_upsert() -> None:
|
||||
Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}),
|
||||
Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
|
||||
]
|
||||
|
||||
# Check that add_documents works
|
||||
assert await store.aadd_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert await store.aadd_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
@ -1,6 +1,5 @@
|
||||
"""Test suite to test vectorstores."""
|
||||
|
||||
import inspect
|
||||
from abc import abstractmethod
|
||||
|
||||
import pytest
|
||||
@ -169,39 +168,31 @@ class ReadWriteTestSuite(BaseStandardTests):
|
||||
documents = vectorstore.get_by_ids(["1", "2", "3"])
|
||||
assert documents == []
|
||||
|
||||
def test_upsert_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Run upsert tests."""
|
||||
def test_add_documents_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Run add_documents tests."""
|
||||
documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
response = vectorstore.upsert(documents)
|
||||
ids = response["succeeded"]
|
||||
ids = vectorstore.add_documents(documents)
|
||||
assert vectorstore.get_by_ids(ids) == [
|
||||
Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
|
||||
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||
]
|
||||
|
||||
def test_upsert_with_existing_ids(self, vectorstore: VectorStore) -> None:
|
||||
"""Test that upserting with existing IDs is idempotent."""
|
||||
def test_add_documents_with_existing_ids(self, vectorstore: VectorStore) -> None:
|
||||
"""Test that add_documents with existing IDs is idempotent."""
|
||||
documents = [
|
||||
Document(id="foo", page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
response = vectorstore.upsert(documents)
|
||||
ids = response["succeeded"]
|
||||
assert response["failed"] == []
|
||||
ids = vectorstore.add_documents(documents)
|
||||
assert "foo" in ids
|
||||
assert vectorstore.get_by_ids(ids) == [
|
||||
Document(page_content="foo", metadata={"id": 1}, id="foo"),
|
||||
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||
]
|
||||
|
||||
def test_upsert_documents_has_no_ids(self, vectorstore: VectorStore) -> None:
|
||||
"""Verify that there is no parameter called ids in upsert"""
|
||||
signature = inspect.signature(vectorstore.upsert)
|
||||
assert "ids" not in signature.parameters
|
||||
|
||||
|
||||
class AsyncReadWriteTestSuite(BaseStandardTests):
|
||||
"""Test suite for checking the **async** read-write API of a vectorstore.
|
||||
@ -359,35 +350,29 @@ class AsyncReadWriteTestSuite(BaseStandardTests):
|
||||
# This should not raise an exception
|
||||
assert await vectorstore.aget_by_ids(["1", "2", "3"]) == []
|
||||
|
||||
async def test_upsert_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Run upsert tests."""
|
||||
async def test_add_documents_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Run add_documents tests."""
|
||||
documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
response = await vectorstore.aupsert(documents)
|
||||
ids = response["succeeded"]
|
||||
ids = await vectorstore.aadd_documents(documents)
|
||||
assert await vectorstore.aget_by_ids(ids) == [
|
||||
Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
|
||||
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||
]
|
||||
|
||||
async def test_upsert_with_existing_ids(self, vectorstore: VectorStore) -> None:
|
||||
"""Test that upserting with existing IDs is idempotent."""
|
||||
async def test_add_documents_with_existing_ids(
|
||||
self, vectorstore: VectorStore
|
||||
) -> None:
|
||||
"""Test that add_documents with existing IDs is idempotent."""
|
||||
documents = [
|
||||
Document(id="foo", page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
response = await vectorstore.aupsert(documents)
|
||||
ids = response["succeeded"]
|
||||
assert response["failed"] == []
|
||||
ids = await vectorstore.aadd_documents(documents)
|
||||
assert "foo" in ids
|
||||
assert await vectorstore.aget_by_ids(ids) == [
|
||||
Document(page_content="foo", metadata={"id": 1}, id="foo"),
|
||||
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||
]
|
||||
|
||||
async def test_upsert_documents_has_no_ids(self, vectorstore: VectorStore) -> None:
|
||||
"""Verify that there is no parameter called ids in upsert"""
|
||||
signature = inspect.signature(vectorstore.aupsert)
|
||||
assert "ids" not in signature.parameters
|
||||
|
Loading…
Reference in New Issue
Block a user