mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
core[minor]: add upsert, streaming_upsert, aupsert, astreaming_upsert methods to the VectorStore abstraction (#23774)
This PR rolls out part of the new proposed interface for vectorstores (https://github.com/langchain-ai/langchain/pull/23544) to existing store implementations. The PR makes the following changes: 1. Adds standard upsert, streaming_upsert, aupsert, astreaming_upsert methods to the vectorstore. 2. Updates `add_texts` and `aadd_texts` to be non required with a default implementation that delegates to `upsert` and `aupsert` if those have been implemented. The original `add_texts` and `aadd_texts` methods are problematic as they spread object specific information across document and **kwargs. (e.g., ids are not a part of the document) 3. Adds a default implementation to `add_documents` and `aadd_documents` that delegates to `upsert` and `aupsert` respectively. 4. Adds standard unit tests to verify that a given vectorstore implements a correct read/write API. A downside of this implementation is that it creates `upsert` with a very similar signature to `add_documents`. The reason for introducing `upsert` is to: * Remove any ambiguities about what information is allowed in `kwargs`. Specifically kwargs should only be used for information common to all indexed data. (e.g., indexing timeout). *Allow inheriting from an anticipated generalized interface for indexing that will allow indexing `BaseMedia` (i.e., allow making a vectorstore for images/audio etc.) `add_documents` can be deprecated in the future in favor of `upsert` to make sure that users have a single correct way of indexing content. --------- Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
parent
3c752238c5
commit
6f08e11d7c
@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tupl
|
||||
import numpy as np
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
from langchain_core.load import dumpd, load
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
@ -37,27 +38,41 @@ class InMemoryVectorStore(VectorStore):
|
||||
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
|
||||
self.delete(ids)
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
ids: Optional[Sequence[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Add texts to the store."""
|
||||
vectors = self.embedding.embed_documents(list(texts))
|
||||
ids_ = []
|
||||
|
||||
for i, text in enumerate(texts):
|
||||
doc_id = ids[i] if ids else str(uuid.uuid4())
|
||||
ids_.append(doc_id)
|
||||
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
|
||||
vectors = self.embedding.embed_documents([item.page_content for item in items])
|
||||
ids = []
|
||||
for item, vector in zip(items, vectors):
|
||||
doc_id = item.id if item.id else str(uuid.uuid4())
|
||||
ids.append(doc_id)
|
||||
self.store[doc_id] = {
|
||||
"id": doc_id,
|
||||
"vector": vectors[i],
|
||||
"text": text,
|
||||
"metadata": metadatas[i] if metadatas else {},
|
||||
"vector": vector,
|
||||
"text": item.page_content,
|
||||
"metadata": item.metadata,
|
||||
}
|
||||
return ids_
|
||||
return {
|
||||
"succeeded": ids,
|
||||
"failed": [],
|
||||
}
|
||||
|
||||
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||
"""Get documents by their ids."""
|
||||
documents = []
|
||||
|
||||
for doc_id in ids:
|
||||
doc = self.store.get(doc_id)
|
||||
if doc:
|
||||
documents.append(
|
||||
Document(
|
||||
id=doc["id"],
|
||||
page_content=doc["text"],
|
||||
metadata=doc["metadata"],
|
||||
)
|
||||
)
|
||||
return documents
|
||||
|
||||
async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||
return self.get_by_ids(ids)
|
||||
|
||||
async def aadd_texts(
|
||||
self,
|
||||
@ -80,7 +95,9 @@ class InMemoryVectorStore(VectorStore):
|
||||
similarity = float(cosine_similarity([embedding], [vector]).item(0))
|
||||
result.append(
|
||||
(
|
||||
Document(page_content=doc["text"], metadata=doc["metadata"]),
|
||||
Document(
|
||||
id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
|
||||
),
|
||||
similarity,
|
||||
vector,
|
||||
)
|
||||
|
@ -1053,7 +1053,7 @@ class Milvus(VectorStore):
|
||||
pks = [item.get(self._primary_field) for item in query_result]
|
||||
return pks
|
||||
|
||||
def upsert(
|
||||
def upsert( # type: ignore[override]
|
||||
self,
|
||||
ids: Optional[List[str]] = None,
|
||||
documents: List[Document] | None = None,
|
||||
|
@ -1,4 +1,5 @@
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
@ -13,6 +14,11 @@ from tests.integration_tests.vectorstores.fake_embeddings import (
|
||||
)
|
||||
|
||||
|
||||
class AnyStr(str):
|
||||
def __eq__(self, other: Any) -> bool:
|
||||
return isinstance(other, str)
|
||||
|
||||
|
||||
class TestInMemoryReadWriteTestSuite(ReadWriteTestSuite):
|
||||
@pytest.fixture
|
||||
def vectorstore(self) -> InMemoryVectorStore:
|
||||
@ -31,10 +37,13 @@ async def test_inmemory() -> None:
|
||||
["foo", "bar", "baz"], ConsistentFakeEmbeddings()
|
||||
)
|
||||
output = await store.asimilarity_search("foo", k=1)
|
||||
assert output == [Document(page_content="foo")]
|
||||
assert output == [Document(page_content="foo", id=AnyStr())]
|
||||
|
||||
output = await store.asimilarity_search("bar", k=2)
|
||||
assert output == [Document(page_content="bar"), Document(page_content="baz")]
|
||||
assert output == [
|
||||
Document(page_content="bar", id=AnyStr()),
|
||||
Document(page_content="baz", id=AnyStr()),
|
||||
]
|
||||
|
||||
output2 = await store.asimilarity_search_with_score("bar", k=2)
|
||||
assert output2[0][1] > output2[1][1]
|
||||
@ -61,8 +70,8 @@ async def test_inmemory_mmr() -> None:
|
||||
"foo", k=10, lambda_mult=0.1
|
||||
)
|
||||
assert len(output) == len(texts)
|
||||
assert output[0] == Document(page_content="foo")
|
||||
assert output[1] == Document(page_content="foy")
|
||||
assert output[0] == Document(page_content="foo", id=AnyStr())
|
||||
assert output[1] == Document(page_content="foy", id=AnyStr())
|
||||
|
||||
|
||||
async def test_inmemory_dump_load(tmp_path: Path) -> None:
|
||||
@ -90,4 +99,4 @@ async def test_inmemory_filter() -> None:
|
||||
output = await store.asimilarity_search(
|
||||
"baz", filter=lambda doc: doc.metadata["id"] == 1
|
||||
)
|
||||
assert output == [Document(page_content="foo", metadata={"id": 1})]
|
||||
assert output == [Document(page_content="foo", metadata={"id": 1}, id=AnyStr())]
|
||||
|
@ -6,7 +6,11 @@ if it's unchanged.
|
||||
"""
|
||||
|
||||
from langchain_core.indexing.api import IndexingResult, aindex, index
|
||||
from langchain_core.indexing.base import InMemoryRecordManager, RecordManager
|
||||
from langchain_core.indexing.base import (
|
||||
InMemoryRecordManager,
|
||||
RecordManager,
|
||||
UpsertResponse,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"aindex",
|
||||
@ -14,4 +18,5 @@ __all__ = [
|
||||
"IndexingResult",
|
||||
"InMemoryRecordManager",
|
||||
"RecordManager",
|
||||
"UpsertResponse",
|
||||
]
|
||||
|
@ -421,3 +421,16 @@ class InMemoryRecordManager(RecordManager):
|
||||
keys: A list of keys to delete.
|
||||
"""
|
||||
self.delete_keys(keys)
|
||||
|
||||
|
||||
class UpsertResponse(TypedDict):
|
||||
"""A generic response for upsert operations.
|
||||
|
||||
The upsert response will be used by abstractions that implement an upsert
|
||||
operation for content that can be upserted by ID.
|
||||
"""
|
||||
|
||||
succeeded: List[str]
|
||||
"""The IDs that were successfully indexed."""
|
||||
failed: List[str]
|
||||
"""The IDs that failed to index."""
|
||||
|
@ -5,6 +5,7 @@ These functions do not depend on any other LangChain module.
|
||||
"""
|
||||
|
||||
from langchain_core.utils import image
|
||||
from langchain_core.utils.aiter import abatch_iterate
|
||||
from langchain_core.utils.env import get_from_dict_or_env, get_from_env
|
||||
from langchain_core.utils.formatting import StrictFormatter, formatter
|
||||
from langchain_core.utils.input import (
|
||||
@ -13,6 +14,7 @@ from langchain_core.utils.input import (
|
||||
get_colored_text,
|
||||
print_text,
|
||||
)
|
||||
from langchain_core.utils.iter import batch_iterate
|
||||
from langchain_core.utils.loading import try_load_from_hub
|
||||
from langchain_core.utils.strings import comma_list, stringify_dict, stringify_value
|
||||
from langchain_core.utils.utils import (
|
||||
@ -48,4 +50,6 @@ __all__ = [
|
||||
"stringify_dict",
|
||||
"comma_list",
|
||||
"stringify_value",
|
||||
"batch_iterate",
|
||||
"abatch_iterate",
|
||||
]
|
||||
|
@ -11,6 +11,7 @@ from typing import (
|
||||
Any,
|
||||
AsyncContextManager,
|
||||
AsyncGenerator,
|
||||
AsyncIterable,
|
||||
AsyncIterator,
|
||||
Awaitable,
|
||||
Callable,
|
||||
@ -245,3 +246,28 @@ class aclosing(AbstractAsyncContextManager):
|
||||
) -> None:
|
||||
if hasattr(self.thing, "aclose"):
|
||||
await self.thing.aclose()
|
||||
|
||||
|
||||
async def abatch_iterate(
|
||||
size: int, iterable: AsyncIterable[T]
|
||||
) -> AsyncIterator[List[T]]:
|
||||
"""Utility batching function for async iterables.
|
||||
|
||||
Args:
|
||||
size: The size of the batch.
|
||||
iterable: The async iterable to batch.
|
||||
|
||||
Returns:
|
||||
An async iterator over the batches
|
||||
"""
|
||||
batch: List[T] = []
|
||||
async for element in iterable:
|
||||
if len(batch) < size:
|
||||
batch.append(element)
|
||||
|
||||
if len(batch) >= size:
|
||||
yield batch
|
||||
batch = []
|
||||
|
||||
if batch:
|
||||
yield batch
|
||||
|
@ -25,26 +25,34 @@ import logging
|
||||
import math
|
||||
import warnings
|
||||
from abc import ABC, abstractmethod
|
||||
from itertools import cycle
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
AsyncIterable,
|
||||
AsyncIterator,
|
||||
Callable,
|
||||
ClassVar,
|
||||
Collection,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.pydantic_v1 import Field, root_validator
|
||||
from langchain_core.retrievers import BaseRetriever
|
||||
from langchain_core.runnables.config import run_in_executor
|
||||
from langchain_core.utils.aiter import abatch_iterate
|
||||
from langchain_core.utils.iter import batch_iterate
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain_core.callbacks.manager import (
|
||||
@ -52,6 +60,7 @@ if TYPE_CHECKING:
|
||||
CallbackManagerForRetrieverRun,
|
||||
)
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.indexing.base import UpsertResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -61,11 +70,14 @@ VST = TypeVar("VST", bound="VectorStore")
|
||||
class VectorStore(ABC):
|
||||
"""Interface for vector store."""
|
||||
|
||||
@abstractmethod
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
# One of the kwargs should be `ids` which is a list of ids
|
||||
# associated with the texts.
|
||||
# This is not yet enforced in the type signature for backwards compatibility
|
||||
# with existing implementations.
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Run more texts through the embeddings and add to the vectorstore.
|
||||
@ -74,16 +86,205 @@ class VectorStore(ABC):
|
||||
texts: Iterable of strings to add to the vectorstore.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
**kwargs: vectorstore specific parameters.
|
||||
One of the kwargs should be `ids` which is a list of ids
|
||||
associated with the texts.
|
||||
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
"""
|
||||
if type(self).upsert != VectorStore.upsert:
|
||||
# Import document in local scope to avoid circular imports
|
||||
from langchain_core.documents import Document
|
||||
|
||||
# This condition is triggered if the subclass has provided
|
||||
# an implementation of the upsert method.
|
||||
# The existing add_texts
|
||||
texts_: Sequence[str] = (
|
||||
texts if isinstance(texts, (list, tuple)) else list(texts)
|
||||
)
|
||||
if metadatas and len(metadatas) != len(texts_):
|
||||
raise ValueError(
|
||||
"The number of metadatas must match the number of texts."
|
||||
"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||
)
|
||||
|
||||
if "ids" in kwargs:
|
||||
ids = kwargs.pop("ids")
|
||||
if ids and len(ids) != len(texts_):
|
||||
raise ValueError(
|
||||
"The number of ids must match the number of texts."
|
||||
"Got {len(ids)} ids and {len(texts_)} texts."
|
||||
)
|
||||
else:
|
||||
ids = None
|
||||
|
||||
metadatas_ = iter(metadatas) if metadatas else cycle([{}])
|
||||
ids_: Iterable[Union[str, None]] = ids if ids is not None else cycle([None])
|
||||
docs = [
|
||||
Document(page_content=text, metadata=metadata_, id=id_)
|
||||
for text, metadata_, id_ in zip(texts, metadatas_, ids_)
|
||||
]
|
||||
upsert_response = self.upsert(docs, **kwargs)
|
||||
return upsert_response["succeeded"]
|
||||
raise NotImplementedError(
|
||||
f"`add_texts` has not been implemented for {self.__class__.__name__} "
|
||||
)
|
||||
|
||||
# Developer guidelines:
|
||||
# Do not override streaming_upsert!
|
||||
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||
def streaming_upsert(
|
||||
self, items: Iterable[Document], /, batch_size: int, **kwargs: Any
|
||||
) -> Iterator[UpsertResponse]:
|
||||
"""Upsert documents in a streaming fashion.
|
||||
|
||||
Args:
|
||||
items: Iterable of Documents to add to the vectorstore.
|
||||
batch_size: The size of each batch to upsert.
|
||||
**kwargs: Additional keyword arguments.
|
||||
kwargs should only include parameters that are common to all
|
||||
documents. (e.g., timeout for indexing, retry policy, etc.)
|
||||
kwargs should not include ids to avoid ambiguous semantics.
|
||||
Instead the ID should be provided as part of the Document object.
|
||||
|
||||
.. versionadded:: 0.2.11
|
||||
"""
|
||||
# The default implementation of this method breaks the input into
|
||||
# batches of size `batch_size` and calls the `upsert` method on each batch.
|
||||
# Subclasses can override this method to provide a more efficient
|
||||
# implementation.
|
||||
for item_batch in batch_iterate(batch_size, items):
|
||||
yield self.upsert(item_batch, **kwargs)
|
||||
|
||||
# Please note that we've added a new method `upsert` instead of re-using the
|
||||
# existing `add_documents` method.
|
||||
# This was done to resolve potential ambiguities around the behavior of **kwargs
|
||||
# in existing add_documents / add_texts methods which could include per document
|
||||
# information (e.g., the `ids` parameter).
|
||||
# Over time the `add_documents` could be denoted as legacy and deprecated
|
||||
# in favor of the `upsert` method.
|
||||
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
|
||||
"""Add or update documents in the vectorstore.
|
||||
|
||||
The upsert functionality should utilize the ID field of the Document object
|
||||
if it is provided. If the ID is not provided, the upsert method is free
|
||||
to generate an ID for the document.
|
||||
|
||||
When an ID is specified and the document already exists in the vectorstore,
|
||||
the upsert method should update the document with the new data. If the document
|
||||
does not exist, the upsert method should add the document to the vectorstore.
|
||||
|
||||
Args:
|
||||
items: Sequence of Documents to add to the vectorstore.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
UpsertResponse: A response object that contains the list of IDs that were
|
||||
successfully added or updated in the vectorstore and the list of IDs that
|
||||
failed to be added or updated.
|
||||
|
||||
.. versionadded:: 0.2.11
|
||||
"""
|
||||
# Developer guidelines:
|
||||
#
|
||||
# Vectorstores implementations are free to extend `upsert` implementation
|
||||
# to take in additional data per document.
|
||||
#
|
||||
# This data **SHOULD NOT** be part of the **kwargs** parameter, instead
|
||||
# sub-classes can use a Union type on `documents` to include additional
|
||||
# supported formats for the input data stream.
|
||||
#
|
||||
# For example,
|
||||
#
|
||||
# .. code-block:: python
|
||||
# from typing import TypedDict
|
||||
#
|
||||
# class DocumentWithVector(TypedDict):
|
||||
# document: Document
|
||||
# vector: List[float]
|
||||
#
|
||||
# def upsert(
|
||||
# self,
|
||||
# documents: Union[Iterable[Document], Iterable[DocumentWithVector]],
|
||||
# /,
|
||||
# **kwargs
|
||||
# ) -> UpsertResponse:
|
||||
# \"\"\"Add or update documents in the vectorstore.\"\"\"
|
||||
# # Implementation should check if documents is an
|
||||
# # iterable of DocumentWithVector or Document
|
||||
# pass
|
||||
#
|
||||
# Implementations that override upsert should include a new doc-string
|
||||
# that explains the semantics of upsert and includes in code
|
||||
# examples of how to insert using the alternate data formats.
|
||||
|
||||
# The implementation does not delegate to the `add_texts` method or
|
||||
# the `add_documents` method by default since those implementations
|
||||
raise NotImplementedError(
|
||||
f"upsert has not been implemented for {self.__class__.__name__}"
|
||||
)
|
||||
|
||||
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||
async def astreaming_upsert(
|
||||
self,
|
||||
items: AsyncIterable[Document],
|
||||
/,
|
||||
batch_size: int,
|
||||
**kwargs: Any,
|
||||
) -> AsyncIterator[UpsertResponse]:
|
||||
"""Upsert documents in a streaming fashion. Async version of streaming_upsert.
|
||||
|
||||
Args:
|
||||
items: Iterable of Documents to add to the vectorstore.
|
||||
batch_size: The size of each batch to upsert.
|
||||
**kwargs: Additional keyword arguments.
|
||||
kwargs should only include parameters that are common to all
|
||||
documents. (e.g., timeout for indexing, retry policy, etc.)
|
||||
kwargs should not include ids to avoid ambiguous semantics.
|
||||
Instead the ID should be provided as part of the Document object.
|
||||
|
||||
.. versionadded:: 0.2.11
|
||||
"""
|
||||
async for batch in abatch_iterate(batch_size, items):
|
||||
yield await self.aupsert(batch, **kwargs)
|
||||
|
||||
@beta(message="Added in 0.2.11. The API is subject to change.")
|
||||
async def aupsert(
|
||||
self, items: Sequence[Document], /, **kwargs: Any
|
||||
) -> UpsertResponse:
|
||||
"""Add or update documents in the vectorstore. Async version of upsert.
|
||||
|
||||
The upsert functionality should utilize the ID field of the Document object
|
||||
if it is provided. If the ID is not provided, the upsert method is free
|
||||
to generate an ID for the document.
|
||||
|
||||
When an ID is specified and the document already exists in the vectorstore,
|
||||
the upsert method should update the document with the new data. If the document
|
||||
does not exist, the upsert method should add the document to the vectorstore.
|
||||
|
||||
Args:
|
||||
items: Sequence of Documents to add to the vectorstore.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
UpsertResponse: A response object that contains the list of IDs that were
|
||||
successfully added or updated in the vectorstore and the list of IDs that
|
||||
failed to be added or updated.
|
||||
|
||||
.. versionadded:: 0.2.11
|
||||
"""
|
||||
# Developer guidelines: See guidelines for the `upsert` method.
|
||||
# The implementation does not delegate to the `add_texts` method or
|
||||
# the `add_documents` method by default since those implementations
|
||||
return await run_in_executor(None, self.upsert, items, **kwargs)
|
||||
|
||||
@property
|
||||
def embeddings(self) -> Optional[Embeddings]:
|
||||
"""Access the query embedding object if available."""
|
||||
logger.debug(
|
||||
f"{Embeddings.__name__} is not implemented for {self.__class__.__name__}"
|
||||
f"The embeddings property has not been "
|
||||
f"implemented for {self.__class__.__name__}"
|
||||
)
|
||||
return None
|
||||
|
||||
@ -187,17 +388,81 @@ class VectorStore(ABC):
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
"""
|
||||
if type(self).aupsert != VectorStore.aupsert:
|
||||
# Import document in local scope to avoid circular imports
|
||||
from langchain_core.documents import Document
|
||||
|
||||
# This condition is triggered if the subclass has provided
|
||||
# an implementation of the upsert method.
|
||||
# The existing add_texts
|
||||
texts_: Sequence[str] = (
|
||||
texts if isinstance(texts, (list, tuple)) else list(texts)
|
||||
)
|
||||
if metadatas and len(metadatas) != len(texts_):
|
||||
raise ValueError(
|
||||
"The number of metadatas must match the number of texts."
|
||||
"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||
)
|
||||
|
||||
if "ids" in kwargs:
|
||||
ids = kwargs.pop("ids")
|
||||
if ids and len(ids) != len(texts_):
|
||||
raise ValueError(
|
||||
"The number of ids must match the number of texts."
|
||||
"Got {len(ids)} ids and {len(texts_)} texts."
|
||||
)
|
||||
else:
|
||||
ids = None
|
||||
|
||||
metadatas_ = iter(metadatas) if metadatas else cycle([{}])
|
||||
ids_: Iterable[Union[str, None]] = ids if ids is not None else cycle([None])
|
||||
docs = [
|
||||
Document(page_content=text, metadata=metadata_, id=id_)
|
||||
for text, metadata_, id_ in zip(texts, metadatas_, ids_)
|
||||
]
|
||||
upsert_response = await self.aupsert(docs, **kwargs)
|
||||
return upsert_response["succeeded"]
|
||||
return await run_in_executor(None, self.add_texts, texts, metadatas, **kwargs)
|
||||
|
||||
def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
|
||||
"""Run more documents through the embeddings and add to the vectorstore.
|
||||
"""Add or update documents in the vectorstore.
|
||||
|
||||
Args:
|
||||
documents: Documents to add to the vectorstore.
|
||||
kwargs: Additional keyword arguments.
|
||||
if kwargs contains ids and documents contain ids,
|
||||
the ids in the kwargs will receive precedence.
|
||||
|
||||
Returns:
|
||||
List of IDs of the added texts.
|
||||
"""
|
||||
if type(self).upsert != VectorStore.upsert:
|
||||
from langchain_core.documents import Document
|
||||
|
||||
if "ids" in kwargs:
|
||||
ids = kwargs.pop("ids")
|
||||
if ids and len(ids) != len(documents):
|
||||
raise ValueError(
|
||||
"The number of ids must match the number of documents. "
|
||||
"Got {len(ids)} ids and {len(documents)} documents."
|
||||
)
|
||||
|
||||
documents_ = []
|
||||
|
||||
for id_, document in zip(ids, documents):
|
||||
doc_with_id = Document(
|
||||
page_content=document.page_content,
|
||||
metadata=document.metadata,
|
||||
id=id_,
|
||||
)
|
||||
documents_.append(doc_with_id)
|
||||
else:
|
||||
documents_ = documents
|
||||
|
||||
# If upsert has been implemented, we can use it to add documents
|
||||
return self.upsert(documents_, **kwargs)["succeeded"]
|
||||
|
||||
# Code path that delegates to add_text for backwards compatibility
|
||||
# TODO: Handle the case where the user doesn't provide ids on the Collection
|
||||
texts = [doc.page_content for doc in documents]
|
||||
metadatas = [doc.metadata for doc in documents]
|
||||
@ -214,6 +479,38 @@ class VectorStore(ABC):
|
||||
Returns:
|
||||
List of IDs of the added texts.
|
||||
"""
|
||||
# If either upsert or aupsert has been implemented, we delegate to them!
|
||||
if (
|
||||
type(self).aupsert != VectorStore.aupsert
|
||||
or type(self).upsert != VectorStore.upsert
|
||||
):
|
||||
# If aupsert has been implemented, we can use it to add documents
|
||||
from langchain_core.documents import Document
|
||||
|
||||
if "ids" in kwargs:
|
||||
ids = kwargs.pop("ids")
|
||||
if ids and len(ids) != len(documents):
|
||||
raise ValueError(
|
||||
"The number of ids must match the number of documents."
|
||||
"Got {len(ids)} ids and {len(documents)} documents."
|
||||
)
|
||||
|
||||
documents_ = []
|
||||
|
||||
for id_, document in zip(ids, documents):
|
||||
doc_with_id = Document(
|
||||
page_content=document.page_content,
|
||||
metadata=document.metadata,
|
||||
id=id_,
|
||||
)
|
||||
documents_.append(doc_with_id)
|
||||
else:
|
||||
documents_ = documents
|
||||
|
||||
# If upsert has been implemented, we can use it to add documents
|
||||
upsert_response = await self.aupsert(documents_, **kwargs)
|
||||
return upsert_response["succeeded"]
|
||||
|
||||
texts = [doc.page_content for doc in documents]
|
||||
metadatas = [doc.metadata for doc in documents]
|
||||
return await self.aadd_texts(texts, metadatas, **kwargs)
|
||||
|
@ -10,4 +10,5 @@ def test_all() -> None:
|
||||
"IndexingResult",
|
||||
"InMemoryRecordManager",
|
||||
"RecordManager",
|
||||
"UpsertResponse",
|
||||
]
|
||||
|
31
libs/core/tests/unit_tests/utils/test_aiter.py
Normal file
31
libs/core/tests/unit_tests/utils/test_aiter.py
Normal file
@ -0,0 +1,31 @@
|
||||
from typing import AsyncIterator, List
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_core.utils.aiter import abatch_iterate
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input_size, input_iterable, expected_output",
|
||||
[
|
||||
(2, [1, 2, 3, 4, 5], [[1, 2], [3, 4], [5]]),
|
||||
(3, [10, 20, 30, 40, 50], [[10, 20, 30], [40, 50]]),
|
||||
(1, [100, 200, 300], [[100], [200], [300]]),
|
||||
(4, [], []),
|
||||
],
|
||||
)
|
||||
async def test_abatch_iterate(
|
||||
input_size: int, input_iterable: List[str], expected_output: List[str]
|
||||
) -> None:
|
||||
"""Test batching function."""
|
||||
|
||||
async def _to_async_iterable(iterable: List[str]) -> AsyncIterator[str]:
|
||||
for item in iterable:
|
||||
yield item
|
||||
|
||||
iterator_ = abatch_iterate(input_size, _to_async_iterable(input_iterable))
|
||||
|
||||
assert isinstance(iterator_, AsyncIterator)
|
||||
|
||||
output = [el async for el in iterator_]
|
||||
assert output == expected_output
|
@ -6,6 +6,8 @@ EXPECTED_ALL = [
|
||||
"convert_to_secret_str",
|
||||
"formatter",
|
||||
"get_bolded_text",
|
||||
"abatch_iterate",
|
||||
"batch_iterate",
|
||||
"get_color_mapping",
|
||||
"get_colored_text",
|
||||
"get_pydantic_field_names",
|
||||
|
0
libs/core/tests/unit_tests/vectorstores/__init__.py
Normal file
0
libs/core/tests/unit_tests/vectorstores/__init__.py
Normal file
194
libs/core/tests/unit_tests/vectorstores/test_vectorstore.py
Normal file
194
libs/core/tests/unit_tests/vectorstores/test_vectorstore.py
Normal file
@ -0,0 +1,194 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional, Sequence, Union
|
||||
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.indexing.base import UpsertResponse
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
|
||||
def test_custom_upsert_type() -> None:
|
||||
"""Test that we can override the signature of the upsert method
|
||||
of the VectorStore class without creating typing issues by violating
|
||||
the Liskov Substitution Principle.
|
||||
"""
|
||||
|
||||
class ByVector(TypedDict):
|
||||
document: Document
|
||||
vector: List[float]
|
||||
|
||||
class CustomVectorStore(VectorStore):
|
||||
def upsert(
|
||||
# This unit test verifies that the signature of the upsert method
|
||||
# specifically the items parameter can be overridden without
|
||||
# violating the Liskov Substitution Principle (and getting
|
||||
# typing errors).
|
||||
self,
|
||||
items: Union[Sequence[Document], Sequence[ByVector]],
|
||||
/,
|
||||
**kwargs: Any,
|
||||
) -> UpsertResponse:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class CustomSyncVectorStore(VectorStore):
|
||||
"""A vectorstore that only implements the synchronous methods."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.store: Dict[str, Document] = {}
|
||||
|
||||
def upsert(
|
||||
self,
|
||||
items: Sequence[Document],
|
||||
/,
|
||||
**kwargs: Any,
|
||||
) -> UpsertResponse:
|
||||
ids = []
|
||||
for item in items:
|
||||
if item.id is None:
|
||||
new_item = item.copy()
|
||||
id_: str = str(uuid.uuid4())
|
||||
new_item.id = id_
|
||||
else:
|
||||
id_ = item.id
|
||||
new_item = item
|
||||
|
||||
self.store[id_] = new_item
|
||||
ids.append(id_)
|
||||
|
||||
return {
|
||||
"succeeded": ids,
|
||||
"failed": [],
|
||||
}
|
||||
|
||||
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
||||
return [self.store[id] for id in ids if id in self.store]
|
||||
|
||||
def from_texts( # type: ignore
|
||||
cls,
|
||||
texts: List[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> CustomSyncVectorStore:
|
||||
vectorstore = CustomSyncVectorStore()
|
||||
vectorstore.add_texts(texts, metadatas=metadatas, **kwargs)
|
||||
return vectorstore
|
||||
|
||||
def similarity_search(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
) -> List[Document]:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
def test_implement_upsert() -> None:
|
||||
"""Test that we can implement the upsert method of the CustomVectorStore
|
||||
class without violating the Liskov Substitution Principle.
|
||||
"""
|
||||
|
||||
store = CustomSyncVectorStore()
|
||||
|
||||
# Check upsert with id
|
||||
assert store.upsert([Document(id="1", page_content="hello")]) == {
|
||||
"succeeded": ["1"],
|
||||
"failed": [],
|
||||
}
|
||||
|
||||
assert store.get_by_ids(["1"]) == [Document(id="1", page_content="hello")]
|
||||
|
||||
# Check upsert without id
|
||||
response = store.upsert([Document(page_content="world")])
|
||||
assert len(response["succeeded"]) == 1
|
||||
id_ = response["succeeded"][0]
|
||||
assert id_ is not None
|
||||
assert store.get_by_ids([id_]) == [Document(id=id_, page_content="world")]
|
||||
|
||||
# Check that default implementation of add_texts works
|
||||
assert store.add_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||
assert store.get_by_ids(["3", "4"]) == [
|
||||
Document(id="3", page_content="hello"),
|
||||
Document(id="4", page_content="world"),
|
||||
]
|
||||
|
||||
# Add texts without ids
|
||||
ids_ = store.add_texts(["foo", "bar"])
|
||||
assert len(ids_) == 2
|
||||
assert store.get_by_ids(ids_) == [
|
||||
Document(id=ids_[0], page_content="foo"),
|
||||
Document(id=ids_[1], page_content="bar"),
|
||||
]
|
||||
|
||||
# Add texts with metadatas
|
||||
ids_2 = store.add_texts(["foo", "bar"], metadatas=[{"foo": "bar"}] * 2)
|
||||
assert len(ids_2) == 2
|
||||
assert store.get_by_ids(ids_2) == [
|
||||
Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}),
|
||||
Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
|
||||
]
|
||||
|
||||
# Check that add_documents works
|
||||
assert store.add_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert store.add_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
async def test_aupsert_delegation_to_upsert() -> None:
|
||||
"""Test delegation to the synchronous upsert method in async execution
|
||||
if async methods are not implemented.
|
||||
"""
|
||||
store = CustomSyncVectorStore()
|
||||
|
||||
# Check upsert with id
|
||||
assert await store.aupsert([Document(id="1", page_content="hello")]) == {
|
||||
"succeeded": ["1"],
|
||||
"failed": [],
|
||||
}
|
||||
|
||||
assert await store.aget_by_ids(["1"]) == [Document(id="1", page_content="hello")]
|
||||
|
||||
# Check upsert without id
|
||||
response = await store.aupsert([Document(page_content="world")])
|
||||
assert len(response["succeeded"]) == 1
|
||||
id_ = response["succeeded"][0]
|
||||
assert id_ is not None
|
||||
assert await store.aget_by_ids([id_]) == [Document(id=id_, page_content="world")]
|
||||
|
||||
# Check that default implementation of add_texts works
|
||||
assert await store.aadd_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||
assert await store.aget_by_ids(["3", "4"]) == [
|
||||
Document(id="3", page_content="hello"),
|
||||
Document(id="4", page_content="world"),
|
||||
]
|
||||
|
||||
# Add texts without ids
|
||||
ids_ = await store.aadd_texts(["foo", "bar"])
|
||||
assert len(ids_) == 2
|
||||
assert await store.aget_by_ids(ids_) == [
|
||||
Document(id=ids_[0], page_content="foo"),
|
||||
Document(id=ids_[1], page_content="bar"),
|
||||
]
|
||||
|
||||
# Add texts with metadatas
|
||||
ids_2 = await store.aadd_texts(["foo", "bar"], metadatas=[{"foo": "bar"}] * 2)
|
||||
assert len(ids_2) == 2
|
||||
assert await store.aget_by_ids(ids_2) == [
|
||||
Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}),
|
||||
Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
|
||||
]
|
||||
|
||||
# Check that add_documents works
|
||||
assert await store.aadd_documents([Document(id="5", page_content="baz")]) == ["5"]
|
||||
|
||||
# Test add documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
assert await store.aadd_documents([original_document], ids=["6"]) == ["6"]
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
@ -46,15 +46,21 @@ class ReadWriteTestSuite(ABC):
|
||||
|
||||
def test_add_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Test adding documents into the vectorstore."""
|
||||
documents = [
|
||||
original_documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
vectorstore.add_documents(documents)
|
||||
ids = vectorstore.add_documents(original_documents)
|
||||
documents = vectorstore.similarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||
Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
|
||||
]
|
||||
# Verify that the original document object does not get mutated!
|
||||
# (e.g., an ID is added to the original document object)
|
||||
assert original_documents == [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
|
||||
@ -71,10 +77,11 @@ class ReadWriteTestSuite(ABC):
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
vectorstore.add_documents(documents, ids=["1", "2"])
|
||||
ids = vectorstore.add_documents(documents, ids=["1", "2"])
|
||||
assert ids == ["1", "2"]
|
||||
vectorstore.delete(["1"])
|
||||
documents = vectorstore.similarity_search("foo", k=1)
|
||||
assert documents == [Document(page_content="bar", metadata={"id": 2})]
|
||||
assert documents == [Document(page_content="bar", metadata={"id": 2}, id="2")]
|
||||
|
||||
def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Test that we can delete several documents at once."""
|
||||
@ -87,7 +94,7 @@ class ReadWriteTestSuite(ABC):
|
||||
vectorstore.add_documents(documents, ids=["1", "2", "3"])
|
||||
vectorstore.delete(["1", "2"])
|
||||
documents = vectorstore.similarity_search("foo", k=1)
|
||||
assert documents == [Document(page_content="baz", metadata={"id": 3})]
|
||||
assert documents == [Document(page_content="baz", metadata={"id": 3}, id="3")]
|
||||
|
||||
def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
|
||||
"""Deleting missing content should not raise an exception."""
|
||||
@ -106,25 +113,8 @@ class ReadWriteTestSuite(ABC):
|
||||
vectorstore.add_documents(documents, ids=["1", "2"])
|
||||
documents = vectorstore.similarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
]
|
||||
|
||||
def test_add_documents_without_ids_gets_duplicated(
|
||||
self, vectorstore: VectorStore
|
||||
) -> None:
|
||||
"""Adding documents without specifying IDs should duplicate content."""
|
||||
documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
vectorstore.add_documents(documents)
|
||||
vectorstore.add_documents(documents)
|
||||
documents = vectorstore.similarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}, id="2"),
|
||||
Document(page_content="foo", metadata={"id": 1}, id="1"),
|
||||
]
|
||||
|
||||
def test_add_documents_by_id_with_mutation(self, vectorstore: VectorStore) -> None:
|
||||
@ -149,9 +139,11 @@ class ReadWriteTestSuite(ABC):
|
||||
documents = vectorstore.similarity_search("new foo", k=2)
|
||||
assert documents == [
|
||||
Document(
|
||||
page_content="new foo", metadata={"id": 1, "some_other_field": "foo"}
|
||||
id="1",
|
||||
page_content="new foo",
|
||||
metadata={"id": 1, "some_other_field": "foo"},
|
||||
),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(id="2", page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
|
||||
@ -190,15 +182,22 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
|
||||
async def test_add_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Test adding documents into the vectorstore."""
|
||||
documents = [
|
||||
original_documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
await vectorstore.aadd_documents(documents)
|
||||
ids = await vectorstore.aadd_documents(original_documents)
|
||||
documents = await vectorstore.asimilarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}, id=ids[1]),
|
||||
Document(page_content="foo", metadata={"id": 1}, id=ids[0]),
|
||||
]
|
||||
|
||||
# Verify that the original document object does not get mutated!
|
||||
# (e.g., an ID is added to the original document object)
|
||||
assert original_documents == [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
async def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None:
|
||||
@ -215,10 +214,11 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
||||
ids = await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
||||
assert ids == ["1", "2"]
|
||||
await vectorstore.adelete(["1"])
|
||||
documents = await vectorstore.asimilarity_search("foo", k=1)
|
||||
assert documents == [Document(page_content="bar", metadata={"id": 2})]
|
||||
assert documents == [Document(page_content="bar", metadata={"id": 2}, id="2")]
|
||||
|
||||
async def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None:
|
||||
"""Test that we can delete several documents at once."""
|
||||
@ -231,7 +231,7 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
await vectorstore.aadd_documents(documents, ids=["1", "2", "3"])
|
||||
await vectorstore.adelete(["1", "2"])
|
||||
documents = await vectorstore.asimilarity_search("foo", k=1)
|
||||
assert documents == [Document(page_content="baz", metadata={"id": 3})]
|
||||
assert documents == [Document(page_content="baz", metadata={"id": 3}, id="3")]
|
||||
|
||||
async def test_delete_missing_content(self, vectorstore: VectorStore) -> None:
|
||||
"""Deleting missing content should not raise an exception."""
|
||||
@ -250,25 +250,8 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
await vectorstore.aadd_documents(documents, ids=["1", "2"])
|
||||
documents = await vectorstore.asimilarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
]
|
||||
|
||||
async def test_add_documents_without_ids_gets_duplicated(
|
||||
self, vectorstore: VectorStore
|
||||
) -> None:
|
||||
"""Adding documents without specifying IDs should duplicate content."""
|
||||
documents = [
|
||||
Document(page_content="foo", metadata={"id": 1}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
||||
await vectorstore.aadd_documents(documents)
|
||||
await vectorstore.aadd_documents(documents)
|
||||
documents = await vectorstore.asimilarity_search("bar", k=2)
|
||||
assert documents == [
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(page_content="bar", metadata={"id": 2}, id="2"),
|
||||
Document(page_content="foo", metadata={"id": 1}, id="1"),
|
||||
]
|
||||
|
||||
async def test_add_documents_by_id_with_mutation(
|
||||
@ -295,7 +278,9 @@ class AsyncReadWriteTestSuite(ABC):
|
||||
documents = await vectorstore.asimilarity_search("new foo", k=2)
|
||||
assert documents == [
|
||||
Document(
|
||||
page_content="new foo", metadata={"id": 1, "some_other_field": "foo"}
|
||||
id="1",
|
||||
page_content="new foo",
|
||||
metadata={"id": 1, "some_other_field": "foo"},
|
||||
),
|
||||
Document(page_content="bar", metadata={"id": 2}),
|
||||
Document(id="2", page_content="bar", metadata={"id": 2}),
|
||||
]
|
||||
|
Loading…
Reference in New Issue
Block a user