mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
community : [bugfix] Use document ids as keys in AzureSearch vectorstore (#25486)
# Description [Vector store base class](4cdaca67dc/libs/core/langchain_core/vectorstores/base.py (L65)
) currently expects `ids` to be passed in and that is what it passes along to the AzureSearch vector store when attempting to `add_texts()`. However AzureSearch expects `keys` to be passed in. When they are not present, AzureSearch `add_embeddings()` makes up new uuids. This is a problem when trying to run indexing. [Indexing code expects](b297af5482/libs/core/langchain_core/indexing/api.py (L371)
) the documents to be uploaded using provided ids. Currently AzureSearch ignores `ids` passed from `indexing` and makes up new ones. Later when `indexer` attempts to delete removed file, it uses the `id` it had stored when uploading the document, however it was uploaded under different `id`. **Twitter handle: @martintriska1**
This commit is contained in:
parent
a8561bc303
commit
3fc0ea510e
@ -443,6 +443,12 @@ class AzureSearch(VectorStore):
|
||||
logger.debug("Nothing to insert, skipping.")
|
||||
return []
|
||||
|
||||
# when `keys` are not passed in and there is `ids` in kwargs, use those instead
|
||||
# base class expects `ids` passed in rather than `keys`
|
||||
# https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65
|
||||
if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)):
|
||||
keys = kwargs["ids"]
|
||||
|
||||
return self.add_embeddings(zip(texts, embeddings), metadatas, keys=keys)
|
||||
|
||||
async def aadd_texts(
|
||||
@ -467,6 +473,12 @@ class AzureSearch(VectorStore):
|
||||
logger.debug("Nothing to insert, skipping.")
|
||||
return []
|
||||
|
||||
# when `keys` are not passed in and there is `ids` in kwargs, use those instead
|
||||
# base class expects `ids` passed in rather than `keys`
|
||||
# https://github.com/langchain-ai/langchain/blob/4cdaca67dc51dba887289f56c6fead3c1a52f97d/libs/core/langchain_core/vectorstores/base.py#L65
|
||||
if (not keys) and ("ids" in kwargs) and (len(kwargs["ids"]) == len(embeddings)):
|
||||
keys = kwargs["ids"]
|
||||
|
||||
return await self.aadd_embeddings(zip(texts, embeddings), metadatas, keys=keys)
|
||||
|
||||
def add_embeddings(
|
||||
@ -483,9 +495,13 @@ class AzureSearch(VectorStore):
|
||||
data = []
|
||||
for i, (text, embedding) in enumerate(text_embeddings):
|
||||
# Use provided key otherwise use default key
|
||||
key = keys[i] if keys else str(uuid.uuid4())
|
||||
# Encoding key for Azure Search valid characters
|
||||
key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
|
||||
if keys:
|
||||
key = keys[i]
|
||||
else:
|
||||
key = str(uuid.uuid4())
|
||||
# Encoding key for Azure Search valid characters
|
||||
key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
|
||||
|
||||
metadata = metadatas[i] if metadatas else {}
|
||||
# Add data to index
|
||||
# Additional metadata to fields mapping
|
||||
|
@ -190,3 +190,40 @@ def test_additional_search_options() -> None:
|
||||
)
|
||||
assert vector_store.client is not None
|
||||
assert vector_store.client._api_version == "test"
|
||||
|
||||
|
||||
@pytest.mark.requires("azure.search.documents")
|
||||
def test_ids_used_correctly() -> None:
|
||||
"""Check whether vector store uses the document ids when provided with them."""
|
||||
from azure.search.documents import SearchClient
|
||||
from azure.search.documents.indexes import SearchIndexClient
|
||||
from langchain_core.documents import Document
|
||||
|
||||
class Response:
|
||||
def __init__(self) -> None:
|
||||
self.succeeded: bool = True
|
||||
|
||||
def mock_upload_documents(self, documents: List[object]) -> List[Response]: # type: ignore[no-untyped-def]
|
||||
# assume all documents uploaded successfuly
|
||||
response = [Response() for _ in documents]
|
||||
return response
|
||||
|
||||
documents = [
|
||||
Document(
|
||||
page_content="page zero Lorem Ipsum",
|
||||
metadata={"source": "document.pdf", "page": 0, "id": "ID-document-1"},
|
||||
),
|
||||
Document(
|
||||
page_content="page one Lorem Ipsum",
|
||||
metadata={"source": "document.pdf", "page": 1, "id": "ID-document-2"},
|
||||
),
|
||||
]
|
||||
ids_provided = [i.metadata.get("id") for i in documents]
|
||||
|
||||
with patch.object(
|
||||
SearchClient, "upload_documents", mock_upload_documents
|
||||
), patch.object(SearchIndexClient, "get_index", mock_default_index):
|
||||
vector_store = create_vector_store()
|
||||
ids_used_at_upload = vector_store.add_documents(documents, ids=ids_provided)
|
||||
assert len(ids_provided) == len(ids_used_at_upload)
|
||||
assert ids_provided == ids_used_at_upload
|
||||
|
Loading…
Reference in New Issue
Block a user