From 4f19ba306597eb753ea397d4b646dc75c2668cbe Mon Sep 17 00:00:00 2001 From: Sam Chou Date: Tue, 19 Sep 2023 16:10:29 -0700 Subject: [PATCH] Azure Search: Remove select field restrictions and expand metadata to other fields, also expose kwargs to searches (#9894) Description: If the metadata field is returned in results, the previous behavior is unchanged. If the metadata field does not exist in results, metadata is expanded to include any returned fields other than the content field. There's precedent for this as well, see the retriever: https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/retrievers/azure_cognitive_search.py#L96C46-L96C46 Issue: #9765 - Ameliorates hard-coding in case you already indexed to cognitive search without a metadata field but rather placed metadata in separate fields. @hwchase17 --- .../langchain/vectorstores/azuresearch.py | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/libs/langchain/langchain/vectorstores/azuresearch.py b/libs/langchain/langchain/vectorstores/azuresearch.py index da71a81bfb..9883646ecb 100644 --- a/libs/langchain/langchain/vectorstores/azuresearch.py +++ b/libs/langchain/langchain/vectorstores/azuresearch.py @@ -378,15 +378,18 @@ class AzureSearch(VectorStore): fields=FIELDS_CONTENT_VECTOR, ) ], - select=[FIELDS_ID, FIELDS_CONTENT, FIELDS_METADATA], filter=filters, ) # Convert results to Document objects docs = [ ( Document( - page_content=result[FIELDS_CONTENT], - metadata=json.loads(result[FIELDS_METADATA]), + page_content=result.pop(FIELDS_CONTENT), + metadata=json.loads(result[FIELDS_METADATA]) + if FIELDS_METADATA in result + else { + k: v for k, v in result.items() if k != FIELDS_CONTENT_VECTOR + }, ), float(result["@search.score"]), ) @@ -435,7 +438,6 @@ class AzureSearch(VectorStore): fields=FIELDS_CONTENT_VECTOR, ) ], - select=[FIELDS_ID, FIELDS_CONTENT, FIELDS_METADATA], filter=filters, top=k, ) @@ -443,8 +445,12 @@ class AzureSearch(VectorStore): docs = [ ( Document( - 
page_content=result[FIELDS_CONTENT], - metadata=json.loads(result[FIELDS_METADATA]), + page_content=result.pop(FIELDS_CONTENT), + metadata=json.loads(result[FIELDS_METADATA]) + if FIELDS_METADATA in result + else { + k: v for k, v in result.items() if k != FIELDS_CONTENT_VECTOR + }, ), float(result["@search.score"]), ) @@ -495,7 +501,6 @@ class AzureSearch(VectorStore): fields=FIELDS_CONTENT_VECTOR, ) ], - select=[FIELDS_ID, FIELDS_CONTENT, FIELDS_METADATA], filter=filters, query_type="semantic", query_language=self.semantic_query_language, @@ -516,9 +521,17 @@ class AzureSearch(VectorStore): docs = [ ( Document( - page_content=result["content"], + page_content=result.pop(FIELDS_CONTENT), metadata={ - **json.loads(result["metadata"]), + **( + json.loads(result[FIELDS_METADATA]) + if FIELDS_METADATA in result + else { + k: v + for k, v in result.items() + if k != FIELDS_CONTENT_VECTOR + } + ), **{ "captions": { "text": result.get("@search.captions", [{}])[0].text, @@ -568,7 +581,7 @@ class AzureSearchVectorStoreRetriever(BaseRetriever): vectorstore: AzureSearch """Azure Search instance used to find similar documents.""" search_type: str = "hybrid" - """Type of search to perform. Options are "similarity", "hybrid", + """Type of search to perform. 
Options are "similarity", "hybrid", "semantic_hybrid".""" k: int = 4 """Number of documents to return.""" @@ -590,15 +603,15 @@ class AzureSearchVectorStoreRetriever(BaseRetriever): def _get_relevant_documents( self, query: str, - *, run_manager: CallbackManagerForRetrieverRun, + **kwargs: Any, ) -> List[Document]: if self.search_type == "similarity": - docs = self.vectorstore.vector_search(query, k=self.k) + docs = self.vectorstore.vector_search(query, k=self.k, **kwargs) elif self.search_type == "hybrid": - docs = self.vectorstore.hybrid_search(query, k=self.k) + docs = self.vectorstore.hybrid_search(query, k=self.k, **kwargs) elif self.search_type == "semantic_hybrid": - docs = self.vectorstore.semantic_hybrid_search(query, k=self.k) + docs = self.vectorstore.semantic_hybrid_search(query, k=self.k, **kwargs) else: raise ValueError(f"search_type of {self.search_type} not allowed.") return docs