From 44dc9595848f2d126a9788e121438d189a48f143 Mon Sep 17 00:00:00 2001 From: Andreas Liebschner Date: Mon, 22 May 2023 20:42:54 +0200 Subject: [PATCH] Improve pinecone hybrid search retriever adding metadata support (#5098) # Improve pinecone hybrid search retriever adding metadata support I simply remove the hardwiring of metadata in the existing implementation, allowing one to pass a `metadatas` argument to `create_index` and `add_texts`; `get_relevant_documents` now returns that metadata on each `Document`. I also add one missing pip install to the accompanying notebook (I am not adding dependencies; they were pre-existing). First contribution, just hoping to help, feel free to critique :) My Twitter username is `@andreliebschner`. While looking at hybrid search I noticed #3043 and #1743. I think the former can be closed, as following the example right now (even prior to my improvements) works just fine; the latter, I think, can also be closed safely, maybe pointing out the relevant classes and example. Should I reply to those issues mentioning someone? 
@dev2049, @hwchase17 --------- Co-authored-by: Andreas Liebschner --- .../examples/pinecone_hybrid_search.ipynb | 2 +- .../retrievers/pinecone_hybrid_search.py | 31 ++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/docs/modules/indexes/retrievers/examples/pinecone_hybrid_search.ipynb b/docs/modules/indexes/retrievers/examples/pinecone_hybrid_search.ipynb index 1e07b4a8..9d3fa491 100644 --- a/docs/modules/indexes/retrievers/examples/pinecone_hybrid_search.ipynb +++ b/docs/modules/indexes/retrievers/examples/pinecone_hybrid_search.ipynb @@ -24,7 +24,7 @@ "metadata": {}, "outputs": [], "source": [ - "#!pip install pinecone-client" + "#!pip install pinecone-client pinecone-text" ] }, { diff --git a/langchain/retrievers/pinecone_hybrid_search.py b/langchain/retrievers/pinecone_hybrid_search.py index c4ad39e5..bd04a296 100644 --- a/langchain/retrievers/pinecone_hybrid_search.py +++ b/langchain/retrievers/pinecone_hybrid_search.py @@ -18,6 +18,7 @@ def create_index( embeddings: Embeddings, sparse_encoder: Any, ids: Optional[List[str]] = None, + metadatas: Optional[List[dict]] = None, ) -> None: batch_size = 32 _iterator = range(0, len(contexts), batch_size) @@ -38,8 +39,15 @@ def create_index( # extract batch context_batch = contexts[i:i_end] batch_ids = ids[i:i_end] + metadata_batch = ( + metadatas[i:i_end] if metadatas else [{} for _ in context_batch] + ) # add context passages as metadata - meta = [{"context": context} for context in context_batch] + meta = [ + {"context": context, **metadata} + for context, metadata in zip(context_batch, metadata_batch) + ] + # create dense vectors dense_embeds = embeddings.embed_documents(context_batch) # create sparse vectors @@ -78,8 +86,20 @@ class PineconeHybridSearchRetriever(BaseRetriever, BaseModel): extra = Extra.forbid arbitrary_types_allowed = True - def add_texts(self, texts: List[str], ids: Optional[List[str]] = None) -> None: - create_index(texts, self.index, self.embeddings, 
self.sparse_encoder, ids=ids) + def add_texts( + self, + texts: List[str], + ids: Optional[List[str]] = None, + metadatas: Optional[List[dict]] = None, + ) -> None: + create_index( + texts, + self.index, + self.embeddings, + self.sparse_encoder, + ids=ids, + metadatas=metadatas, + ) @root_validator() def validate_environment(cls, values: Dict) -> Dict: @@ -114,7 +134,10 @@ class PineconeHybridSearchRetriever(BaseRetriever, BaseModel): ) final_result = [] for res in result["matches"]: - final_result.append(Document(page_content=res["metadata"]["context"])) + context = res["metadata"].pop("context") + final_result.append( + Document(page_content=context, metadata=res["metadata"]) + ) # return search results as json return final_result