forked from Archives/langchain
Improve pinecone hybrid search retriever adding metadata support (#5098)
# Improve pinecone hybrid search retriever adding metadata support I simply remove the hardwiring of metadata to the existing implementation allowing one to pass `metadatas` attribute to the constructors and in `get_relevant_documents`. I also add one missing pip install to the accompanying notebook (I am not adding dependencies, they were pre-existing). First contribution, just hoping to help, feel free to critique :) my twitter username is `@andreliebschner` While looking at hybrid search I noticed #3043 and #1743. I think the former can be closed as following the example right now (even prior to my improvements) works just fine, the latter I think can be also closed safely, maybe pointing out the relevant classes and example. Should I reply those issues mentioning someone? @dev2049, @hwchase17 --------- Co-authored-by: Andreas Liebschner <a.liebschner@shopfully.com>
This commit is contained in:
parent
5cd12102be
commit
44dc959584
@ -24,7 +24,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install pinecone-client"
|
||||
"#!pip install pinecone-client pinecone-text"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -18,6 +18,7 @@ def create_index(
|
||||
embeddings: Embeddings,
|
||||
sparse_encoder: Any,
|
||||
ids: Optional[List[str]] = None,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
) -> None:
|
||||
batch_size = 32
|
||||
_iterator = range(0, len(contexts), batch_size)
|
||||
@ -38,8 +39,15 @@ def create_index(
|
||||
# extract batch
|
||||
context_batch = contexts[i:i_end]
|
||||
batch_ids = ids[i:i_end]
|
||||
metadata_batch = (
|
||||
metadatas[i:i_end] if metadatas else [{} for _ in context_batch]
|
||||
)
|
||||
# add context passages as metadata
|
||||
meta = [{"context": context} for context in context_batch]
|
||||
meta = [
|
||||
{"context": context, **metadata}
|
||||
for context, metadata in zip(context_batch, metadata_batch)
|
||||
]
|
||||
|
||||
# create dense vectors
|
||||
dense_embeds = embeddings.embed_documents(context_batch)
|
||||
# create sparse vectors
|
||||
@ -78,8 +86,20 @@ class PineconeHybridSearchRetriever(BaseRetriever, BaseModel):
|
||||
extra = Extra.forbid
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
def add_texts(self, texts: List[str], ids: Optional[List[str]] = None) -> None:
|
||||
create_index(texts, self.index, self.embeddings, self.sparse_encoder, ids=ids)
|
||||
def add_texts(
|
||||
self,
|
||||
texts: List[str],
|
||||
ids: Optional[List[str]] = None,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
) -> None:
|
||||
create_index(
|
||||
texts,
|
||||
self.index,
|
||||
self.embeddings,
|
||||
self.sparse_encoder,
|
||||
ids=ids,
|
||||
metadatas=metadatas,
|
||||
)
|
||||
|
||||
@root_validator()
|
||||
def validate_environment(cls, values: Dict) -> Dict:
|
||||
@ -114,7 +134,10 @@ class PineconeHybridSearchRetriever(BaseRetriever, BaseModel):
|
||||
)
|
||||
final_result = []
|
||||
for res in result["matches"]:
|
||||
final_result.append(Document(page_content=res["metadata"]["context"]))
|
||||
context = res["metadata"].pop("context")
|
||||
final_result.append(
|
||||
Document(page_content=context, metadata=res["metadata"])
|
||||
)
|
||||
# return search results as json
|
||||
return final_result
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user