community[patch]: Added missing from_documents method to KNNRetriever. (#18411)

- Description: Added the missing `from_documents` method to `KNNRetriever`.
This makes it possible to supply metadata through LangChain `Document`s and
brings `KNNRetriever` to parity with the other retrievers, which already
provide `from_documents`.
- Issue: None
- Dependencies: None
- Twitter handle: None

Co-authored-by: Victor Adan <vadan@netroadshow.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
pull/18218/head^2
Victor Adan 3 months ago committed by GitHub
parent dfc4177b50
commit afa2d85405
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -5,7 +5,7 @@ https://github.com/karpathy/randomfun/blob/master/knn_vs_svm.ipynb"""
from __future__ import annotations
import concurrent.futures
from typing import Any, List, Optional
from typing import Any, Iterable, List, Optional
import numpy as np
from langchain_core.callbacks import CallbackManagerForRetrieverRun
@ -38,6 +38,8 @@ class KNNRetriever(BaseRetriever):
"""Index of embeddings."""
texts: List[str]
"""List of texts to index."""
metadatas: Optional[List[dict]] = None
"""List of metadatas corresponding with each text."""
k: int = 4
"""Number of results to return."""
relevancy_threshold: Optional[float] = None
@ -51,10 +53,32 @@ class KNNRetriever(BaseRetriever):
@classmethod
def from_texts(
cls, texts: List[str], embeddings: Embeddings, **kwargs: Any
cls,
texts: List[str],
embeddings: Embeddings,
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> KNNRetriever:
index = create_index(texts, embeddings)
return cls(embeddings=embeddings, index=index, texts=texts, **kwargs)
return cls(
embeddings=embeddings,
index=index,
texts=texts,
metadatas=metadatas,
**kwargs,
)
@classmethod
def from_documents(
    cls,
    documents: Iterable[Document],
    embeddings: Embeddings,
    **kwargs: Any,
) -> KNNRetriever:
    """Build a retriever from documents, keeping each document's metadata.

    Each document contributes its ``page_content`` as a text and its
    ``metadata`` as the metadata entry at the same position; construction
    is then delegated to ``from_texts``.

    Args:
        documents: Documents to index. Any iterable is accepted, including
            a one-shot generator.
        embeddings: Embeddings implementation forwarded to ``from_texts``.
        **kwargs: Extra keyword arguments forwarded to ``from_texts``.

    Returns:
        The retriever produced by ``from_texts``.
    """
    # Materialise once: ``documents`` may be a generator, and two passes
    # over it are needed below.
    docs = list(documents)
    # Build real lists rather than unpacking ``zip(*...)``: that form raised
    # ValueError on empty input and handed tuples to ``from_texts``, whose
    # parameters are annotated as lists.
    # NOTE(review): an empty input is now forwarded as two empty lists; how
    # ``from_texts`` handles that is not visible here - confirm.
    texts = [doc.page_content for doc in docs]
    metadatas = [doc.metadata for doc in docs]
    return cls.from_texts(
        texts=texts, embeddings=embeddings, metadatas=metadatas, **kwargs
    )
def _get_relevant_documents(
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
@ -71,7 +95,10 @@ class KNNRetriever(BaseRetriever):
normalized_similarities = (similarities - np.min(similarities)) / denominator
top_k_results = [
Document(page_content=self.texts[row])
Document(
page_content=self.texts[row],
metadata=self.metadatas[row] if self.metadatas else {},
)
for row in sorted_ix[0 : self.k]
if (
self.relevancy_threshold is None

@ -1,3 +1,5 @@
from langchain_core.documents import Document
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.retrievers.knn import KNNRetriever
@ -9,3 +11,19 @@ class TestKNNRetriever:
texts=input_texts, embeddings=FakeEmbeddings(size=100)
)
assert len(knn_retriever.texts) == 3
def test_from_documents(self) -> None:
    """Check that ``from_documents`` keeps texts and metadata in input order."""
    expected_texts = [
        "I have a pen.",
        "Do you have a pen?",
        "I have a bag.",
    ]
    # Fresh dicts on each side so the comparison never shares objects with
    # the documents handed to the retriever.
    expected_metadatas = [{"page": number} for number in (1, 2, 3)]
    input_docs = [
        Document(page_content=content, metadata={"page": number})
        for number, content in enumerate(expected_texts, start=1)
    ]

    knn_retriever = KNNRetriever.from_documents(
        documents=input_docs, embeddings=FakeEmbeddings(size=100)
    )

    assert knn_retriever.texts == expected_texts
    assert knn_retriever.metadatas == expected_metadatas

Loading…
Cancel
Save