From e0605b464b5672e02da1bf8a0680d644bac758b6 Mon Sep 17 00:00:00 2001 From: HenriZuber Date: Wed, 21 Jun 2023 19:49:01 +0200 Subject: [PATCH] feat: faiss filter from list (#6537) ### Feature Using FAISS on a retrievalQA task, I found myself wanting to allow multiple sources. As I understand it, the filter feature takes a dict of the form {key: value} and keeps only documents whose metadata has exactly that value for that key. I added logic so that a filter value can also be a list, in which case a document matches if its metadata value for that key is any element of the list. Passing a single exact value still works. Here's an example of how I could then use it in my own project: ``` pdfs_to_filter_in = ["file_A", "file_B"] filter_dict = { "source": [f"source_pdfs/{pdf_name}.pdf" for pdf_name in pdfs_to_filter_in] } retriever = db.as_retriever() retriever.search_kwargs = {"filter": filter_dict} ``` I added an integration test, `test_faiss_with_metadatas_and_list_filter()`, modeled on the existing ones in `tests/integration_tests/vectorstores/test_faiss.py`. I don't think this warrants its own notebook or doc page, but I'm open to suggestions if needed. Co-authored-by: Dev 2049 --- langchain/vectorstores/faiss.py | 8 ++++-- .../vectorstores/test_faiss.py | 28 +++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/langchain/vectorstores/faiss.py b/langchain/vectorstores/faiss.py index 6dd7fd54..1489bf8f 100644 --- a/langchain/vectorstores/faiss.py +++ b/langchain/vectorstores/faiss.py @@ -192,7 +192,7 @@ class FAISS(VectorStore): Args: embedding: Embedding vector to look up documents similar to. k: Number of Documents to return. Defaults to 4. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None. fetch_k: (Optional[int]) Number of Documents to fetch before filtering. Defaults to 20. **kwargs: kwargs to be passed to similarity search. 
Can include: @@ -218,7 +218,11 @@ class FAISS(VectorStore): if not isinstance(doc, Document): raise ValueError(f"Could not find document for id {_id}, got {doc}") if filter is not None: - if all(doc.metadata.get(key) == value for key, value in filter.items()): + filter = { + key: [value] if not isinstance(value, list) else value + for key, value in filter.items() + } + if all(doc.metadata.get(key) in value for key, value in filter.items()): docs.append((doc, scores[0][j])) else: docs.append((doc, scores[0][j])) diff --git a/tests/integration_tests/vectorstores/test_faiss.py b/tests/integration_tests/vectorstores/test_faiss.py index d33e6d78..37a66e8e 100644 --- a/tests/integration_tests/vectorstores/test_faiss.py +++ b/tests/integration_tests/vectorstores/test_faiss.py @@ -96,6 +96,34 @@ def test_faiss_with_metadatas_and_filter() -> None: assert output == [Document(page_content="bar", metadata={"page": 1})] +def test_faiss_with_metadatas_and_list_filter() -> None: + texts = ["foo", "bar", "baz", "foo", "qux"] + metadatas = [{"page": i} if i <= 3 else {"page": 3} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + expected_docstore = InMemoryDocstore( + { + docsearch.index_to_docstore_id[0]: Document( + page_content="foo", metadata={"page": 0} + ), + docsearch.index_to_docstore_id[1]: Document( + page_content="bar", metadata={"page": 1} + ), + docsearch.index_to_docstore_id[2]: Document( + page_content="baz", metadata={"page": 2} + ), + docsearch.index_to_docstore_id[3]: Document( + page_content="foo", metadata={"page": 3} + ), + docsearch.index_to_docstore_id[4]: Document( + page_content="qux", metadata={"page": 3} + ), + } + ) + assert docsearch.docstore.__dict__ == expected_docstore.__dict__ + output = docsearch.similarity_search("foor", k=1, filter={"page": [0, 1, 2]}) + assert output == [Document(page_content="foo", metadata={"page": 0})] + + def test_faiss_search_not_found() -> None: """Test what happens 
when document is not found.""" texts = ["foo", "bar", "baz"]