feat: faiss filter from list (#6537)

### Feature

Using FAISS on a retrievalQA task, I found myself wanting to allow in
multiple sources. From what I understood, the filter feature takes in a
dict of form {key: value} which then will check in the metadata for the
exact value linked to that key.
I added some logic to be able to pass a list which will be checked
against instead of an exact value. Passing an exact value will also
work.

Here's an example of how I could then use it in my own project:

```
    pdfs_to_filter_in = ["file_A", "file_B"]
    filter_dict = {
        "source": [f"source_pdfs/{pdf_name}.pdf" for pdf_name in pdfs_to_filter_in]
    }
    retriever = db.as_retriever()
    retriever.search_kwargs = {"filter": filter_dict}
```

I added an integration test based on the other ones I found in
`tests/integration_tests/vectorstores/test_faiss.py` under
`test_faiss_with_metadatas_and_list_filter()`.

It doesn't feel like this is worthy of its own notebook or doc, but I'm
open to suggestions if needed.

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
multi_strategy_parser
HenriZuber 11 months ago committed by GitHub
parent 00a7403236
commit e0605b464b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -192,7 +192,7 @@ class FAISS(VectorStore):
Args:
embedding: Embedding vector to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
Defaults to 20.
**kwargs: kwargs to be passed to similarity search. Can include:
@ -218,7 +218,11 @@ class FAISS(VectorStore):
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
if filter is not None:
if all(doc.metadata.get(key) == value for key, value in filter.items()):
filter = {
key: [value] if not isinstance(value, list) else value
for key, value in filter.items()
}
if all(doc.metadata.get(key) in value for key, value in filter.items()):
docs.append((doc, scores[0][j]))
else:
docs.append((doc, scores[0][j]))

@ -96,6 +96,34 @@ def test_faiss_with_metadatas_and_filter() -> None:
assert output == [Document(page_content="bar", metadata={"page": 1})]
def test_faiss_with_metadatas_and_list_filter() -> None:
texts = ["foo", "bar", "baz", "foo", "qux"]
metadatas = [{"page": i} if i <= 3 else {"page": 3} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
expected_docstore = InMemoryDocstore(
{
docsearch.index_to_docstore_id[0]: Document(
page_content="foo", metadata={"page": 0}
),
docsearch.index_to_docstore_id[1]: Document(
page_content="bar", metadata={"page": 1}
),
docsearch.index_to_docstore_id[2]: Document(
page_content="baz", metadata={"page": 2}
),
docsearch.index_to_docstore_id[3]: Document(
page_content="foo", metadata={"page": 3}
),
docsearch.index_to_docstore_id[4]: Document(
page_content="qux", metadata={"page": 3}
),
}
)
assert docsearch.docstore.__dict__ == expected_docstore.__dict__
output = docsearch.similarity_search("foor", k=1, filter={"page": [0, 1, 2]})
assert output == [Document(page_content="foo", metadata={"page": 0})]
def test_faiss_search_not_found() -> None:
"""Test what happens when document is not found."""
texts = ["foo", "bar", "baz"]

Loading…
Cancel
Save