From 3b754b54618beafcc3bf42827c291ed553361ce9 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Fri, 11 Aug 2023 01:10:00 -0700
Subject: [PATCH] Bagatur/filter metadata (#9015)

Co-authored-by: Matt Robinson
---
 .../langchain/vectorstores/chroma.py          | 22 ++++--
 .../langchain/langchain/vectorstores/utils.py | 23 +++++-
 .../unit_tests/vectorstores/test_utils.py     | 73 ++++++++++++++++++-
 3 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/libs/langchain/langchain/vectorstores/chroma.py b/libs/langchain/langchain/vectorstores/chroma.py
index 59edb26831..9457419c56 100644
--- a/libs/langchain/langchain/vectorstores/chroma.py
+++ b/libs/langchain/langchain/vectorstores/chroma.py
@@ -205,12 +205,22 @@ class Chroma(VectorStore):
                     [embeddings[idx] for idx in non_empty_ids] if embeddings else None
                 )
                 ids_with_metadata = [ids[idx] for idx in non_empty_ids]
-                self._collection.upsert(
-                    metadatas=metadatas,
-                    embeddings=embeddings_with_metadatas,
-                    documents=texts_with_metadatas,
-                    ids=ids_with_metadata,
-                )
+                try:
+                    self._collection.upsert(
+                        metadatas=metadatas,
+                        embeddings=embeddings_with_metadatas,
+                        documents=texts_with_metadatas,
+                        ids=ids_with_metadata,
+                    )
+                except ValueError as e:
+                    if "Expected metadata value to be" in str(e):
+                        msg = (
+                            "Try filtering complex metadata from the document using "
+                            "langchain.vectorstore.utils.filter_complex_metadata."
+                        )
+                        raise ValueError(e.args[0] + "\n\n" + msg)
+                    else:
+                        raise e
             if empty_ids:
                 texts_without_metadatas = [texts[j] for j in empty_ids]
                 embeddings_without_metadatas = (
diff --git a/libs/langchain/langchain/vectorstores/utils.py b/libs/langchain/langchain/vectorstores/utils.py
index 539a1feda5..64ecf13771 100644
--- a/libs/langchain/langchain/vectorstores/utils.py
+++ b/libs/langchain/langchain/vectorstores/utils.py
@@ -1,10 +1,11 @@
 """Utility functions for working with vectors and vectorstores."""
 
 from enum import Enum
-from typing import List
+from typing import List, Tuple, Type
 
 import numpy as np
 
+from langchain.docstore.document import Document
 from langchain.utils.math import cosine_similarity
 
 
@@ -51,3 +52,23 @@ def maximal_marginal_relevance(
         idxs.append(idx_to_add)
         selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
     return idxs
+
+
+def filter_complex_metadata(
+    documents: List[Document],
+    *,
+    allowed_types: Tuple[Type, ...] = (str, bool, int, float)
+) -> List[Document]:
+    """Filter out metadata types that are not supported for a vector store."""
+    updated_documents = []
+    for document in documents:
+        filtered_metadata = {}
+        for key, value in document.metadata.items():
+            if not isinstance(value, allowed_types):
+                continue
+            filtered_metadata[key] = value
+
+        document.metadata = filtered_metadata
+        updated_documents.append(document)
+
+    return updated_documents
diff --git a/libs/langchain/tests/unit_tests/vectorstores/test_utils.py b/libs/langchain/tests/unit_tests/vectorstores/test_utils.py
index 6ad76e424d..907ed2cde3 100644
--- a/libs/langchain/tests/unit_tests/vectorstores/test_utils.py
+++ b/libs/langchain/tests/unit_tests/vectorstores/test_utils.py
@@ -1,7 +1,11 @@
 """Test vector store utility functions."""
 import numpy as np
 
-from langchain.vectorstores.utils import maximal_marginal_relevance
+from langchain.docstore.document import Document
+from langchain.vectorstores.utils import (
+    filter_complex_metadata,
+    maximal_marginal_relevance,
+)
 
 
 def test_maximal_marginal_relevance_lambda_zero() -> None:
@@ -52,3 +56,70 @@ def test_maximal_marginal_relevance_query_dim() -> None:
     first = maximal_marginal_relevance(query_embedding, embedding_list)
     second = maximal_marginal_relevance(query_embedding_2d, embedding_list)
     assert first == second
+
+
+def test_filter_list_metadata() -> None:
+    documents = [
+        Document(
+            page_content="",
+            metadata={
+                "key1": "this is a string!",
+                "key2": ["a", "list", "of", "strings"],
+            },
+        ),
+        Document(
+            page_content="",
+            metadata={
+                "key1": "this is another string!",
+                "key2": {"foo"},
+            },
+        ),
+        Document(
+            page_content="",
+            metadata={
+                "key1": "this is another string!",
+                "key2": {"foo": "bar"},
+            },
+        ),
+        Document(
+            page_content="",
+            metadata={
+                "key1": "this is another string!",
+                "key2": True,
+            },
+        ),
+        Document(
+            page_content="",
+            metadata={
+                "key1": "this is another string!",
+                "key2": 1,
+            },
+        ),
+        Document(
+            page_content="",
+            metadata={
+                "key1": "this is another string!",
+                "key2": 1.0,
+            },
+        ),
+        Document(
+            page_content="",
+            metadata={
+                "key1": "this is another string!",
+                "key2": "foo",
+            },
+        ),
+    ]
+
+    updated_documents = filter_complex_metadata(documents)
+    filtered_metadata = [doc.metadata for doc in updated_documents]
+
+    assert filtered_metadata == [
+        {"key1": "this is a string!"},
+        {"key1": "this is another string!"},
+        {"key1": "this is another string!"},
+        {"key1": "this is another string!", "key2": True},
+        {"key1": "this is another string!", "key2": 1},
+        {"key1": "this is another string!", "key2": 1.0},
+        {"key1": "this is another string!", "key2": "foo"},
+    ]