Bagatur/filter metadata (#9015)

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
Bagatur 2023-08-11 01:10:00 -07:00 committed by GitHub
parent a429145420
commit 3b754b5461
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 110 additions and 8 deletions

View File

@ -205,12 +205,22 @@ class Chroma(VectorStore):
[embeddings[idx] for idx in non_empty_ids] if embeddings else None
)
ids_with_metadata = [ids[idx] for idx in non_empty_ids]
self._collection.upsert(
metadatas=metadatas,
embeddings=embeddings_with_metadatas,
documents=texts_with_metadatas,
ids=ids_with_metadata,
)
try:
self._collection.upsert(
metadatas=metadatas,
embeddings=embeddings_with_metadatas,
documents=texts_with_metadatas,
ids=ids_with_metadata,
)
except ValueError as e:
if "Expected metadata value to be" in str(e):
msg = (
"Try filtering complex metadata from the document using "
"langchain.vectorstore.utils.filter_complex_metadata."
)
raise ValueError(e.args[0] + "\n\n" + msg)
else:
raise e
if empty_ids:
texts_without_metadatas = [texts[j] for j in empty_ids]
embeddings_without_metadatas = (

View File

@ -1,10 +1,11 @@
"""Utility functions for working with vectors and vectorstores."""
from enum import Enum
from typing import List
from typing import List, Tuple, Type
import numpy as np
from langchain.docstore.document import Document
from langchain.utils.math import cosine_similarity
@ -51,3 +52,23 @@ def maximal_marginal_relevance(
idxs.append(idx_to_add)
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
return idxs
def filter_complex_metadata(
    documents: List[Document],
    *,
    allowed_types: Tuple[Type, ...] = (str, bool, int, float)
) -> List[Document]:
    """Drop metadata entries whose values a vector store cannot hold.

    Each document's ``metadata`` attribute is rebound, on the document itself,
    to a new dict holding only the entries whose value is an instance of one
    of ``allowed_types``.

    Args:
        documents: Documents whose metadata should be filtered.
        allowed_types: Value types to keep; checked with ``isinstance``.

    Returns:
        A new list containing the same document objects, in the same order.
    """
    result = []
    for doc in documents:
        # Build a fresh dict instead of deleting keys from the existing one.
        doc.metadata = {
            name: item
            for name, item in doc.metadata.items()
            if isinstance(item, allowed_types)
        }
        result.append(doc)
    return result

View File

@ -1,7 +1,11 @@
"""Test vector store utility functions."""
import numpy as np
from langchain.vectorstores.utils import maximal_marginal_relevance
from langchain.docstore.document import Document
from langchain.vectorstores.utils import (
filter_complex_metadata,
maximal_marginal_relevance,
)
def test_maximal_marginal_relevance_lambda_zero() -> None:
@ -52,3 +56,70 @@ def test_maximal_marginal_relevance_query_dim() -> None:
first = maximal_marginal_relevance(query_embedding, embedding_list)
second = maximal_marginal_relevance(query_embedding_2d, embedding_list)
assert first == second
def test_filter_list_metadata() -> None:
    """Check which ``key2`` values survive filtering with the default types."""
    # Each case: (value stored under "key1", value stored under "key2",
    # whether "key2" is expected to remain after filtering).
    cases = [
        ("this is a string!", ["a", "list", "of", "strings"], False),
        ("this is another string!", {"foo"}, False),
        ("this is another string!", {"foo": "bar"}, False),
        ("this is another string!", True, True),
        ("this is another string!", 1, True),
        ("this is another string!", 1.0, True),
        ("this is another string!", "foo", True),
    ]
    documents = [
        Document(page_content="", metadata={"key1": text, "key2": extra})
        for text, extra, _ in cases
    ]
    expected = [
        {"key1": text, "key2": extra} if kept else {"key1": text}
        for text, extra, kept in cases
    ]

    actual = [doc.metadata for doc in filter_complex_metadata(documents)]

    assert actual == expected