mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Bagatur/filter metadata (#9015)
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
a429145420
commit
3b754b5461
@ -205,12 +205,22 @@ class Chroma(VectorStore):
|
||||
[embeddings[idx] for idx in non_empty_ids] if embeddings else None
|
||||
)
|
||||
ids_with_metadata = [ids[idx] for idx in non_empty_ids]
|
||||
self._collection.upsert(
|
||||
metadatas=metadatas,
|
||||
embeddings=embeddings_with_metadatas,
|
||||
documents=texts_with_metadatas,
|
||||
ids=ids_with_metadata,
|
||||
)
|
||||
try:
|
||||
self._collection.upsert(
|
||||
metadatas=metadatas,
|
||||
embeddings=embeddings_with_metadatas,
|
||||
documents=texts_with_metadatas,
|
||||
ids=ids_with_metadata,
|
||||
)
|
||||
except ValueError as e:
|
||||
if "Expected metadata value to be" in str(e):
|
||||
msg = (
|
||||
"Try filtering complex metadata from the document using "
|
||||
"langchain.vectorstore.utils.filter_complex_metadata."
|
||||
)
|
||||
raise ValueError(e.args[0] + "\n\n" + msg)
|
||||
else:
|
||||
raise e
|
||||
if empty_ids:
|
||||
texts_without_metadatas = [texts[j] for j in empty_ids]
|
||||
embeddings_without_metadatas = (
|
||||
|
@ -1,10 +1,11 @@
|
||||
"""Utility functions for working with vectors and vectorstores."""
|
||||
|
||||
from enum import Enum
|
||||
from typing import List
|
||||
from typing import List, Tuple, Type
|
||||
|
||||
import numpy as np
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.utils.math import cosine_similarity
|
||||
|
||||
|
||||
@ -51,3 +52,23 @@ def maximal_marginal_relevance(
|
||||
idxs.append(idx_to_add)
|
||||
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
|
||||
return idxs
|
||||
|
||||
|
||||
def filter_complex_metadata(
    documents: List[Document],
    *,
    allowed_types: Tuple[Type, ...] = (str, bool, int, float)
) -> List[Document]:
    """Strip metadata entries whose values are not of an allowed type.

    Args:
        documents: Documents whose ``metadata`` mappings should be filtered.
        allowed_types: Types a metadata value may have in order to be kept;
            the check is ``isinstance``-based, so subclasses are kept too.

    Returns:
        A new list holding the same document objects, in their original
        order. Each document's ``metadata`` attribute is rebound to a fresh
        dict containing only the permitted entries, so the input documents
        themselves are modified.
    """
    kept_documents = []
    for doc in documents:
        # Build a new dict rather than deleting keys from the existing one.
        doc.metadata = {
            name: val
            for name, val in doc.metadata.items()
            if isinstance(val, allowed_types)
        }
        kept_documents.append(doc)
    return kept_documents
|
||||
|
@ -1,7 +1,11 @@
|
||||
"""Test vector store utility functions."""
|
||||
import numpy as np
|
||||
|
||||
from langchain.vectorstores.utils import maximal_marginal_relevance
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.vectorstores.utils import (
|
||||
filter_complex_metadata,
|
||||
maximal_marginal_relevance,
|
||||
)
|
||||
|
||||
|
||||
def test_maximal_marginal_relevance_lambda_zero() -> None:
|
||||
@ -52,3 +56,70 @@ def test_maximal_marginal_relevance_query_dim() -> None:
|
||||
first = maximal_marginal_relevance(query_embedding, embedding_list)
|
||||
second = maximal_marginal_relevance(query_embedding_2d, embedding_list)
|
||||
assert first == second
|
||||
|
||||
|
||||
def test_filter_list_metadata() -> None:
    """Check that list, set and dict values are dropped and scalars are kept.

    Every document carries a string under ``key1`` and a value of varying
    type under ``key2``; after filtering, ``key2`` must survive only when its
    value is a bool, int, float or str.
    """
    other_text = "this is another string!"
    # Each case: (key1 value, key2 value, whether key2 is expected to survive).
    cases = [
        ("this is a string!", ["a", "list", "of", "strings"], False),
        (other_text, {"foo"}, False),
        (other_text, {"foo": "bar"}, False),
        (other_text, True, True),
        (other_text, 1, True),
        (other_text, 1.0, True),
        (other_text, "foo", True),
    ]
    documents = [
        Document(page_content="", metadata={"key1": first, "key2": second})
        for first, second, _ in cases
    ]
    expected_metadata = [
        {"key1": first, "key2": second} if survives else {"key1": first}
        for first, second, survives in cases
    ]

    filtered_metadata = [
        doc.metadata for doc in filter_complex_metadata(documents)
    ]

    assert filtered_metadata == expected_metadata
|
||||
|
Loading…
Reference in New Issue
Block a user