mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
fix empty ids when metadatas is provided (#8127)
Fixes https://github.com/hwchase17/langchain/issues/7865 and https://github.com/hwchase17/langchain/issues/8061
- [x] Fixes returning empty ids when the metadatas argument is provided
@baskaryan
---------
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
62b8b459c6
commit
b7d6e1909c
@ -171,38 +171,52 @@ class Chroma(VectorStore):
|
||||
if ids is None:
|
||||
ids = [str(uuid.uuid1()) for _ in texts]
|
||||
embeddings = None
|
||||
texts = list(texts)
|
||||
if self._embedding_function is not None:
|
||||
embeddings = self._embedding_function.embed_documents(list(texts))
|
||||
|
||||
embeddings = self._embedding_function.embed_documents(texts)
|
||||
if metadatas:
|
||||
texts = list(texts)
|
||||
empty = []
|
||||
non_empty = []
|
||||
for i, m in enumerate(metadatas):
|
||||
# fill metadatas with empty dicts if somebody
|
||||
# did not specify metadata for all texts
|
||||
length_diff = len(texts) - len(metadatas)
|
||||
if length_diff:
|
||||
metadatas = metadatas + [{}] * length_diff
|
||||
empty_ids = []
|
||||
non_empty_ids = []
|
||||
for idx, m in enumerate(metadatas):
|
||||
if m:
|
||||
non_empty.append(i)
|
||||
non_empty_ids.append(idx)
|
||||
else:
|
||||
empty.append(i)
|
||||
if non_empty:
|
||||
metadatas = [metadatas[i] for i in non_empty]
|
||||
texts_with_metadatas = [texts[i] for i in non_empty]
|
||||
empty_ids.append(idx)
|
||||
if non_empty_ids:
|
||||
metadatas = [metadatas[idx] for idx in non_empty_ids]
|
||||
texts_with_metadatas = [texts[idx] for idx in non_empty_ids]
|
||||
embeddings_with_metadatas = (
|
||||
[embeddings[i] for i in non_empty] if embeddings else None
|
||||
[embeddings[idx] for idx in non_empty_ids] if embeddings else None
|
||||
)
|
||||
ids_with_metadata = [ids[i] for i in non_empty]
|
||||
ids_with_metadata = [ids[idx] for idx in non_empty_ids]
|
||||
self._collection.upsert(
|
||||
metadatas=metadatas,
|
||||
embeddings=embeddings_with_metadatas,
|
||||
documents=texts_with_metadatas,
|
||||
ids=ids_with_metadata,
|
||||
)
|
||||
|
||||
texts = [texts[j] for j in empty]
|
||||
embeddings = [embeddings[j] for j in empty] if embeddings else None
|
||||
ids = [ids[j] for j in empty]
|
||||
|
||||
if texts:
|
||||
self._collection.upsert(embeddings=embeddings, documents=texts, ids=ids)
|
||||
if empty_ids:
|
||||
texts_without_metadatas = [texts[j] for j in empty_ids]
|
||||
embeddings_without_metadatas = (
|
||||
[embeddings[j] for j in empty_ids] if embeddings else None
|
||||
)
|
||||
ids_without_metadatas = [ids[j] for j in empty_ids]
|
||||
self._collection.upsert(
|
||||
embeddings=embeddings_without_metadatas,
|
||||
documents=texts_without_metadatas,
|
||||
ids=ids_without_metadatas,
|
||||
)
|
||||
else:
|
||||
self._collection.upsert(
|
||||
embeddings=embeddings,
|
||||
documents=texts,
|
||||
ids=ids,
|
||||
)
|
||||
return ids
|
||||
|
||||
def similarity_search(
|
||||
|
@ -294,7 +294,9 @@ def test_chroma_add_documents_mixed_metadata() -> None:
|
||||
Document(page_content="foo"),
|
||||
Document(page_content="bar", metadata={"baz": 1}),
|
||||
]
|
||||
db.add_documents(docs)
|
||||
ids = ["0", "1"]
|
||||
actual_ids = db.add_documents(docs, ids=ids)
|
||||
assert actual_ids == ids
|
||||
search = db.similarity_search("foo bar")
|
||||
assert sorted(search, key=lambda d: d.page_content) == sorted(
|
||||
docs, key=lambda d: d.page_content
|
||||
|
Loading…
Reference in New Issue
Block a user