bug_fixes: use md5 instead of uuid for id generation (#3442)

At present, the `ID` of each `point` in qdrant is generated as a random
`uuid`. The problem with this approach is that even documents with the
same content are inserted repeatedly instead of being updated. Using the
`md5` hash of the text as the `ID` of its `point` achieves a true
`update or insert`.

Co-authored-by: mayue <mayue05@qiyi.com>
This commit is contained in:
killpanda 2023-04-25 12:39:51 +08:00 committed by GitHub
parent b765805964
commit 344e3508b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -2,6 +2,7 @@
from __future__ import annotations
import uuid
from hashlib import md5
from operator import itemgetter
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union
@ -78,7 +79,7 @@ class Qdrant(VectorStore):
"""
from qdrant_client.http import models as rest
ids = [uuid.uuid4().hex for _ in texts]
ids = [md5(text.encode("utf-8")).hexdigest() for text in texts]
self.client.upsert(
collection_name=self.collection_name,
points=rest.Batch.construct(
@ -325,7 +326,7 @@ class Qdrant(VectorStore):
client.upsert(
collection_name=collection_name,
points=rest.Batch.construct(
ids=[uuid.uuid4().hex for _ in texts],
ids=[md5(text.encode("utf-8")).hexdigest() for text in texts],
vectors=embeddings,
payloads=cls._build_payloads(
texts, metadatas, content_payload_key, metadata_payload_key