bug_fixes: use md5 instead of uuid for id generation (#3442)

At present, the ID of each `point` stored in qdrant is generated as a
random `uuid`. The problem with this approach is that documents with the
same content are inserted again as new points instead of updating the
existing one. Using the `md5` hash of the text as the `ID` of the `point`
lets inserting the same text achieve a true `update or insert`.

Co-authored-by: mayue <mayue05@qiyi.com>
This commit is contained in:
killpanda 2023-04-25 12:39:51 +08:00 committed by GitHub
parent b765805964
commit 344e3508b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
import uuid import uuid
from hashlib import md5
from operator import itemgetter from operator import itemgetter
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union
@ -78,7 +79,7 @@ class Qdrant(VectorStore):
""" """
from qdrant_client.http import models as rest from qdrant_client.http import models as rest
ids = [uuid.uuid4().hex for _ in texts] ids = [md5(text.encode("utf-8")).hexdigest() for text in texts]
self.client.upsert( self.client.upsert(
collection_name=self.collection_name, collection_name=self.collection_name,
points=rest.Batch.construct( points=rest.Batch.construct(
@ -325,7 +326,7 @@ class Qdrant(VectorStore):
client.upsert( client.upsert(
collection_name=collection_name, collection_name=collection_name,
points=rest.Batch.construct( points=rest.Batch.construct(
ids=[uuid.uuid4().hex for _ in texts], ids=[md5(text.encode("utf-8")).hexdigest() for text in texts],
vectors=embeddings, vectors=embeddings,
payloads=cls._build_payloads( payloads=cls._build_payloads(
texts, metadatas, content_payload_key, metadata_payload_key texts, metadatas, content_payload_key, metadata_payload_key