mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
bug_fixes: use md5 instead of uuid for id generation (#3442)
At present, the `ID` of each `point` in qdrant is generated as a random `uuid`. The problem with this approach is that documents with identical content are inserted again as new points instead of updating the existing one. Using the `md5` hash of the text as the `ID` of the `point` makes the insert a true `update or insert` (upsert). Co-authored-by: mayue <mayue05@qiyi.com>
This commit is contained in:
parent
b765805964
commit
344e3508b1
@ -2,6 +2,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
|
from hashlib import md5
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
@ -78,7 +79,7 @@ class Qdrant(VectorStore):
|
|||||||
"""
|
"""
|
||||||
from qdrant_client.http import models as rest
|
from qdrant_client.http import models as rest
|
||||||
|
|
||||||
ids = [uuid.uuid4().hex for _ in texts]
|
ids = [md5(text.encode("utf-8")).hexdigest() for text in texts]
|
||||||
self.client.upsert(
|
self.client.upsert(
|
||||||
collection_name=self.collection_name,
|
collection_name=self.collection_name,
|
||||||
points=rest.Batch.construct(
|
points=rest.Batch.construct(
|
||||||
@ -325,7 +326,7 @@ class Qdrant(VectorStore):
|
|||||||
client.upsert(
|
client.upsert(
|
||||||
collection_name=collection_name,
|
collection_name=collection_name,
|
||||||
points=rest.Batch.construct(
|
points=rest.Batch.construct(
|
||||||
ids=[uuid.uuid4().hex for _ in texts],
|
ids=[md5(text.encode("utf-8")).hexdigest() for text in texts],
|
||||||
vectors=embeddings,
|
vectors=embeddings,
|
||||||
payloads=cls._build_payloads(
|
payloads=cls._build_payloads(
|
||||||
texts, metadatas, content_payload_key, metadata_payload_key
|
texts, metadatas, content_payload_key, metadata_payload_key
|
||||||
|
Loading…
Reference in New Issue
Block a user