mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
bug_fixes: use md5 instead of uuid id generation (#3442)
At present, Qdrant `point` IDs are generated from a random `uuid`. The problem with this approach is that documents with identical content are inserted again as new points instead of updating the existing one. Using the `md5` hash of the text as the `point` ID makes the insert a true upsert (update or insert). Co-authored-by: mayue <mayue05@qiyi.com>
This commit is contained in:
parent
b765805964
commit
344e3508b1
@ -2,6 +2,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from hashlib import md5
|
||||
from operator import itemgetter
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
|
||||
@ -78,7 +79,7 @@ class Qdrant(VectorStore):
|
||||
"""
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
ids = [uuid.uuid4().hex for _ in texts]
|
||||
ids = [md5(text.encode("utf-8")).hexdigest() for text in texts]
|
||||
self.client.upsert(
|
||||
collection_name=self.collection_name,
|
||||
points=rest.Batch.construct(
|
||||
@ -325,7 +326,7 @@ class Qdrant(VectorStore):
|
||||
client.upsert(
|
||||
collection_name=collection_name,
|
||||
points=rest.Batch.construct(
|
||||
ids=[uuid.uuid4().hex for _ in texts],
|
||||
ids=[md5(text.encode("utf-8")).hexdigest() for text in texts],
|
||||
vectors=embeddings,
|
||||
payloads=cls._build_payloads(
|
||||
texts, metadatas, content_payload_key, metadata_payload_key
|
||||
|
Loading…
Reference in New Issue
Block a user