bug_fixes: use md5 instead of uuid for id generation (#3442)

At present, the `ID` of each `point` inserted into qdrant is generated as a random `uuid`. The problem with this approach is that documents with identical content are inserted again as new points instead of updating the existing one. Using the `md5` hash of the text as the `ID` of the `point` makes the insert a true `update or insert`. Co-authored-by: mayue <mayue05@qiyi.com>
2023-04-25 12:39:51 +08:00 · 2023-04-25 12:39:51 +08:00 · 344e3508b1
commit 344e3508b1
parent b765805964
1 changed files with 3 additions and 2 deletions
--- a/langchain/vectorstores/qdrant.py
+++ b/langchain/vectorstores/qdrant.py
@ -2,6 +2,7 @@
 from __future__ import annotations

 import uuid
+from hashlib import md5
 from operator import itemgetter
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union

@ -78,7 +79,7 @@ class Qdrant(VectorStore):
        """
        from qdrant_client.http import models as rest

-        ids = [uuid.uuid4().hex for _ in texts]
+        ids = [md5(text.encode("utf-8")).hexdigest() for text in texts]
        self.client.upsert(
            collection_name=self.collection_name,
            points=rest.Batch.construct(
@ -325,7 +326,7 @@ class Qdrant(VectorStore):
        client.upsert(
            collection_name=collection_name,
            points=rest.Batch.construct(
-                ids=[uuid.uuid4().hex for _ in texts],
+                ids=[md5(text.encode("utf-8")).hexdigest() for text in texts],
                vectors=embeddings,
                payloads=cls._build_payloads(
                    texts, metadatas, content_payload_key, metadata_payload_key