From 344e3508b15d8934e1ee7a5f14b9b9ec118d3379 Mon Sep 17 00:00:00 2001 From: killpanda Date: Tue, 25 Apr 2023 12:39:51 +0800 Subject: [PATCH] bug_fixes: use md5 instead of uuid id generation (#3442) Currently, Qdrant `point` IDs are generated with a random `uuid`. As a result, documents with identical content are inserted again as duplicates instead of updating the existing point. Deriving the `ID` of each `point` from the `md5` hash of its text makes the insert a true `update or insert`. Co-authored-by: mayue --- langchain/vectorstores/qdrant.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/langchain/vectorstores/qdrant.py b/langchain/vectorstores/qdrant.py index 0c0c2e19..33447561 100644 --- a/langchain/vectorstores/qdrant.py +++ b/langchain/vectorstores/qdrant.py @@ -2,6 +2,7 @@ from __future__ import annotations import uuid +from hashlib import md5 from operator import itemgetter from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, Union @@ -78,7 +79,7 @@ class Qdrant(VectorStore): """ from qdrant_client.http import models as rest - ids = [uuid.uuid4().hex for _ in texts] + ids = [md5(text.encode("utf-8")).hexdigest() for text in texts] self.client.upsert( collection_name=self.collection_name, points=rest.Batch.construct( @@ -325,7 +326,7 @@ class Qdrant(VectorStore): client.upsert( collection_name=collection_name, points=rest.Batch.construct( - ids=[uuid.uuid4().hex for _ in texts], + ids=[md5(text.encode("utf-8")).hexdigest() for text in texts], vectors=embeddings, payloads=cls._build_payloads( texts, metadatas, content_payload_key, metadata_payload_key