From 75c0cadb508054a853025a895b6f03f9097d95c5 Mon Sep 17 00:00:00 2001 From: Anush008 Date: Wed, 28 Feb 2024 11:49:15 +0530 Subject: [PATCH 1/2] feat: Qdrant vector store --- application/core/settings.py | 38 ++++++++++++------ application/requirements.txt | 1 + application/vectorstore/qdrant.py | 47 +++++++++++++++++++++++ application/vectorstore/vector_creator.py | 10 +++-- 4 files changed, 81 insertions(+), 15 deletions(-) create mode 100644 application/vectorstore/qdrant.py diff --git a/application/core/settings.py b/application/core/settings.py index d9b68ed7..cd1ac047 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -3,6 +3,7 @@ from typing import Optional import os from pydantic_settings import BaseSettings + current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -15,7 +16,7 @@ class Settings(BaseSettings): MODEL_PATH: str = os.path.join(current_dir, "models/docsgpt-7b-f16.gguf") TOKENS_MAX_HISTORY: int = 150 UPLOAD_FOLDER: str = "inputs" - VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" + VECTOR_STORE: str = "qdrant" # "faiss" or "elasticsearch" or "qdrant" API_URL: str = "http://localhost:7091" # backend url for celery worker @@ -27,21 +28,36 @@ class Settings(BaseSettings): AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = None # azure deployment name for embeddings # elasticsearch - ELASTIC_CLOUD_ID: Optional[str] = None # cloud id for elasticsearch - ELASTIC_USERNAME: Optional[str] = None # username for elasticsearch - ELASTIC_PASSWORD: Optional[str] = None # password for elasticsearch - ELASTIC_URL: Optional[str] = None # url for elasticsearch - ELASTIC_INDEX: Optional[str] = "docsgpt" # index name for elasticsearch + ELASTIC_CLOUD_ID: Optional[str] = None # cloud id for elasticsearch + ELASTIC_USERNAME: Optional[str] = None # username for elasticsearch + ELASTIC_PASSWORD: Optional[str] = None # password for elasticsearch + ELASTIC_URL: Optional[str] = None # url for elasticsearch + ELASTIC_INDEX: Optional[str] = "docsgpt" # index name for elasticsearch # SageMaker config - SAGEMAKER_ENDPOINT: Optional[str] = None # SageMaker endpoint name - SAGEMAKER_REGION: Optional[str] = None # SageMaker region name - SAGEMAKER_ACCESS_KEY: Optional[str] = None # SageMaker access key - SAGEMAKER_SECRET_KEY: Optional[str] = None # SageMaker secret key + SAGEMAKER_ENDPOINT: Optional[str] = None # SageMaker endpoint name + SAGEMAKER_REGION: Optional[str] = None # SageMaker region name + SAGEMAKER_ACCESS_KEY: Optional[str] = None # SageMaker access key + SAGEMAKER_SECRET_KEY: Optional[str] = None # SageMaker secret key - # prem ai project id + # prem ai project id PREMAI_PROJECT_ID: Optional[str] = None + # Qdrant vectorstore config + QDRANT_COLLECTION_NAME: Optional[str] = "docsgpt" + QDRANT_LOCATION: Optional[str] = None + QDRANT_URL: Optional[str] = None + QDRANT_PORT: Optional[int] = 6333 + QDRANT_GRPC_PORT: int = 6334 + QDRANT_PREFER_GRPC: bool = False + QDRANT_HTTPS: Optional[bool] = None + QDRANT_API_KEY: Optional[str] = None + QDRANT_PREFIX: Optional[str] = None + QDRANT_TIMEOUT: Optional[float] = None + QDRANT_HOST: Optional[str] = None + QDRANT_PATH: Optional[str] = None + QDRANT_DISTANCE_FUNC: str = "Cosine" + path = Path(__file__).parent.parent.absolute() settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8") diff --git a/application/requirements.txt b/application/requirements.txt index 0bb34365..0874a7c9 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -21,6 +21,7 @@ pydantic_settings==2.1.0 pymongo==4.6.1 PyPDF2==3.0.1 python-dotenv==1.0.1 +qdrant-client==1.7.3 redis==5.0.1 Requests==2.31.0 retry==0.9.2 diff --git a/application/vectorstore/qdrant.py b/application/vectorstore/qdrant.py new file mode 100644 index 00000000..482d06a1 --- /dev/null +++ b/application/vectorstore/qdrant.py @@ -0,0 +1,47 @@ +from langchain_community.vectorstores.qdrant import Qdrant +from application.vectorstore.base import BaseVectorStore +from application.core.settings import settings +from qdrant_client import models + + +class QdrantStore(BaseVectorStore): + def __init__(self, path: str = "", embeddings_key: str = "embeddings"): + self._filter = models.Filter( + must=[ + models.FieldCondition( + key="metadata.store", + match=models.MatchValue(value=path.replace("application/indexes/", "").rstrip("/")), + ) + ] + ) + + self._docsearch = Qdrant.construct_instance( + ["TEXT_TO_OBTAIN_EMBEDDINGS_DIMENSION"], + embedding=self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key), + collection_name=settings.QDRANT_COLLECTION_NAME, + location=settings.QDRANT_LOCATION, + url=settings.QDRANT_URL, + port=settings.QDRANT_PORT, + grpc_port=settings.QDRANT_GRPC_PORT, + https=settings.QDRANT_HTTPS, + prefer_grpc=settings.QDRANT_PREFER_GRPC, + api_key=settings.QDRANT_API_KEY, + prefix=settings.QDRANT_PREFIX, + timeout=settings.QDRANT_TIMEOUT, + path=settings.QDRANT_PATH, + distance_func=settings.QDRANT_DISTANCE_FUNC, + ) + + def search(self, *args, **kwargs): + return self._docsearch.similarity_search(filter=self._filter, *args, **kwargs) + + def add_texts(self, *args, **kwargs): + return self._docsearch.add_texts(*args, **kwargs) + + def save_local(self, *args, **kwargs): + pass + + def delete_index(self, *args, **kwargs): + return self._docsearch.client.delete( + collection_name=settings.QDRANT_COLLECTION_NAME, points_selector=self._filter + ) diff --git a/application/vectorstore/vector_creator.py b/application/vectorstore/vector_creator.py index 68ae2813..27b38645 100644 --- a/application/vectorstore/vector_creator.py +++ b/application/vectorstore/vector_creator.py @@ -1,13 +1,15 @@ from application.vectorstore.faiss import FaissStore from application.vectorstore.elasticsearch import ElasticsearchStore from application.vectorstore.mongodb import MongoDBVectorStore +from application.vectorstore.qdrant import QdrantStore class VectorCreator: vectorstores = { - 'faiss': FaissStore, - 'elasticsearch':ElasticsearchStore, - 'mongodb': MongoDBVectorStore, + "faiss": FaissStore, + "elasticsearch": ElasticsearchStore, + "mongodb": MongoDBVectorStore, + "qdrant": QdrantStore, } @classmethod @@ -15,4 +17,4 @@ class VectorCreator: vectorstore_class = cls.vectorstores.get(type.lower()) if not vectorstore_class: raise ValueError(f"No vectorstore class found for type {type}") - return vectorstore_class(*args, **kwargs) \ No newline at end of file + return vectorstore_class(*args, **kwargs) From 00dfb07b15602319bddb95089e3dab05fac56240 Mon Sep 17 00:00:00 2001 From: Anush008 Date: Thu, 29 Feb 2024 09:48:38 +0530 Subject: [PATCH 2/2] chore: revert to faiss default --- application/core/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/core/settings.py b/application/core/settings.py index cd1ac047..84073b7d 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -16,7 +16,7 @@ class Settings(BaseSettings): MODEL_PATH: str = os.path.join(current_dir, "models/docsgpt-7b-f16.gguf") TOKENS_MAX_HISTORY: int = 150 UPLOAD_FOLDER: str = "inputs" - VECTOR_STORE: str = "qdrant" # "faiss" or "elasticsearch" or "qdrant" + VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" API_URL: str = "http://localhost:7091" # backend url for celery worker