Merge pull request #354 from arc53/featue/elasticsearch

working es
This commit is contained in:
Pavel 2023-09-30 17:37:37 +03:00 committed by GitHub
commit b86c294250
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 360 additions and 71 deletions

View File

@ -13,8 +13,8 @@ from transformers import GPT2TokenizerFast
from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator
from application.llm.llm_creator import LLMCreator
from application.vectorstore.faiss import FaissStore
from application.error import bad_request
@ -226,7 +226,7 @@ def stream():
vectorstore = get_vectorstore({"active_docs": data["active_docs"]})
else:
vectorstore = ""
docsearch = FaissStore(vectorstore, embeddings_key)
docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key)
return Response(
complete_stream(question, docsearch,
@ -260,7 +260,7 @@ def api_answer():
vectorstore = get_vectorstore(data)
# loading the index and the store and the prompt template
# Note if you have used other embeddings than OpenAI, you need to change the embeddings
docsearch = FaissStore(vectorstore, embeddings_key)
docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key)
llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key)

View File

@ -34,6 +34,8 @@ def upload_index_files():
if "name" not in request.form:
return {"status": "no name"}
job_name = secure_filename(request.form["name"])
save_dir = os.path.join(current_dir, "indexes", user, job_name)
if settings.VECTOR_STORE == "faiss":
if "file_faiss" not in request.files:
print("No file part")
return {"status": "no file"}
@ -46,9 +48,8 @@ def upload_index_files():
file_pkl = request.files["file_pkl"]
if file_pkl.filename == "":
return {"status": "no file name"}
# saves index files
save_dir = os.path.join(current_dir, "indexes", user, job_name)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
file_faiss.save(os.path.join(save_dir, "index.faiss"))

View File

@ -11,6 +11,8 @@ from celery.result import AsyncResult
from application.api.user.tasks import ingest
from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator
mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
@ -90,10 +92,17 @@ def delete_old():
return {"status": "error"}
path_clean = "/".join(dirs)
vectors_collection.delete_one({"location": path})
if settings.VECTOR_STORE == "faiss":
try:
shutil.rmtree(path_clean)
shutil.rmtree(os.path.join(current_dir, path_clean))
except FileNotFoundError:
pass
else:
vetorstore = VectorCreator.create_vectorstore(
settings.VECTOR_STORE, path=os.path.join(current_dir, path_clean)
)
vetorstore.delete_index()
return {"status": "ok"}
@user.route("/api/upload", methods=["POST"])
@ -173,7 +182,7 @@ def combined_json():
"location": "local",
}
)
if settings.VECTOR_STORE == "faiss":
data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json()
for index in data_remote:
index["location"] = "remote"

View File

@ -13,6 +13,7 @@ class Settings(BaseSettings):
TOKENS_MAX_HISTORY: int = 150
SELF_HOSTED_MODEL: bool = False
UPLOAD_FOLDER: str = "inputs"
VECTOR_STORE: str = "elasticsearch" # "faiss" or "elasticsearch"
API_URL: str = "http://localhost:7091" # backend url for celery worker
@ -23,6 +24,13 @@ class Settings(BaseSettings):
AZURE_DEPLOYMENT_NAME: str = None # azure deployment name for answering
AZURE_EMBEDDINGS_DEPLOYMENT_NAME: str = None # azure deployment name for embeddings
# elasticsearch
ELASTIC_CLOUD_ID: str = None # cloud id for elasticsearch
ELASTIC_USERNAME: str = None # username for elasticsearch
ELASTIC_PASSWORD: str = None # password for elasticsearch
ELASTIC_URL: str = None # url for elasticsearch
ELASTIC_INDEX: str = "docsgpt" # index name for elasticsearch
path = Path(__file__).parent.parent.absolute()
settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8")

View File

@ -1,8 +1,8 @@
import os
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from application.vectorstore.vector_creator import VectorCreator
from application.core.settings import settings
from retry import retry
@ -33,12 +33,23 @@ def call_openai_api(docs, folder_name, task_status):
os.makedirs(f"{folder_name}")
from tqdm import tqdm
docs_test = [docs[0]]
docs.pop(0)
c1 = 0
if settings.VECTOR_STORE == "faiss":
docs_init = [docs[0]]
docs.pop(0)
store = FAISS.from_documents(docs_test, OpenAIEmbeddings(openai_api_key=os.getenv("EMBEDDINGS_KEY")))
store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
docs_init = docs_init,
path=f"{folder_name}",
embeddings_key=os.getenv("EMBEDDINGS_KEY")
)
else:
store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
path=f"{folder_name}",
embeddings_key=os.getenv("EMBEDDINGS_KEY")
)
# Uncomment for MPNet embeddings
# model_name = "sentence-transformers/all-mpnet-base-v2"
# hf = HuggingFaceEmbeddings(model_name=model_name)
@ -57,6 +68,7 @@ def call_openai_api(docs, folder_name, task_status):
store.save_local(f"{folder_name}")
break
c1 += 1
if settings.VECTOR_STORE == "faiss":
store.save_local(f"{folder_name}")

View File

@ -22,6 +22,7 @@ decorator==5.1.1
dill==0.3.6
dnspython==2.3.0
ecdsa==0.18.0
elasticsearch==8.9.0
entrypoints==0.4
faiss-cpu==1.7.3
filelock==3.9.0

View File

@ -19,7 +19,7 @@ class BaseVectorStore(ABC):
def is_azure_configured(self):
return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME
def _get_docsearch(self, embeddings_name, embeddings_key=None):
def _get_embeddings(self, embeddings_name, embeddings_key=None):
embeddings_factory = {
"openai_text-embedding-ada-002": OpenAIEmbeddings,
"huggingface_sentence-transformers/all-mpnet-base-v2": HuggingFaceHubEmbeddings,

View File

@ -0,0 +1,221 @@
from application.vectorstore.base import BaseVectorStore
from application.core.settings import settings
import elasticsearch
class Document(str):
"""Class for storing a piece of text and associated metadata."""
def __new__(cls, page_content: str, metadata: dict):
instance = super().__new__(cls, page_content)
instance.page_content = page_content
instance.metadata = metadata
return instance
class ElasticsearchStore(BaseVectorStore):
_es_connection = None # Class attribute to hold the Elasticsearch connection
def __init__(self, path, embeddings_key, index_name=settings.ELASTIC_INDEX):
super().__init__()
self.path = path.replace("application/indexes/", "").rstrip("/")
self.embeddings_key = embeddings_key
self.index_name = index_name
if ElasticsearchStore._es_connection is None:
connection_params = {}
if settings.ELASTIC_URL:
connection_params["hosts"] = [settings.ELASTIC_URL]
connection_params["http_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD)
elif settings.ELASTIC_CLOUD_ID:
connection_params["cloud_id"] = settings.ELASTIC_CLOUD_ID
connection_params["basic_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD)
else:
raise ValueError("Please provide either elasticsearch_url or cloud_id.")
ElasticsearchStore._es_connection = elasticsearch.Elasticsearch(**connection_params)
self.docsearch = ElasticsearchStore._es_connection
def connect_to_elasticsearch(
*,
es_url = None,
cloud_id = None,
api_key = None,
username = None,
password = None,
):
try:
import elasticsearch
except ImportError:
raise ImportError(
"Could not import elasticsearch python package. "
"Please install it with `pip install elasticsearch`."
)
if es_url and cloud_id:
raise ValueError(
"Both es_url and cloud_id are defined. Please provide only one."
)
connection_params = {}
if es_url:
connection_params["hosts"] = [es_url]
elif cloud_id:
connection_params["cloud_id"] = cloud_id
else:
raise ValueError("Please provide either elasticsearch_url or cloud_id.")
if api_key:
connection_params["api_key"] = api_key
elif username and password:
connection_params["basic_auth"] = (username, password)
es_client = elasticsearch.Elasticsearch(
**connection_params,
)
try:
es_client.info()
except Exception as e:
raise e
return es_client
def search(self, question, k=2, index_name=settings.ELASTIC_INDEX, *args, **kwargs):
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
vector = embeddings.embed_query(question)
knn = {
"filter": [{"match": {"metadata.store.keyword": self.path}}],
"field": "vector",
"k": k,
"num_candidates": 100,
"query_vector": vector,
}
full_query = {
"knn": knn,
"query": {
"bool": {
"must": [
{
"match": {
"text": {
"query": question,
}
}
}
],
"filter": [{"match": {"metadata.store.keyword": self.path}}],
}
},
"rank": {"rrf": {}},
}
resp = self.docsearch.search(index=self.index_name, query=full_query['query'], size=k, knn=full_query['knn'])
# create Documnets objects from the results page_content ['_source']['text'], metadata ['_source']['metadata']
doc_list = []
for hit in resp['hits']['hits']:
doc_list.append(Document(page_content = hit['_source']['text'], metadata = hit['_source']['metadata']))
return doc_list
def _create_index_if_not_exists(
self, index_name, dims_length
):
if self._es_connection.indices.exists(index=index_name):
print(f"Index {index_name} already exists.")
else:
indexSettings = self.index(
dims_length=dims_length,
)
self._es_connection.indices.create(index=index_name, **indexSettings)
def index(
self,
dims_length,
):
return {
"mappings": {
"properties": {
"vector": {
"type": "dense_vector",
"dims": dims_length,
"index": True,
"similarity": "cosine",
},
}
}
}
def add_texts(
self,
texts,
metadatas = None,
ids = None,
refresh_indices = True,
create_index_if_not_exists = True,
bulk_kwargs = None,
**kwargs,
):
from elasticsearch.helpers import BulkIndexError, bulk
bulk_kwargs = bulk_kwargs or {}
import uuid
embeddings = []
ids = ids or [str(uuid.uuid4()) for _ in texts]
requests = []
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
vectors = embeddings.embed_documents(list(texts))
dims_length = len(vectors[0])
if create_index_if_not_exists:
self._create_index_if_not_exists(
index_name=self.index_name, dims_length=dims_length
)
for i, (text, vector) in enumerate(zip(texts, vectors)):
metadata = metadatas[i] if metadatas else {}
requests.append(
{
"_op_type": "index",
"_index": self.index_name,
"text": text,
"vector": vector,
"metadata": metadata,
"_id": ids[i],
}
)
if len(requests) > 0:
try:
success, failed = bulk(
self._es_connection,
requests,
stats_only=True,
refresh=refresh_indices,
**bulk_kwargs,
)
return ids
except BulkIndexError as e:
print(f"Error adding texts: {e}")
firstError = e.errors[0].get("index", {}).get("error", {})
print(f"First error reason: {firstError.get('reason')}")
raise e
else:
return []
def delete_index(self):
self._es_connection.delete_by_query(index=self.index_name, query={"match": {
"metadata.store.keyword": self.path}},)

View File

@ -4,12 +4,23 @@ from application.core.settings import settings
class FaissStore(BaseVectorStore):
def __init__(self, path, embeddings_key):
def __init__(self, path, embeddings_key, docs_init=None):
super().__init__()
self.path = path
if docs_init:
self.docsearch = FAISS.from_documents(
docs_init, self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
)
else:
self.docsearch = FAISS.load_local(
self.path, self._get_docsearch(settings.EMBEDDINGS_NAME, settings.EMBEDDINGS_KEY)
self.path, self._get_embeddings(settings.EMBEDDINGS_NAME, settings.EMBEDDINGS_KEY)
)
def search(self, *args, **kwargs):
return self.docsearch.similarity_search(*args, **kwargs)
def add_texts(self, *args, **kwargs):
return self.docsearch.add_texts(*args, **kwargs)
def save_local(self, *args, **kwargs):
return self.docsearch.save_local(*args, **kwargs)

View File

@ -0,0 +1,16 @@
from application.vectorstore.faiss import FaissStore
from application.vectorstore.elasticsearch import ElasticsearchStore
class VectorCreator:
vectorstores = {
'faiss': FaissStore,
'elasticsearch':ElasticsearchStore
}
@classmethod
def create_vectorstore(cls, type, *args, **kwargs):
vectorstore_class = cls.vectorstores.get(type.lower())
if not vectorstore_class:
raise ValueError(f"No vectorstore class found for type {type}")
return vectorstore_class(*args, **kwargs)

View File

@ -21,7 +21,9 @@ except FileExistsError:
def metadata_from_filename(title):
return {'title': title}
store = title.split('/')
store = store[1] + '/' + store[2]
return {'title': title, 'store': store}
def generate_random_string(length):
@ -83,11 +85,15 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
# and send them to the server (provide user and name in form)
file_data = {'name': name_job, 'user': user}
if settings.VECTOR_STORE == "faiss":
files = {'file_faiss': open(full_path + '/index.faiss', 'rb'),
'file_pkl': open(full_path + '/index.pkl', 'rb')}
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data)
response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path))
else:
response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data)
response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path="))
# delete local
shutil.rmtree(full_path)

View File

@ -1346,9 +1346,9 @@
}
},
"node_modules/@typescript-eslint/eslint-plugin/node_modules/semver": {
"version": "7.3.8",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
"version": "7.5.4",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
"dev": true,
"dependencies": {
"lru-cache": "^6.0.0"
@ -1490,9 +1490,9 @@
}
},
"node_modules/@typescript-eslint/typescript-estree/node_modules/semver": {
"version": "7.3.8",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
"version": "7.5.4",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
"dev": true,
"dependencies": {
"lru-cache": "^6.0.0"
@ -1549,9 +1549,9 @@
}
},
"node_modules/@typescript-eslint/utils/node_modules/semver": {
"version": "7.3.8",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
"version": "7.5.4",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
"dev": true,
"dependencies": {
"lru-cache": "^6.0.0"
@ -1991,9 +1991,9 @@
}
},
"node_modules/builtins/node_modules/semver": {
"version": "7.3.8",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
"version": "7.5.4",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
"dev": true,
"dependencies": {
"lru-cache": "^6.0.0"
@ -2055,9 +2055,9 @@
}
},
"node_modules/caniuse-lite": {
"version": "1.0.30001450",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001450.tgz",
"integrity": "sha512-qMBmvmQmFXaSxexkjjfMvD5rnDL0+m+dUMZKoDYsGG8iZN29RuYh9eRoMvKsT6uMAWlyUUGDEQGJJYjzCIO9ew==",
"version": "1.0.30001541",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001541.tgz",
"integrity": "sha512-bLOsqxDgTqUBkzxbNlSBt8annkDpQB9NdzdTbO2ooJ+eC/IQcvDspDc058g84ejCelF7vHUx57KIOjEecOHXaw==",
"dev": true,
"funding": [
{
@ -2067,6 +2067,10 @@
{
"type": "tidelift",
"url": "https://tidelift.com/funding/github/npm/caniuse-lite"
},
{
"type": "github",
"url": "https://github.com/sponsors/ai"
}
]
},
@ -2889,9 +2893,9 @@
}
},
"node_modules/eslint-plugin-n/node_modules/semver": {
"version": "7.3.8",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz",
"integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==",
"version": "7.5.4",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
"integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
"dev": true,
"dependencies": {
"lru-cache": "^6.0.0"
@ -4478,9 +4482,9 @@
}
},
"node_modules/lint-staged/node_modules/yaml": {
"version": "2.2.1",
"resolved": "https://registry.npmjs.org/yaml/-/yaml-2.2.1.tgz",
"integrity": "sha512-e0WHiYql7+9wr4cWMx3TVQrNwejKaEe7/rHNmQmqRjazfOP5W8PB6Jpebb5o6fIapbz9o9+2ipcaTM2ZwDI6lw==",
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/yaml/-/yaml-2.3.2.tgz",
"integrity": "sha512-N/lyzTPaJasoDmfV7YTrYCI0G/3ivm/9wdG0aHuheKowWQwGTsK0Eoiw6utmzAnI6pkJa0DUVygvp3spqqEKXg==",
"dev": true,
"engines": {
"node": ">= 14"
@ -6532,9 +6536,9 @@
}
},
"node_modules/semver": {
"version": "6.3.0",
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
"integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==",
"version": "6.3.1",
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
"integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==",
"dev": true,
"bin": {
"semver": "bin/semver.js"