DocsGPT/application/vectorstore/elasticsearch.py

import uuid

import elasticsearch

from application.core.settings import settings
from application.vectorstore.base import BaseVectorStore


class Document(str):
    """Class for storing a piece of text and associated metadata."""

    def __new__(cls, page_content: str, metadata: dict):
        instance = super().__new__(cls, page_content)
        instance.page_content = page_content
        instance.metadata = metadata
        return instance
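# Illustrative behavior (a sketch, not part of the original module): because
# Document subclasses str, it can be passed anywhere a plain string is
# expected while still carrying its metadata.
#
#     doc = Document("some passage", {"store": "local/default"})
#     assert doc == "some passage"
#     assert doc.metadata["store"] == "local/default"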


class ElasticsearchStore(BaseVectorStore):
    _es_connection = None  # Class attribute to hold the shared Elasticsearch connection

    def __init__(self, path, embeddings_key, index_name=settings.ELASTIC_INDEX):
        super().__init__()
        # Normalize the on-disk index path to the store name used in metadata filters.
        self.path = path.replace("application/indexes/", "").rstrip("/")
        self.embeddings_key = embeddings_key
        self.index_name = index_name

        # Lazily create a single connection shared by all instances.
        if ElasticsearchStore._es_connection is None:
            connection_params = {}
            if settings.ELASTIC_URL:
                connection_params["hosts"] = [settings.ELASTIC_URL]
                connection_params["basic_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD)
            elif settings.ELASTIC_CLOUD_ID:
                connection_params["cloud_id"] = settings.ELASTIC_CLOUD_ID
                connection_params["basic_auth"] = (settings.ELASTIC_USERNAME, settings.ELASTIC_PASSWORD)
            else:
                raise ValueError("Please provide either ELASTIC_URL or ELASTIC_CLOUD_ID in settings.")
            ElasticsearchStore._es_connection = elasticsearch.Elasticsearch(**connection_params)

        self.docsearch = ElasticsearchStore._es_connection
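    # Illustrative construction (a sketch; the path and key are placeholders,
    # not values from the original module). Connection settings come from
    # application.core.settings:
    #
    #     store = ElasticsearchStore(
    #         "application/indexes/local/default/", embeddings_key="embeddings_key"
    #     )
    #     # store.path is now "local/default"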

    @staticmethod
    def connect_to_elasticsearch(
        *,
        es_url=None,
        cloud_id=None,
        api_key=None,
        username=None,
        password=None,
    ):
        try:
            import elasticsearch
        except ImportError:
            raise ImportError(
                "Could not import the elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            )

        if es_url and cloud_id:
            raise ValueError(
                "Both es_url and cloud_id are defined. Please provide only one."
            )

        connection_params = {}
        if es_url:
            connection_params["hosts"] = [es_url]
        elif cloud_id:
            connection_params["cloud_id"] = cloud_id
        else:
            raise ValueError("Please provide either es_url or cloud_id.")

        if api_key:
            connection_params["api_key"] = api_key
        elif username and password:
            connection_params["basic_auth"] = (username, password)

        es_client = elasticsearch.Elasticsearch(**connection_params)
        # Fail fast if the cluster is unreachable.
        es_client.info()
        return es_client
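    # Illustrative usage (a sketch; the URL and credentials are placeholders,
    # not part of the original module):
    #
    #     client = ElasticsearchStore.connect_to_elasticsearch(
    #         es_url="http://localhost:9200",
    #         username="elastic",
    #         password="changeme",
    #     )
    #     client.info()  # raises if the cluster is unreachable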

    def search(self, question, k=2, index_name=settings.ELASTIC_INDEX, *args, **kwargs):
        # Note: the index_name parameter is currently unused; self.index_name is queried.
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
        vector = embeddings.embed_query(question)

        # Approximate kNN over the dense vectors, restricted to this store's documents.
        knn = {
            "filter": [{"match": {"metadata.store.keyword": self.path}}],
            "field": "vector",
            "k": k,
            "num_candidates": 100,
            "query_vector": vector,
        }
        # Full-text match on the same question, with the same store filter, so
        # lexical and vector retrieval are combined in a single request.
        full_query = {
            "knn": knn,
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "text": {
                                    "query": question,
                                }
                            }
                        }
                    ],
                    "filter": [{"match": {"metadata.store.keyword": self.path}}],
                }
            },
            # "rank" is assembled here but not forwarded to the search call below.
            "rank": {"rrf": {}},
        }

        resp = self.docsearch.search(index=self.index_name, query=full_query["query"], size=k, knn=full_query["knn"])

        # Create Document objects from each hit's ['_source']['text'] and ['_source']['metadata'].
        doc_list = []
        for hit in resp["hits"]["hits"]:
            doc_list.append(Document(page_content=hit["_source"]["text"], metadata=hit["_source"]["metadata"]))
        return doc_list
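    # Illustrative usage (a sketch; the question and k are placeholders):
    #
    #     docs = store.search("How do I configure DocsGPT?", k=2)
    #     for doc in docs:
    #         print(doc.page_content, doc.metadata)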

    def _create_index_if_not_exists(self, index_name, dims_length):
        if self._es_connection.indices.exists(index=index_name):
            print(f"Index {index_name} already exists.")
        else:
            index_settings = self.index(dims_length=dims_length)
            self._es_connection.indices.create(index=index_name, **index_settings)

    def index(self, dims_length):
        # Mapping for the index: a dense_vector field sized to the embedding
        # model's output, indexed for cosine-similarity kNN search.
        return {
            "mappings": {
                "properties": {
                    "vector": {
                        "type": "dense_vector",
                        "dims": dims_length,
                        "index": True,
                        "similarity": "cosine",
                    },
                }
            }
        }

    def add_texts(
        self,
        texts,
        metadatas=None,
        ids=None,
        refresh_indices=True,
        create_index_if_not_exists=True,
        bulk_kwargs=None,
        **kwargs,
    ):
        from elasticsearch.helpers import BulkIndexError, bulk

        bulk_kwargs = bulk_kwargs or {}
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        requests = []
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
        vectors = embeddings.embed_documents(list(texts))

        # The embedding dimension determines the dense_vector mapping size.
        dims_length = len(vectors[0])

        if create_index_if_not_exists:
            self._create_index_if_not_exists(
                index_name=self.index_name, dims_length=dims_length
            )

        for i, (text, vector) in enumerate(zip(texts, vectors)):
            metadata = metadatas[i] if metadatas else {}
            requests.append(
                {
                    "_op_type": "index",
                    "_index": self.index_name,
                    "text": text,
                    "vector": vector,
                    "metadata": metadata,
                    "_id": ids[i],
                }
            )

        if len(requests) > 0:
            try:
                bulk(
                    self._es_connection,
                    requests,
                    stats_only=True,
                    refresh=refresh_indices,
                    **bulk_kwargs,
                )
                return ids
            except BulkIndexError as e:
                print(f"Error adding texts: {e}")
                first_error = e.errors[0].get("index", {}).get("error", {})
                print(f"First error reason: {first_error.get('reason')}")
                raise e
        else:
            return []
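    # Illustrative usage (a sketch; the texts and metadata are placeholders):
    #
    #     ids = store.add_texts(
    #         ["first passage", "second passage"],
    #         metadatas=[{"store": store.path}, {"store": store.path}],
    #     )
    #     # Each returned id is a generated UUID unless ids= was supplied.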

    def delete_index(self):
        # Despite the name, this deletes only this store's documents (matched
        # by metadata.store.keyword), not the whole Elasticsearch index.
        self._es_connection.delete_by_query(
            index=self.index_name,
            query={"match": {"metadata.store.keyword": self.path}},
        )