You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/vectorstores/vearch.py

578 lines
19 KiB
Python

from __future__ import annotations
import os
import time
import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
if TYPE_CHECKING:
import vearch
# Default number of results (``k``) used by the similarity search methods.
DEFAULT_TOPN = 4
class Vearch(VectorStore):
    """Vector store backed by Vearch, in cluster or standalone mode.

    ``flag`` selects the backend: a truthy value uses the ``vearch_cluster``
    client against a cluster URL, a falsy value uses the embedded ``vearch``
    engine that keeps its files under a local directory.

    Only the text and the ``"source"`` entry of each metadata dict are
    written to the store; other metadata keys are not persisted.
    """

    _DEFAULT_TABLE_NAME = "langchain_vearch"
    _DEFAULT_CLUSTER_DB_NAME = "cluster_client_db"
    _DEFAULT_VERSION = 1

    def __init__(
        self,
        embedding_function: Embeddings,
        path_or_url: Optional[str] = None,
        table_name: str = _DEFAULT_TABLE_NAME,
        db_name: str = _DEFAULT_CLUSTER_DB_NAME,
        flag: int = _DEFAULT_VERSION,
        **kwargs: Any,
    ) -> None:
        """Initialize the Vearch vector store.

        Args:
            embedding_function: Embeddings object used for texts and queries.
            path_or_url: Cluster URL when ``flag`` is truthy (required);
                otherwise the local metadata directory (defaults to the
                current working directory).
            table_name: Table (standalone) or space (cluster) name. An empty
                value gets a generated name with a random suffix.
            db_name: Cluster database name. An empty value gets a generated
                name with a random suffix. Unused in standalone mode.
            flag: 1 for cluster, 0 for standalone.
            **kwargs: Accepted for compatibility and ignored.

        Raises:
            ValueError: If the required client package cannot be imported,
                or if cluster mode is selected without ``path_or_url``.
        """
        try:
            if flag:
                import vearch_cluster
            else:
                import vearch
        except ImportError as err:
            # ValueError is kept for backward compatibility with existing
            # callers; the original ImportError stays attached as the cause.
            raise ValueError(
                "Could not import suitable python package. "
                "Please install it with `pip install vearch or vearch_cluster`."
            ) from err

        if flag:
            if path_or_url is None:
                raise ValueError("Please input url of cluster")
            if not db_name:
                db_name = self._DEFAULT_CLUSTER_DB_NAME
                db_name += "_"
                db_name += str(uuid.uuid4()).split("-")[-1]
            self.using_db_name = db_name
            self.url = path_or_url
            self.vearch = vearch_cluster.VearchCluster(path_or_url)
        else:
            if path_or_url is None:
                metadata_path = os.getcwd().replace("\\", "/")
            else:
                metadata_path = path_or_url
            # exist_ok avoids the check-then-create race of isdir + makedirs.
            os.makedirs(metadata_path, exist_ok=True)
            log_path = os.path.join(metadata_path, "log")
            os.makedirs(log_path, exist_ok=True)
            self.vearch = vearch.Engine(metadata_path, log_path)
            self.using_metapath = metadata_path

        if not table_name:
            table_name = self._DEFAULT_TABLE_NAME
            table_name += "_"
            table_name += str(uuid.uuid4()).split("-")[-1]
        self.using_table_name = table_name
        self.embedding_func = embedding_function
        self.flag = flag

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Return the embeddings object this store was created with."""
        return self.embedding_func

    @classmethod
    def from_documents(
        cls: Type[Vearch],
        documents: List[Document],
        embedding: Embeddings,
        path_or_url: Optional[str] = None,
        table_name: str = _DEFAULT_TABLE_NAME,
        db_name: str = _DEFAULT_CLUSTER_DB_NAME,
        flag: int = _DEFAULT_VERSION,
        **kwargs: Any,
    ) -> Vearch:
        """Build a Vearch store from documents.

        Splits each document into its ``page_content`` and ``metadata`` and
        delegates to :meth:`from_texts`.
        """
        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]
        return cls.from_texts(
            texts=texts,
            embedding=embedding,
            metadatas=metadatas,
            path_or_url=path_or_url,
            table_name=table_name,
            db_name=db_name,
            flag=flag,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls: Type[Vearch],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        path_or_url: Optional[str] = None,
        table_name: str = _DEFAULT_TABLE_NAME,
        db_name: str = _DEFAULT_CLUSTER_DB_NAME,
        flag: int = _DEFAULT_VERSION,
        **kwargs: Any,
    ) -> Vearch:
        """Build a Vearch store and add the given texts to it.

        ``**kwargs`` is accepted for interface compatibility and not used.
        """
        vearch_db = cls(
            embedding_function=embedding,
            path_or_url=path_or_url,
            db_name=db_name,
            table_name=table_name,
            flag=flag,
        )
        vearch_db.add_texts(texts=texts, metadatas=metadatas)
        return vearch_db

    def _create_table(
        self,
        dim: int = 1024,
        field_list: Optional[List[dict]] = None,
    ) -> int:
        """Create the standalone-engine table named ``self.using_table_name``.

        Args:
            dim: Dimension of the embedding vectors.
            field_list: Scalar fields to store, each a dict with ``"field"``
                (name) and ``"type"`` (``"int"`` or ``"str"``). Defaults to
                string fields ``text`` and ``metadata``.

        Returns:
            The engine's response code; callers treat a non-zero value as
            failure.
        """
        # The module-level import is TYPE_CHECKING-only and the one in
        # __init__ is local to __init__, so import here to have the name
        # available at runtime.
        import vearch

        if field_list is None:
            field_list = [
                {"field": "text", "type": "str"},
                {"field": "metadata", "type": "str"},
            ]
        type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING}
        engine_info = {
            "index_size": 10000,
            "retrieval_type": "IVFPQ",
            "retrieval_param": {"ncentroids": 2048, "nsubvector": 32},
        }
        fields = [
            vearch.GammaFieldInfo(fi["field"], type_dict[fi["type"]])
            for fi in field_list
        ]
        vector_field = vearch.GammaVectorInfo(
            name="text_embedding",
            type=vearch.dataType.VECTOR,
            is_index=True,
            dimension=dim,
            model_id="",
            store_type="MemoryOnly",
            store_param={"cache_size": 10000},
            has_source=False,
        )
        response_code = self.vearch.create_table(
            engine_info,
            name=self.using_table_name,
            fields=fields,
            vector_field=vector_field,
        )
        return response_code

    def _create_space(
        self,
        dim: int = 1024,
    ) -> int:
        """Create the cluster space named ``self.using_table_name``.

        Args:
            dim: Dimension of the embedding vectors.

        Returns:
            The client's response code; callers treat a falsy value as
            failure.
        """
        space_config = {
            "name": self.using_table_name,
            "partition_num": 1,
            "replica_num": 1,
            "engine": {
                "name": "gamma",
                "index_size": 1,
                "retrieval_type": "FLAT",
                "retrieval_param": {
                    "metric_type": "L2",
                },
            },
            "properties": {
                "text": {
                    "type": "string",
                },
                "metadata": {
                    "type": "string",
                },
                "text_embedding": {
                    "type": "vector",
                    "index": True,
                    "dimension": dim,
                    "store_type": "MemoryOnly",
                },
            },
        }
        response_code = self.vearch.create_space(self.using_db_name, space_config)
        return response_code

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and insert them into the store.

        The database/space (cluster) or table (standalone) is created on
        first use, sized to the dimension of the first embedding.

        Args:
            texts: Texts to add. Consumed exactly once, so generators work.
            metadatas: Optional metadata dicts aligned with ``texts``. Only
                the ``"source"`` entry is stored; a missing entry (or no
                metadatas at all) is stored as an empty string.
            **kwargs: Unused.

        Returns:
            List of ids from adding the texts into the vectorstore.

        Raises:
            ValueError: If no embeddings can be produced, or if creating the
                database, space or table fails.
        """
        # Materialize once: the texts are needed both for embedding and for
        # the insert loop, and a one-shot iterable would be empty the second
        # time around.
        texts = list(texts)
        if self.embedding_func is None:
            raise ValueError("embeddings is None")
        if not texts:
            # Nothing to insert; also avoids indexing embeddings[0] below.
            return []
        embeddings = self.embedding_func.embed_documents(texts)
        if embeddings is None:
            raise ValueError("embeddings is None")
        if metadatas is None:
            metadatas = [{} for _ in texts]

        if self.flag:
            dbs_list = self.vearch.list_dbs()
            if self.using_db_name not in dbs_list:
                create_db_code = self.vearch.create_db(self.using_db_name)
                if not create_db_code:
                    raise ValueError("create db failed!!!")
            space_list = self.vearch.list_spaces(self.using_db_name)
            if self.using_table_name not in space_list:
                create_space_code = self._create_space(len(embeddings[0]))
                if not create_space_code:
                    raise ValueError("create space failed!!!")
            docid = []
            for text, metadata, embed in zip(texts, metadatas, embeddings):
                profiles: dict[str, Any] = {
                    "text": text,
                    "metadata": metadata.get("source", ""),
                    "text_embedding": {
                        "feature": self._unit_vector(embed).tolist()
                    },
                }
                insert_res = self.vearch.insert_one(
                    self.using_db_name, self.using_table_name, profiles
                )
                if insert_res["status"] != 200:
                    # One retry; its result is used without a status check.
                    insert_res = self.vearch.insert_one(
                        self.using_db_name, self.using_table_name, profiles
                    )
                docid.append(insert_res["_id"])
        else:
            table_path = os.path.join(
                self.using_metapath, self.using_table_name + ".schema"
            )
            if not os.path.exists(table_path):
                dim = len(embeddings[0])
                response_code = self._create_table(dim)
                if response_code:
                    raise ValueError("create table failed!!!")
            doc_items = []
            for text, metadata, embed in zip(texts, metadatas, embeddings):
                profiles_v: dict[str, Any] = {
                    "text": text,
                    "metadata": metadata.get("source", ""),
                    "text_embedding": self._unit_vector(embed),
                }
                doc_items.append(profiles_v)
            docid = self.vearch.add(doc_items)
            # NOTE(review): docid is not refreshed inside this loop, so it
            # only acts as a bounded delay (8 sleeps of 0.5s) before dump()
            # when the id count differs from the embedding count. Kept as-is;
            # confirm whether the engine needs this delay.
            t_time = 0
            while len(docid) != len(embeddings):
                time.sleep(0.5)
                if t_time > 6:
                    break
                t_time += 1
            self.vearch.dump()
        return docid

    def _load(self) -> None:
        """Load the vearch engine for standalone vearch."""
        self.vearch.load()

    @classmethod
    def load_local(
        cls,
        embedding: Embeddings,
        path_or_url: Optional[str] = None,
        table_name: str = _DEFAULT_TABLE_NAME,
        db_name: str = _DEFAULT_CLUSTER_DB_NAME,
        flag: int = _DEFAULT_VERSION,
        **kwargs: Any,
    ) -> Vearch:
        """Load an existing local table of standalone vearch.

        Args:
            embedding: Embeddings object used for queries.
            path_or_url: Directory that holds ``<table_name>.schema``.
            table_name: Name of the table to load.
            db_name: Passed through to the constructor.
            flag: Passed through to the constructor.
            **kwargs: Unused.

        Returns:
            A Vearch instance on which ``load()`` has been called.

        Raises:
            ValueError: If the path or table name is empty, or the schema
                file does not exist.
        """
        # NOTE(review): the default flag selects cluster mode; loading a
        # local table presumably needs flag=0 -- confirm with callers.
        if not path_or_url:
            raise ValueError("No metadata path!!!")
        if not table_name:
            raise ValueError("No table name!!!")
        table_path = os.path.join(path_or_url, table_name + ".schema")
        if not os.path.exists(table_path):
            raise ValueError("vearch vectorbase table not exist!!!")
        vearch_db = cls(
            embedding_function=embedding,
            path_or_url=path_or_url,
            table_name=table_name,
            db_name=db_name,
            flag=flag,
        )
        vearch_db._load()
        return vearch_db

    @staticmethod
    def _unit_vector(vector: List[float]) -> np.ndarray:
        """Return ``vector`` as an array divided by its Euclidean norm."""
        arr = np.array(vector)
        return arr / np.linalg.norm(arr)

    @staticmethod
    def _to_document(fields: Dict[str, Any]) -> Document:
        """Build a Document from a stored record's ``text``/``metadata``."""
        meta_data = {}
        if "metadata" in fields:
            meta_data["source"] = fields["metadata"]
        return Document(page_content=fields.get("text", ""), metadata=meta_data)

    def _search_hits(
        self,
        embedding: List[float],
        k: int,
        cluster_fields: List[str],
    ) -> List[Dict[str, Any]]:
        """Run a top-``k`` vector query and return the backend's raw hits.

        ``cluster_fields`` is the field list requested from the cluster
        backend; the standalone query always sends an empty field list.
        """
        feature = self._unit_vector(embedding)
        if self.flag:
            query_data = {
                "query": {
                    "sum": [
                        {
                            "field": "text_embedding",
                            "feature": feature.tolist(),
                        }
                    ],
                },
                "size": k,
                "fields": cluster_fields,
            }
            query_result = self.vearch.search(
                self.using_db_name, self.using_table_name, query_data
            )
            return query_result["hits"]["hits"]
        query_data = {
            "vector": [
                {
                    "field": "text_embedding",
                    "feature": feature,
                }
            ],
            "fields": [],
            "is_brute_search": 1,
            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
            "topn": k,
        }
        query_result = self.vearch.search(query_data)
        return query_result[0]["result_items"]

    def similarity_search(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.

        Raises:
            ValueError: If no embedding function is configured.
        """
        if self.embedding_func is None:
            raise ValueError("embedding_func is None!!!")
        embeddings = self.embedding_func.embed_query(query)
        return self.similarity_search_by_vector(embeddings, k)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Document]:
        """Return the ``k`` documents most similar to an embedding vector.

        Args:
            embedding: Embedding vector of the query; it is normalized to
                unit length before searching.
            k: Number of documents to return.
            **kwargs: Unused.

        Returns:
            The matching documents, in the order the backend returned them.
        """
        docs = []
        for item in self._search_hits(embedding, k, ["text", "metadata"]):
            if self.flag:
                # Cluster hits wrap the stored fields under "_source".
                item = item["_source"]
            docs.append(self._to_document(item))
        return docs

    def similarity_search_with_score(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return the ``k`` documents most similar to ``query`` with scores.

        Args:
            query: Text to search for.
            k: Number of documents to return.
            **kwargs: Unused.

        Returns:
            ``(document, score)`` pairs. The score is passed through from
            the backend unchanged (``_score`` in cluster mode, ``score`` in
            standalone mode).

        Raises:
            ValueError: If no embedding function is configured.
        """
        if self.embedding_func is None:
            raise ValueError("embedding_func is None!!!")
        embeddings = self.embedding_func.embed_query(query)
        hits = self._search_hits(
            embeddings, k, ["text_embedding", "text", "metadata"]
        )
        results: List[Tuple[Document, float]] = []
        for item in hits:
            # Read the score per hit so one hit's score can never leak into
            # the next, and the mode test matches the rest of the class.
            if self.flag:
                score = item["_score"]
                item = item["_source"]
            else:
                score = item["score"]
            results.append((self._to_document(item), score))
        return results

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return documents with the backend scores used as relevance scores.

        Delegates directly to :meth:`similarity_search_with_score`; the
        scores are not rescaled.
        """
        return self.similarity_search_with_score(query, k, **kwargs)

    def delete(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Optional[bool]:
        """Delete the documents which have the specified ids.

        Args:
            ids: The ids of the embedding vectors.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            None when ``ids`` is empty or None; otherwise True if every
            backend delete call returned 0, False if any did not.
        """
        if not ids:
            return None
        tmp_res = []
        for _id in ids:
            if self.flag:
                ret = self.vearch.delete(self.using_db_name, self.using_table_name, _id)
            else:
                ret = self.vearch.del_doc(_id)
            tmp_res.append(ret)
        return all(i == 0 for i in tmp_res)

    def get(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Dict[str, Document]:
        """Return docs according to ids.

        Args:
            ids: The ids of the embedding vectors.
            **kwargs: Unused.

        Returns:
            Mapping from stored id to Document for every id that was found;
            empty when ``ids`` is empty or None.
        """
        results: Dict[str, Document] = {}
        if not ids:
            return results
        if self.flag:
            query_data = {"query": {"ids": ids}}
            docs_detail = self.vearch.mget_by_ids(
                self.using_db_name, self.using_table_name, query_data
            )
            for record in docs_detail:
                if record["found"] is False:
                    continue
                results[record["_id"]] = self._to_document(record["_source"])
        else:
            for doc_id in ids:
                docs_detail = self.vearch.get_doc_by_id(doc_id)
                # The code treats an empty dict as "id not found".
                if docs_detail == {}:
                    continue
                results[docs_detail["_id"]] = self._to_document(docs_detail)
        return results