mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
578 lines
19 KiB
Python
578 lines
19 KiB
Python
|
from __future__ import annotations
|
||
|
|
||
|
import os
|
||
|
import time
|
||
|
import uuid
|
||
|
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type
|
||
|
|
||
|
import numpy as np
|
||
|
from langchain_core.documents import Document
|
||
|
from langchain_core.embeddings import Embeddings
|
||
|
from langchain_core.vectorstores import VectorStore
|
||
|
|
||
|
if TYPE_CHECKING:
|
||
|
import vearch
|
||
|
|
||
|
# Default number of results (`k`) used by the similarity-search methods below.
DEFAULT_TOPN = 4
|
||
|
|
||
|
|
||
|
class Vearch(VectorStore):
    """Vector store backed by vearch.

    The `flag` constructor argument selects the backend: a truthy value uses
    the cluster client (`vearch_cluster.VearchCluster`), a falsy value uses
    the standalone engine (`vearch.Engine`).
    """

    # Base table/space name; `__init__` appends a random suffix to it.
    _DEFAULT_TABLE_NAME = "langchain_vearch"
    # Base database name for cluster mode; `__init__` appends a random suffix.
    _DEFAULT_CLUSTER_DB_NAME = "cluster_client_db"
    # Default `flag` value: 1 selects cluster mode, 0 standalone.
    _DEFAULT_VERSION = 1
|
||
|
|
||
|
def __init__(
    self,
    embedding_function: Embeddings,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> None:
    """Create a vearch-backed vector store.

    Args:
        embedding_function: Embedding model used for texts and queries.
        path_or_url: Cluster URL when `flag` is truthy (required); metadata
            directory when `flag` is falsy (defaults to the current working
            directory and is created if missing, together with a `log`
            sub-directory).
        table_name: Base name of the table/space; a random suffix is
            always appended.
        db_name: Base name of the cluster database; a random suffix is
            always appended. Only used in cluster mode.
        flag: 1 for cluster, 0 for standalone.
        **kwargs: Accepted for interface compatibility; not used here.

    Raises:
        ValueError: If the required vearch package cannot be imported, or
            if cluster mode is selected without `path_or_url`.
    """
    try:
        if flag:
            import vearch_cluster
        else:
            import vearch
    except ImportError:
        raise ValueError(
            "Could not import suitable python package. "
            "Please install it with `pip install vearch or vearch_cluster`."
        )

    def _suffixed(base: str) -> str:
        # Append "_" plus the last group of a fresh UUID4 to `base`.
        return base + "_" + str(uuid.uuid4()).split("-")[-1]

    if flag:
        if path_or_url is None:
            raise ValueError("Please input url of cluster")
        self.using_db_name = _suffixed(db_name or self._DEFAULT_CLUSTER_DB_NAME)
        self.url = path_or_url
        self.vearch = vearch_cluster.VearchCluster(path_or_url)
    else:
        metadata_path = (
            os.getcwd().replace("\\", "/") if path_or_url is None else path_or_url
        )
        log_path = os.path.join(metadata_path, "log")
        # The engine needs both directories to exist before it starts.
        for directory in (metadata_path, log_path):
            if not os.path.isdir(directory):
                os.makedirs(directory)
        self.vearch = vearch.Engine(metadata_path, log_path)
        self.using_metapath = metadata_path

    self.using_table_name = _suffixed(table_name or self._DEFAULT_TABLE_NAME)
    self.embedding_func = embedding_function
    self.flag = flag
|
||
|
|
||
|
@property
def embeddings(self) -> Optional[Embeddings]:
    """Return the embedding model stored on this instance."""
    embedder = self.embedding_func
    return embedder
|
||
|
|
||
|
@classmethod
def from_documents(
    cls: Type[Vearch],
    documents: List[Document],
    embedding: Embeddings,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> Vearch:
    """Build a Vearch store from documents.

    Splits each document into its page content and metadata and delegates
    to `from_texts` with all remaining arguments passed through unchanged.
    """
    page_contents = [document.page_content for document in documents]
    document_metadatas = [document.metadata for document in documents]
    store = cls.from_texts(
        texts=page_contents,
        embedding=embedding,
        metadatas=document_metadatas,
        path_or_url=path_or_url,
        table_name=table_name,
        db_name=db_name,
        flag=flag,
        **kwargs,
    )
    return store
|
||
|
|
||
|
@classmethod
def from_texts(
    cls: Type[Vearch],
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> Vearch:
    """Build a Vearch store and add `texts` (with `metadatas`) to it.

    The embedding model is handed to the constructor both as
    `embedding_function` and as `embedding`. `**kwargs` is not forwarded.
    """
    init_args: Dict[str, Any] = {
        "embedding_function": embedding,
        "embedding": embedding,
        "path_or_url": path_or_url,
        "db_name": db_name,
        "table_name": table_name,
        "flag": flag,
    }
    store = cls(**init_args)
    store.add_texts(texts=texts, metadatas=metadatas)
    return store
|
||
|
|
||
|
def _create_table(
    self,
    dim: int = 1024,
    field_list: Optional[List[dict]] = None,
) -> int:
    """Create the standalone-engine table named `self.using_table_name`.

    Args:
        dim: Dimension of the embedding vectors.
        field_list: Scalar fields to store, each a dict with a `"field"`
            name and a `"type"` of `"int"` or `"str"`. Defaults to a
            `text` and a `metadata` string field.

    Returns:
        The engine's response code: 0 for success, 1 for failure.
    """
    # At module level `vearch` is imported only under TYPE_CHECKING, so the
    # name does not exist at runtime; import it where it is used.
    import vearch

    # A fresh list per call instead of a shared mutable default argument.
    if field_list is None:
        field_list = [
            {"field": "text", "type": "str"},
            {"field": "metadata", "type": "str"},
        ]

    type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING}
    engine_info = {
        "index_size": 10000,
        "retrieval_type": "IVFPQ",
        "retrieval_param": {"ncentroids": 2048, "nsubvector": 32},
    }
    fields = [
        vearch.GammaFieldInfo(fi["field"], type_dict[fi["type"]])
        for fi in field_list
    ]
    vector_field = vearch.GammaVectorInfo(
        name="text_embedding",
        type=vearch.dataType.VECTOR,
        is_index=True,
        dimension=dim,
        model_id="",
        store_type="MemoryOnly",
        store_param={"cache_size": 10000},
        has_source=False,
    )
    return self.vearch.create_table(
        engine_info,
        name=self.using_table_name,
        fields=fields,
        vector_field=vector_field,
    )
|
||
|
|
||
|
def _create_space(
    self,
    dim: int = 1024,
) -> int:
    """Create the cluster space named `self.using_table_name`.

    The space is created inside `self.using_db_name` with two string
    properties (`text`, `metadata`) and one indexed vector property
    (`text_embedding`) of dimension `dim`.

    Args:
        dim: Dimension of the embedding vectors.

    Returns:
        The cluster client's response code: 0 for failure, 1 for success.
    """
    engine_config = {
        "name": "gamma",
        "index_size": 1,
        "retrieval_type": "FLAT",
        "retrieval_param": {
            "metric_type": "L2",
        },
    }
    property_config = {
        "text": {
            "type": "string",
        },
        "metadata": {
            "type": "string",
        },
        "text_embedding": {
            "type": "vector",
            "index": True,
            "dimension": dim,
            "store_type": "MemoryOnly",
        },
    }
    space_config = {
        "name": self.using_table_name,
        "partition_num": 1,
        "replica_num": 1,
        "engine": engine_config,
        "properties": property_config,
    }
    return self.vearch.create_space(self.using_db_name, space_config)
|
||
|
|
||
|
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Embed `texts` and store them with their metadata source.

    Each text is stored with its L2-normalised embedding and the
    `"source"` entry of its metadata dict (an empty string when the entry
    or the whole `metadatas` list is missing).

    Args:
        texts: Texts to add. May be any iterable, including a generator.
        metadatas: Optional metadata dicts, paired with `texts` by position.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List of ids from adding the texts into the vectorstore.

    Raises:
        ValueError: If no embeddings can be produced, or if creating the
            database, space or table fails.
    """
    if self.embedding_func is None:
        raise ValueError("embeddings is None")
    # Materialise once: `texts` may be a one-shot iterator, and it is needed
    # both for embedding and for building the stored records.
    text_list = list(texts)
    if not text_list:
        return []
    embeddings = self.embedding_func.embed_documents(text_list)
    if embeddings is None:
        raise ValueError("embeddings is None")
    # Without metadata every text is still stored, with an empty source.
    if metadatas is None:
        metadatas = [{} for _ in text_list]

    if self.flag:
        # Cluster mode: make sure the database and space exist first.
        if self.using_db_name not in self.vearch.list_dbs():
            if not self.vearch.create_db(self.using_db_name):
                raise ValueError("create db failed!!!")
        if self.using_table_name not in self.vearch.list_spaces(self.using_db_name):
            if not self._create_space(len(embeddings[0])):
                raise ValueError("create space failed!!!")
        docid: List[str] = []
        for text, metadata, embed in zip(text_list, metadatas, embeddings):
            embed_np = np.array(embed)
            profiles: dict[str, Any] = {
                "text": text,
                "metadata": metadata.get("source", ""),
                "text_embedding": {
                    "feature": (embed_np / np.linalg.norm(embed_np)).tolist()
                },
            }
            insert_res = self.vearch.insert_one(
                self.using_db_name, self.using_table_name, profiles
            )
            if insert_res["status"] != 200:
                # Retry exactly once; the retry's id is recorded as-is.
                insert_res = self.vearch.insert_one(
                    self.using_db_name, self.using_table_name, profiles
                )
            docid.append(insert_res["_id"])
        return docid

    # Standalone mode: the table exists once its schema file is on disk.
    table_path = os.path.join(
        self.using_metapath, self.using_table_name + ".schema"
    )
    if not os.path.exists(table_path):
        if self._create_table(len(embeddings[0])):
            raise ValueError("create table failed!!!")
    doc_items = []
    for text, metadata, embed in zip(text_list, metadatas, embeddings):
        embed_np = np.array(embed)
        profiles_v: dict[str, Any] = {
            "text": text,
            "metadata": metadata.get("source", ""),
            "text_embedding": embed_np / np.linalg.norm(embed_np),
        }
        doc_items.append(profiles_v)
    docid = self.vearch.add(doc_items)
    # Bounded wait (up to ~4 s) while the id count differs from the number of
    # embeddings. NOTE(review): presumably the engine may still be filling
    # the returned list -- confirm; otherwise this is only a delay.
    t_time = 0
    while len(docid) != len(embeddings):
        time.sleep(0.5)
        if t_time > 6:
            break
        t_time += 1
    self.vearch.dump()
    return docid
|
||
|
|
||
|
def _load(self) -> None:
    """Ask the underlying engine to load its data (standalone vearch)."""
    engine = self.vearch
    engine.load()
|
||
|
|
||
|
@classmethod
def load_local(
    cls,
    embedding: Embeddings,
    path_or_url: Optional[str] = None,
    table_name: str = _DEFAULT_TABLE_NAME,
    db_name: str = _DEFAULT_CLUSTER_DB_NAME,
    flag: int = _DEFAULT_VERSION,
    **kwargs: Any,
) -> Vearch:
    """Load a locally stored table of standalone vearch.

    Verifies that `<path_or_url>/<table_name>.schema` exists, constructs a
    store with the given arguments and calls its `_load` method.

    NOTE(review): the default `flag` selects cluster mode although this
    method targets standalone tables -- callers presumably pass `flag=0`;
    confirm.

    Returns:
        The constructed and loaded Vearch store.

    Raises:
        ValueError: If `path_or_url` or `table_name` is empty, or the
            schema file does not exist.
    """
    if not path_or_url:
        raise ValueError("No metadata path!!!")
    if not table_name:
        raise ValueError("No table name!!!")
    schema_file = os.path.join(path_or_url, table_name + ".schema")
    if not os.path.exists(schema_file):
        raise ValueError("vearch vectorbase table not exist!!!")

    store = cls(
        embedding_function=embedding,
        path_or_url=path_or_url,
        table_name=table_name,
        db_name=db_name,
        flag=flag,
    )
    store._load()
    return store
|
||
|
|
||
|
def similarity_search(
    self,
    query: str,
    k: int = DEFAULT_TOPN,
    **kwargs: Any,
) -> List[Document]:
    """Return docs most similar to query.

    Embeds `query` with the stored embedding model and delegates to
    `similarity_search_by_vector`. `**kwargs` is not forwarded.

    Raises:
        ValueError: If no embedding model is set.
    """
    embedder = self.embedding_func
    if embedder is None:
        raise ValueError("embedding_func is None!!!")
    query_vector = embedder.embed_query(query)
    return self.similarity_search_by_vector(query_vector, k)
|
||
|
|
||
|
def similarity_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_TOPN,
    **kwargs: Any,
) -> List[Document]:
    """Return the k documents most similar to an embedding vector.

    The vector is L2-normalised before it is sent to the backend.

    Args:
        embedding: Embedding vector of the query.
        k: Number of documents to request.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        Documents built from each hit's `text` field, with the hit's
        `metadata` field stored under the `"source"` metadata key.
    """
    query_vector = np.array(embedding)
    unit_vector = query_vector / np.linalg.norm(query_vector)
    in_cluster = bool(self.flag)

    if in_cluster:
        request = {
            "query": {
                "sum": [
                    {
                        "field": "text_embedding",
                        "feature": unit_vector.tolist(),
                    }
                ],
            },
            "size": k,
            "fields": ["text", "metadata"],
        }
        response = self.vearch.search(
            self.using_db_name, self.using_table_name, request
        )
        hits = response["hits"]["hits"]
    else:
        request = {
            "vector": [
                {
                    "field": "text_embedding",
                    "feature": unit_vector,
                }
            ],
            "fields": [],
            "is_brute_search": 1,
            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
            "topn": k,
        }
        response = self.vearch.search(request)
        hits = response[0]["result_items"]

    documents = []
    for hit in hits:
        # Cluster hits wrap the stored fields in "_source".
        fields = hit["_source"] if in_cluster else hit
        page_content = fields["text"] if "text" in fields else ""
        metadata = {}
        if "metadata" in fields:
            metadata["source"] = fields["metadata"]
        documents.append(Document(page_content=page_content, metadata=metadata))
    return documents
|
||
|
|
||
|
def similarity_search_with_score(
    self,
    query: str,
    k: int = DEFAULT_TOPN,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return the k documents most similar to `query`, with their scores.

    The query is embedded with the stored embedding model and the vector is
    L2-normalised before it is sent to the backend.

    Args:
        query: Text to search for.
        k: Number of documents to request.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        `(document, score)` pairs. The score is the hit's `_score` in
        cluster mode and its `score` field otherwise; a hit that carries no
        score is reported with 0.0.

    Raises:
        ValueError: If no embedding model is set.
    """
    if self.embedding_func is None:
        raise ValueError("embedding_func is None!!!")
    embeddings = self.embedding_func.embed_query(query)
    embed = np.array(embeddings)
    unit_vector = embed / np.linalg.norm(embed)

    if self.flag:
        query_data = {
            "query": {
                "sum": [
                    {
                        "field": "text_embedding",
                        "feature": unit_vector.tolist(),
                    }
                ],
            },
            "size": k,
            "fields": ["text_embedding", "text", "metadata"],
        }
        query_result = self.vearch.search(
            self.using_db_name, self.using_table_name, query_data
        )
        res = query_result["hits"]["hits"]
    else:
        query_data = {
            "vector": [
                {
                    "field": "text_embedding",
                    "feature": unit_vector,
                }
            ],
            "fields": [],
            "is_brute_search": 1,
            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
            "topn": k,
        }
        query_result = self.vearch.search(query_data)
        res = query_result[0]["result_items"]

    results: List[Tuple[Document, float]] = []
    for item in res:
        # Reset per hit so a hit without a score neither raises
        # UnboundLocalError nor inherits the previous hit's score.
        score = 0.0
        if self.flag:
            # Cluster hits carry the score beside the wrapped fields.
            score = item["_score"]
            item = item["_source"]
        content = item["text"] if "text" in item else ""
        meta_data = {}
        if "metadata" in item:
            meta_data["source"] = item["metadata"]
        if self.flag != 1 and "score" in item:
            score = item["score"]
        results.append((Document(page_content=content, metadata=meta_data), score))
    return results
|
||
|
|
||
|
def _similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Return `similarity_search_with_score` results unchanged."""
    scored_documents = self.similarity_search_with_score(query, k, **kwargs)
    return scored_documents
|
||
|
|
||
|
def delete(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> Optional[bool]:
    """Delete the documents with the given ids.

    Every id is sent to the backend, even after an earlier one fails.

    Args:
        ids: The ids of the documents to delete.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        None when `ids` is None or empty; otherwise True if every backend
        call returned 0 and False if any did not.
    """
    if not ids:
        return None

    if self.flag:
        outcomes = [
            self.vearch.delete(self.using_db_name, self.using_table_name, doc_id)
            for doc_id in ids
        ]
    else:
        outcomes = [self.vearch.del_doc(doc_id) for doc_id in ids]
    return all(code == 0 for code in outcomes)
|
||
|
|
||
|
def get(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> Dict[str, Document]:
    """Return the stored documents for the given ids.

    Args:
        ids: The ids of the documents to fetch.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        Mapping from each found record's `_id` to its Document. Ids the
        backend does not find are left out; an empty mapping is returned
        when `ids` is None or empty.
    """

    def _as_document(fields: Any) -> Document:
        # `text` becomes the page content; `metadata` is stored under "source".
        page_content = fields["text"] if "text" in fields else ""
        metadata = {}
        if "metadata" in fields:
            metadata["source"] = fields["metadata"]
        return Document(page_content=page_content, metadata=metadata)

    found: Dict[str, Document] = {}
    if not ids:
        return found

    if self.flag:
        # Cluster mode: one batched lookup for all ids.
        lookup = {"query": {"ids": ids}}
        records = self.vearch.mget_by_ids(
            self.using_db_name, self.using_table_name, lookup
        )
        for record in records:
            if record["found"] is False:
                continue
            found[record["_id"]] = _as_document(record["_source"])
    else:
        # Standalone mode: one lookup per id; an empty dict means "not found".
        for doc_id in ids:
            detail = self.vearch.get_doc_by_id(doc_id)
            if detail == {}:
                continue
            found[detail["_id"]] = _as_document(detail)
    return found
|