mirror of https://github.com/hwchase17/langchain
Add Vearch vectorstore (#9846)
--------- Co-authored-by: zhanghexian1 <zhanghexian1@jd.com> Co-authored-by: Bagatur <baskaryan@gmail.com> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>pull/10291/head
parent
e93240f023
commit
62fa2bc518
@ -0,0 +1,15 @@
|
||||
# Vearch
|
||||
|
||||
Vearch is a scalable distributed system for efficient similarity search of deep learning vectors.
|
||||
|
||||
# Installation and Setup
|
||||
|
||||
Vearch Python SDK enables vearch to use locally. Vearch python sdk can be installed easily by pip install vearch.
|
||||
|
||||
# Vectorstore
|
||||
|
||||
Vearch also can used as vectorstore. Most detalis in [this notebook](docs/modules/indexes/vectorstores/examples/vearch.ipynb)
|
||||
|
||||
```python
|
||||
from langchain.vectorstores import Vearch
|
||||
```
|
@ -0,0 +1,401 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type
|
||||
|
||||
import numpy as np
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import vearch
|
||||
DEFAULT_TOPN = 4
|
||||
|
||||
|
||||
class VearchDb(VectorStore):
|
||||
_DEFAULT_TABLE_NAME = "langchain_vearch"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embedding_function: Embeddings,
|
||||
table_name: str = _DEFAULT_TABLE_NAME,
|
||||
metadata_path: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize vearch vector store"""
|
||||
try:
|
||||
import vearch
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"Could not import vearch python package. "
|
||||
"Please install it with `pip install vearch`."
|
||||
)
|
||||
|
||||
if metadata_path is None:
|
||||
metadata_path = os.getcwd().replace("\\", "/")
|
||||
if not os.path.isdir(metadata_path):
|
||||
os.makedirs(metadata_path)
|
||||
log_path = os.path.join(metadata_path, "log")
|
||||
if not os.path.isdir(log_path):
|
||||
os.makedirs(log_path)
|
||||
self.vearch_engine = vearch.Engine(metadata_path, log_path)
|
||||
|
||||
if not table_name:
|
||||
table_name = self._DEFAULT_TABLE_NAME
|
||||
table_name += "_"
|
||||
table_name += str(uuid.uuid4()).split("-")[-1]
|
||||
self.using_table_name = table_name
|
||||
self.using_metapath = metadata_path
|
||||
self.embedding_func = embedding_function
|
||||
|
||||
@property
|
||||
def embeddings(self) -> Optional[Embeddings]:
|
||||
return self.embedding_func
|
||||
|
||||
@classmethod
|
||||
def from_documents(
|
||||
cls: Type[VearchDb],
|
||||
documents: List[Document],
|
||||
embedding: Embeddings,
|
||||
table_name: str = "langchain_vearch",
|
||||
metadata_path: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> VearchDb:
|
||||
"""Return Vearch VectorStore"""
|
||||
|
||||
texts = [d.page_content for d in documents]
|
||||
metadatas = [d.metadata for d in documents]
|
||||
|
||||
return cls.from_texts(
|
||||
texts=texts,
|
||||
embedding=embedding,
|
||||
metadatas=metadatas,
|
||||
table_name=table_name,
|
||||
metadata_path=metadata_path,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
cls: Type[VearchDb],
|
||||
texts: List[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
table_name: str = _DEFAULT_TABLE_NAME,
|
||||
metadata_path: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> VearchDb:
|
||||
"""Return Vearch VectorStore"""
|
||||
|
||||
vearch_db = cls(
|
||||
embedding_function=embedding,
|
||||
table_name=table_name,
|
||||
metadata_path=metadata_path,
|
||||
)
|
||||
vearch_db.add_texts(texts=texts, metadatas=metadatas)
|
||||
return vearch_db
|
||||
|
||||
def _create_table(
|
||||
self,
|
||||
dim: int = 1024,
|
||||
filed_list: List[dict] = [
|
||||
{"filed": "text", "type": "str"},
|
||||
{"filed": "metadata", "type": "str"},
|
||||
],
|
||||
) -> int:
|
||||
"""
|
||||
Create VectorStore Table
|
||||
Args:
|
||||
dim:dimension of vector
|
||||
fileds_list: the filed you want to store
|
||||
Return:
|
||||
code,0 for success,1 for failed
|
||||
"""
|
||||
type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING}
|
||||
engine_info = {
|
||||
"index_size": 10000,
|
||||
"retrieval_type": "IVFPQ",
|
||||
"retrieval_param": {"ncentroids": 2048, "nsubvector": 32},
|
||||
}
|
||||
fields = [
|
||||
vearch.GammaFieldInfo(fi["filed"], type_dict[fi["type"]])
|
||||
for fi in filed_list
|
||||
]
|
||||
vector_field = vearch.GammaVectorInfo(
|
||||
name="text_embedding",
|
||||
type=vearch.dataType.VECTOR,
|
||||
is_index=True,
|
||||
dimension=dim,
|
||||
model_id="",
|
||||
store_type="MemoryOnly",
|
||||
store_param={"cache_size": 10000},
|
||||
has_source=False,
|
||||
)
|
||||
response_code = self.vearch_engine.create_table(
|
||||
engine_info,
|
||||
name=self.using_table_name,
|
||||
fields=fields,
|
||||
vector_field=vector_field,
|
||||
)
|
||||
return response_code
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
"""
|
||||
embeddings = None
|
||||
if self.embedding_func is not None:
|
||||
embeddings = self.embedding_func.embed_documents(list(texts))
|
||||
table_path = os.path.join(
|
||||
self.using_metapath, self.using_table_name + ".schema"
|
||||
)
|
||||
if not os.path.exists(table_path):
|
||||
if embeddings is None:
|
||||
raise ValueError("embeddings is None")
|
||||
dim = len(embeddings[0])
|
||||
response_code = self._create_table(dim)
|
||||
if response_code:
|
||||
raise ValueError("create table failed!!!")
|
||||
if embeddings is not None and metadatas is not None:
|
||||
doc_items = []
|
||||
for text, metadata, embed in zip(texts, metadatas, embeddings):
|
||||
profiles: dict[str, Any] = {}
|
||||
profiles["text"] = text
|
||||
profiles["metadata"] = metadata["source"]
|
||||
profiles["text_embedding"] = embed
|
||||
doc_items.append(profiles)
|
||||
|
||||
docid = self.vearch_engine.add(doc_items)
|
||||
t_time = 0
|
||||
while len(docid) != len(embeddings):
|
||||
time.sleep(0.5)
|
||||
if t_time > 6:
|
||||
break
|
||||
t_time += 1
|
||||
self.vearch_engine.dump()
|
||||
return docid
|
||||
|
||||
def _load(self) -> None:
|
||||
"""
|
||||
load vearch engine
|
||||
"""
|
||||
self.vearch_engine.load()
|
||||
|
||||
@classmethod
|
||||
def load_local(
|
||||
cls,
|
||||
embedding: Embeddings,
|
||||
table_name: str = _DEFAULT_TABLE_NAME,
|
||||
metadata_path: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> VearchDb:
|
||||
"""Load the local specified table.
|
||||
Returns:
|
||||
Success or failure of loading the local specified table
|
||||
"""
|
||||
if not metadata_path:
|
||||
raise ValueError("No metadata path!!!")
|
||||
if not table_name:
|
||||
raise ValueError("No table name!!!")
|
||||
table_path = os.path.join(metadata_path, table_name + ".schema")
|
||||
if not os.path.exists(table_path):
|
||||
raise ValueError("vearch vectorbase table not exist!!!")
|
||||
vearch_db = cls(
|
||||
embedding_function=embedding,
|
||||
table_name=table_name,
|
||||
metadata_path=metadata_path,
|
||||
)
|
||||
vearch_db._load()
|
||||
return vearch_db
|
||||
|
||||
def similarity_search(
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_TOPN,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""
|
||||
Return docs most similar to query.
|
||||
|
||||
"""
|
||||
if self.vearch_engine is None:
|
||||
raise ValueError("Vearch engine is None!!!")
|
||||
if self.embedding_func is None:
|
||||
raise ValueError("embedding_func is None!!!")
|
||||
embeddings = self.embedding_func.embed_query(query)
|
||||
docs = self.similarity_search_by_vector(embeddings, k)
|
||||
return docs
|
||||
|
||||
def similarity_search_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = DEFAULT_TOPN,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""The most k similar documents and scores of the specified query.
|
||||
|
||||
Args:
|
||||
embeddings: embedding vector of the query.
|
||||
k: The k most similar documents to the text query.
|
||||
min_score: the score of similar documents to the text query
|
||||
Returns:
|
||||
The k most similar documents to the specified text query.
|
||||
0 is dissimilar, 1 is the most similar.
|
||||
"""
|
||||
query_data = {
|
||||
"vector": [
|
||||
{
|
||||
"field": "text_embedding",
|
||||
"feature": np.array(embedding),
|
||||
}
|
||||
],
|
||||
"fields": [],
|
||||
"is_brute_search": 1,
|
||||
"retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
|
||||
"topn": k,
|
||||
}
|
||||
query_result = self.vearch_engine.search(query_data)
|
||||
docs = []
|
||||
for item in query_result[0]["result_items"]:
|
||||
content = ""
|
||||
meta_data = {}
|
||||
for item_key in item:
|
||||
if item_key == "text":
|
||||
content = item[item_key]
|
||||
continue
|
||||
if item_key == "metadata":
|
||||
meta_data["source"] = item[item_key]
|
||||
continue
|
||||
docs.append(Document(page_content=content, metadata=meta_data))
|
||||
return docs
|
||||
|
||||
def similarity_search_with_score(
|
||||
self,
|
||||
query: str,
|
||||
k: int = DEFAULT_TOPN,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""The most k similar documents and scores of the specified query.
|
||||
|
||||
Args:
|
||||
embeddings: embedding vector of the query.
|
||||
k: The k most similar documents to the text query.
|
||||
min_score: the score of similar documents to the text query
|
||||
Returns:
|
||||
The k most similar documents to the specified text query.
|
||||
0 is dissimilar, 1 is the most similar.
|
||||
"""
|
||||
if self.embedding_func is None:
|
||||
raise ValueError("embedding_func is None!!!")
|
||||
embeddings = self.embedding_func.embed_query(query)
|
||||
query_data = {
|
||||
"vector": [
|
||||
{
|
||||
"field": "text_embedding",
|
||||
"feature": np.array(embeddings),
|
||||
}
|
||||
],
|
||||
"fields": [],
|
||||
"is_brute_search": 1,
|
||||
"retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
|
||||
"topn": k,
|
||||
}
|
||||
query_result = self.vearch_engine.search(query_data)
|
||||
results: List[Tuple[Document, float]] = []
|
||||
for item in query_result[0]["result_items"]:
|
||||
content = ""
|
||||
meta_data = {}
|
||||
for item_key in item:
|
||||
if item_key == "text":
|
||||
content = item[item_key]
|
||||
continue
|
||||
if item_key == "metadata":
|
||||
meta_data["source"] = item[item_key]
|
||||
continue
|
||||
if item_key == "score":
|
||||
score = item[item_key]
|
||||
continue
|
||||
tmp_res = (Document(page_content=content, metadata=meta_data), score)
|
||||
results.append(tmp_res)
|
||||
return results
|
||||
|
||||
def _similarity_search_with_relevance_scores(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
return self.similarity_search_with_score(query, k, **kwargs)
|
||||
|
||||
def delete(
|
||||
self,
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Optional[bool]:
|
||||
"""Delete the documents which have the specified ids.
|
||||
|
||||
Args:
|
||||
ids: The ids of the embedding vectors.
|
||||
**kwargs: Other keyword arguments that subclasses might use.
|
||||
Returns:
|
||||
Optional[bool]: True if deletion is successful.
|
||||
False otherwise, None if not implemented.
|
||||
"""
|
||||
if self.vearch_engine is None:
|
||||
raise ValueError("Verach Engine is None!!!")
|
||||
ret: Optional[bool] = None
|
||||
tmp_res = []
|
||||
if ids is None or ids.__len__() == 0:
|
||||
return ret
|
||||
for _id in ids:
|
||||
ret = self.vearch_engine.del_doc(_id)
|
||||
tmp_res.append(ret)
|
||||
ret = all(i == 0 for i in tmp_res)
|
||||
return ret
|
||||
|
||||
def get(
|
||||
self,
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Document]:
|
||||
"""Return docs according ids.
|
||||
|
||||
Args:
|
||||
ids: The ids of the embedding vectors.
|
||||
Returns:
|
||||
Documents which satisfy the input conditions.
|
||||
"""
|
||||
|
||||
if self.vearch_engine is None:
|
||||
raise ValueError("vearch engine is None!!!")
|
||||
results: Dict[str, Document] = {}
|
||||
if ids is None or ids.__len__() == 0:
|
||||
return results
|
||||
for id in ids:
|
||||
docs_detail = self.vearch_engine.get_doc_by_id(id)
|
||||
if docs_detail == {}:
|
||||
continue
|
||||
|
||||
content = ""
|
||||
meta_info = {}
|
||||
for field in docs_detail:
|
||||
if field == "text":
|
||||
content = docs_detail[field]
|
||||
continue
|
||||
elif field == "metadata":
|
||||
meta_info["source"] = docs_detail[field]
|
||||
continue
|
||||
results[docs_detail["_id"]] = Document(
|
||||
page_content=content, metadata=meta_info
|
||||
)
|
||||
return results
|
Loading…
Reference in New Issue