You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/vectorstores/manticore_search.py

373 lines
12 KiB
Python

from __future__ import annotations
import json
import logging
import uuid
from hashlib import sha1
from typing import Any, Dict, Iterable, List, Optional, Type
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseSettings
from langchain_core.vectorstores import VectorStore
logger = logging.getLogger()
DEFAULT_K = 4 # Number of Documents to return.
class ManticoreSearchSettings(BaseSettings):
proto: str = "http"
host: str = "localhost"
port: int = 9308
username: Optional[str] = None
password: Optional[str] = None
# database: str = "Manticore"
table: str = "langchain"
column_map: Dict[str, str] = {
"id": "id",
"uuid": "uuid",
"document": "document",
"embedding": "embedding",
"metadata": "metadata",
}
# A mandatory setting; currently, only hnsw is supported.
knn_type: str = "hnsw"
# A mandatory setting that specifies the dimensions of the vectors being indexed.
knn_dims: Optional[int] = None # Defaults autodetect
# A mandatory setting that specifies the distance function used by the HNSW index.
hnsw_similarity: str = "L2" # Acceptable values are: L2, IP, COSINE
# An optional setting that defines the maximum amount of outgoing connections
# in the graph.
hnsw_m: int = 16 # The default is 16.
# An optional setting that defines a construction time/accuracy trade-off.
hnsw_ef_construction = 100
def get_connection_string(self) -> str:
return self.proto + "://" + self.host + ":" + str(self.port)
def __getitem__(self, item: str) -> Any:
return getattr(self, item)
class Config:
env_file = ".env"
env_prefix = "manticore_"
env_file_encoding = "utf-8"
class ManticoreSearch(VectorStore):
"""
`ManticoreSearch Engine` vector store.
To use, you should have the ``manticoresearch`` python package installed.
Example:
.. code-block:: python
from langchain_community.vectorstores import Manticore
from langchain_community.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
vectorstore = ManticoreSearch(embeddings)
"""
def __init__(
self,
embedding: Embeddings,
*,
config: Optional[ManticoreSearchSettings] = None,
**kwargs: Any,
) -> None:
"""
ManticoreSearch Wrapper to LangChain
Args:
embedding (Embeddings): Text embedding model.
config (ManticoreSearchSettings): Configuration of ManticoreSearch Client
**kwargs: Other keyword arguments will pass into Configuration of API client
manticoresearch-python. See
https://github.com/manticoresoftware/manticoresearch-python for more.
"""
try:
import manticoresearch.api as ENDPOINTS
import manticoresearch.api_client as API
except ImportError:
raise ImportError(
"Could not import manticoresearch python package. "
"Please install it with `pip install manticoresearch-dev`."
)
try:
from tqdm import tqdm
self.pgbar = tqdm
except ImportError:
# Just in case if tqdm is not installed
self.pgbar = lambda x, **kwargs: x
super().__init__()
self.embedding = embedding
if config is not None:
self.config = config
else:
self.config = ManticoreSearchSettings()
assert self.config
assert self.config.host and self.config.port
assert (
self.config.column_map
# and self.config.database
and self.config.table
)
assert (
self.config.knn_type
# and self.config.knn_dims
# and self.config.hnsw_m
# and self.config.hnsw_ef_construction
and self.config.hnsw_similarity
)
for k in ["id", "embedding", "document", "metadata", "uuid"]:
assert k in self.config.column_map
# Detect embeddings dimension
if self.config.knn_dims is None:
self.dim: int = len(self.embedding.embed_query("test"))
else:
self.dim = self.config.knn_dims
# Initialize the schema
self.schema = f"""\
CREATE TABLE IF NOT EXISTS {self.config.table}(
{self.config.column_map['id']} bigint,
{self.config.column_map['document']} text indexed stored,
{self.config.column_map['embedding']} \
float_vector knn_type='{self.config.knn_type}' \
knn_dims='{self.dim}' \
hnsw_similarity='{self.config.hnsw_similarity}' \
hnsw_m='{self.config.hnsw_m}' \
hnsw_ef_construction='{self.config.hnsw_ef_construction}',
{self.config.column_map['metadata']} json,
{self.config.column_map['uuid']} text indexed stored
)\
"""
# Create a connection to ManticoreSearch
self.configuration = API.Configuration(
host=self.config.get_connection_string(),
username=self.config.username,
password=self.config.password,
# disabled_client_side_validations=",",
**kwargs,
)
self.connection = API.ApiClient(self.configuration)
self.client = {
"index": ENDPOINTS.IndexApi(self.connection),
"utils": ENDPOINTS.UtilsApi(self.connection),
"search": ENDPOINTS.SearchApi(self.connection),
}
# Create default schema if not exists
self.client["utils"].sql(self.schema)
@property
def embeddings(self) -> Embeddings:
return self.embedding
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
*,
batch_size: int = 32,
text_ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""
Insert more texts through the embeddings and add to the VectorStore.
Args:
texts: Iterable of strings to add to the VectorStore
metadata: Optional column data to be inserted
batch_size: Batch size of insertion
ids: Optional list of ids to associate with the texts
Returns:
List of ids from adding the texts into the VectorStore.
"""
# Embed and create the documents
ids = text_ids or [
# See https://stackoverflow.com/questions/67219691/python-hash-function-that-returns-32-or-64-bits
str(int(sha1(t.encode("utf-8")).hexdigest()[:15], 16))
for t in texts
]
transac = []
for i, text in enumerate(texts):
embed = self.embeddings.embed_query(text)
doc_uuid = str(uuid.uuid1())
doc = {
self.config.column_map["document"]: text,
self.config.column_map["embedding"]: embed,
self.config.column_map["metadata"]: metadatas[i] if metadatas else {},
self.config.column_map["uuid"]: doc_uuid,
}
transac.append(
{"replace": {"index": self.config.table, "id": ids[i], "doc": doc}}
)
if len(transac) == batch_size:
body = "\n".join(map(json.dumps, transac))
try:
self.client["index"].bulk(body)
transac = []
except Exception as e:
logger.info(f"Error indexing documents: {e}")
if len(transac) > 0:
body = "\n".join(map(json.dumps, transac))
try:
self.client["index"].bulk(body)
except Exception as e:
logger.info(f"Error indexing documents: {e}")
return ids
@classmethod
def from_texts(
cls: Type[ManticoreSearch],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[Dict[Any, Any]]] = None,
*,
config: Optional[ManticoreSearchSettings] = None,
text_ids: Optional[List[str]] = None,
batch_size: int = 32,
**kwargs: Any,
) -> ManticoreSearch:
ctx = cls(embedding, config=config, **kwargs)
ctx.add_texts(
texts=texts,
embedding=embedding,
text_ids=text_ids,
batch_size=batch_size,
metadatas=metadatas,
**kwargs,
)
return ctx
@classmethod
def from_documents(
cls: Type[ManticoreSearch],
documents: List[Document],
embedding: Embeddings,
*,
config: Optional[ManticoreSearchSettings] = None,
text_ids: Optional[List[str]] = None,
batch_size: int = 32,
**kwargs: Any,
) -> ManticoreSearch:
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]
return cls.from_texts(
texts=texts,
embedding=embedding,
text_ids=text_ids,
batch_size=batch_size,
metadatas=metadatas,
**kwargs,
)
def __repr__(self) -> str:
"""
Text representation for ManticoreSearch Vector Store, prints backends, username
and schemas. Easy to use with `str(ManticoreSearch())`
Returns:
repr: string to show connection info and data schema
"""
_repr = f"\033[92m\033[1m{self.config.table} @ "
_repr += f"http://{self.config.host}:{self.config.port}\033[0m\n\n"
_repr += f"\033[1musername: {self.config.username}\033[0m\n\nTable Schema:\n"
_repr += "-" * 51 + "\n"
for r in self.client["utils"].sql(f"DESCRIBE {self.config.table}")[0]["data"]:
_repr += (
f"|\033[94m{r['Field']:24s}\033[0m|\033["
f"96m{r['Type'] + ' ' + r['Properties']:24s}\033[0m|\n"
)
_repr += "-" * 51 + "\n"
return _repr
def similarity_search(
self, query: str, k: int = DEFAULT_K, **kwargs: Any
) -> List[Document]:
"""Perform a similarity search with ManticoreSearch
Args:
query (str): query string
k (int, optional): Top K neighbors to retrieve. Defaults to 4.
Returns:
List[Document]: List of Documents
"""
return self.similarity_search_by_vector(
self.embedding.embed_query(query), k, **kwargs
)
def similarity_search_by_vector(
self,
embedding: List[float],
k: int = DEFAULT_K,
**kwargs: Any,
) -> List[Document]:
"""Perform a similarity search with ManticoreSearch by vectors
Args:
embedding (List[float]): Embedding vector
k (int, optional): Top K neighbors to retrieve. Defaults to 4.
Returns:
List[Document]: List of documents
"""
# Build search request
request = {
"index": self.config.table,
"knn": {
"field": self.config.column_map["embedding"],
"k": k,
"query_vector": embedding,
},
}
# Execute request and convert response to langchain.Document format
try:
return [
Document(
page_content=r["_source"][self.config.column_map["document"]],
metadata=r["_source"][self.config.column_map["metadata"]],
)
for r in self.client["search"].search(request, **kwargs).hits.hits[:k]
]
except Exception as e:
logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
return []
def drop(self) -> None:
"""
Helper function: Drop data
"""
self.client["utils"].sql(f"DROP TABLE IF EXISTS {self.config.table}")
@property
def metadata_column(self) -> str:
return self.config.column_map["metadata"]