langchain/libs/community/langchain_community/vectorstores/hanavector.py
Harrison Chase 4eda647fdd
infra: add -p to mkdir in lint steps (#17013)
Previously, if this did not find a mypy cache then it wouldnt run

this makes it always run

adding mypy ignore comments with existing uncaught issues to unblock other prs

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
2024-02-05 11:22:06 -08:00

576 lines
21 KiB
Python

"""SAP HANA Cloud Vector Engine"""
from __future__ import annotations
import importlib.util
import json
import re
from typing import (
TYPE_CHECKING,
Callable,
Iterable,
List,
Optional,
Tuple,
Type,
)
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.runnables.config import run_in_executor
from langchain_core.vectorstores import VectorStore
from langchain_community.vectorstores.utils import (
DistanceStrategy,
maximal_marginal_relevance,
)
if TYPE_CHECKING:
from hdbcli import dbapi
HANA_DISTANCE_FUNCTION: dict = {
DistanceStrategy.COSINE: ("COSINE_SIMILARITY", "DESC"),
DistanceStrategy.EUCLIDEAN_DISTANCE: ("L2DISTANCE", "ASC"),
}
default_distance_strategy = DistanceStrategy.COSINE
default_table_name: str = "EMBEDDINGS"
default_content_column: str = "VEC_TEXT"
default_metadata_column: str = "VEC_META"
default_vector_column: str = "VEC_VECTOR"
default_vector_column_length: int = -1 # -1 means dynamic length
class HanaDB(VectorStore):
"""SAP HANA Cloud Vector Engine
The prerequisite for using this class is the installation of the ``hdbcli``
Python package.
The HanaDB vectorstore can be created by providing an embedding function and
an existing database connection. Optionally, the names of the table and the
columns to use.
"""
def __init__(
self,
connection: dbapi.Connection,
embedding: Embeddings,
distance_strategy: DistanceStrategy = default_distance_strategy,
table_name: str = default_table_name,
content_column: str = default_content_column,
metadata_column: str = default_metadata_column,
vector_column: str = default_vector_column,
vector_column_length: int = default_vector_column_length,
):
# Check if the hdbcli package is installed
if importlib.util.find_spec("hdbcli") is None:
raise ImportError(
"Could not import hdbcli python package. "
"Please install it with `pip install hdbcli`."
)
valid_distance = False
for key in HANA_DISTANCE_FUNCTION.keys():
if key is distance_strategy:
valid_distance = True
if not valid_distance:
raise ValueError(
"Unsupported distance_strategy: {}".format(distance_strategy)
)
self.connection = connection
self.embedding = embedding
self.distance_strategy = distance_strategy
self.table_name = HanaDB._sanitize_name(table_name)
self.content_column = HanaDB._sanitize_name(content_column)
self.metadata_column = HanaDB._sanitize_name(metadata_column)
self.vector_column = HanaDB._sanitize_name(vector_column)
self.vector_column_length = HanaDB._sanitize_int(vector_column_length)
# Check if the table exists, and eventually create it
if not self._table_exists(self.table_name):
sql_str = (
f"CREATE TABLE {self.table_name}("
f"{self.content_column} NCLOB, "
f"{self.metadata_column} NCLOB, "
f"{self.vector_column} REAL_VECTOR "
)
if self.vector_column_length == -1:
sql_str += ");"
else:
sql_str += f"({self.vector_column_length}));"
try:
cur = self.connection.cursor()
cur.execute(sql_str)
finally:
cur.close()
# Check if the needed columns exist and have the correct type
self._check_column(self.table_name, self.content_column, ["NCLOB", "NVARCHAR"])
self._check_column(self.table_name, self.metadata_column, ["NCLOB", "NVARCHAR"])
self._check_column(
self.table_name,
self.vector_column,
["REAL_VECTOR"],
self.vector_column_length,
)
def _table_exists(self, table_name) -> bool: # type: ignore[no-untyped-def]
sql_str = (
"SELECT COUNT(*) FROM SYS.TABLES WHERE SCHEMA_NAME = CURRENT_SCHEMA"
" AND TABLE_NAME = ?"
)
try:
cur = self.connection.cursor()
cur.execute(sql_str, (table_name))
if cur.has_result_set():
rows = cur.fetchall()
if rows[0][0] == 1:
return True
finally:
cur.close()
return False
def _check_column(self, table_name, column_name, column_type, column_length=None): # type: ignore[no-untyped-def]
sql_str = (
"SELECT DATA_TYPE_NAME, LENGTH FROM SYS.TABLE_COLUMNS WHERE "
"SCHEMA_NAME = CURRENT_SCHEMA "
"AND TABLE_NAME = ? AND COLUMN_NAME = ?"
)
try:
cur = self.connection.cursor()
cur.execute(sql_str, (table_name, column_name))
if cur.has_result_set():
rows = cur.fetchall()
if len(rows) == 0:
raise AttributeError(f"Column {column_name} does not exist")
# Check data type
if rows[0][0] not in column_type:
raise AttributeError(
f"Column {column_name} has the wrong type: {rows[0][0]}"
)
# Check length, if parameter was provided
if column_length is not None:
if rows[0][1] != column_length:
raise AttributeError(
f"Column {column_name} has the wrong length: {rows[0][1]}"
)
else:
raise AttributeError(f"Column {column_name} does not exist")
finally:
cur.close()
@property
def embeddings(self) -> Embeddings:
return self.embedding
def _sanitize_name(input_str: str) -> str: # type: ignore[misc]
# Remove characters that are not alphanumeric or underscores
return re.sub(r"[^a-zA-Z0-9_]", "", input_str)
def _sanitize_int(input_int: any) -> int: # type: ignore[valid-type]
value = int(str(input_int))
if value < -1:
raise ValueError(f"Value ({value}) must not be smaller than -1")
return int(str(input_int))
def _sanitize_list_float(embedding: List[float]) -> List[float]: # type: ignore[misc]
for value in embedding:
if not isinstance(value, float):
raise ValueError(f"Value ({value}) does not have type float")
return embedding
# Compile pattern only once, for better performance
_compiled_pattern = re.compile("^[_a-zA-Z][_a-zA-Z0-9]*$")
def _sanitize_metadata_keys(metadata: dict) -> dict: # type: ignore[misc]
for key in metadata.keys():
if not HanaDB._compiled_pattern.match(key):
raise ValueError(f"Invalid metadata key {key}")
return metadata
def add_texts( # type: ignore[override]
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
embeddings: Optional[List[List[float]]] = None,
) -> List[str]:
"""Add more texts to the vectorstore.
Args:
texts (Iterable[str]): Iterable of strings/text to add to the vectorstore.
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
Defaults to None.
embeddings (Optional[List[List[float]]], optional): Optional pre-generated
embeddings. Defaults to None.
Returns:
List[str]: empty list
"""
# Create all embeddings of the texts beforehand to improve performance
if embeddings is None:
embeddings = self.embedding.embed_documents(list(texts))
cur = self.connection.cursor()
try:
# Insert data into the table
for i, text in enumerate(texts):
# Use provided values by default or fallback
metadata = metadatas[i] if metadatas else {}
embedding = (
embeddings[i]
if embeddings
else self.embedding.embed_documents([text])[0]
)
sql_str = (
f"INSERT INTO {self.table_name} ({self.content_column}, "
f"{self.metadata_column}, {self.vector_column}) "
f"VALUES (?, ?, TO_REAL_VECTOR (?));"
)
cur.execute(
sql_str,
(
text,
json.dumps(HanaDB._sanitize_metadata_keys(metadata)),
f"[{','.join(map(str, embedding))}]",
),
)
finally:
cur.close()
return []
@classmethod
def from_texts( # type: ignore[no-untyped-def, override]
cls: Type[HanaDB],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
connection: dbapi.Connection = None,
distance_strategy: DistanceStrategy = default_distance_strategy,
table_name: str = default_table_name,
content_column: str = default_content_column,
metadata_column: str = default_metadata_column,
vector_column: str = default_vector_column,
vector_column_length: int = default_vector_column_length,
):
"""Create a HanaDB instance from raw documents.
This is a user-friendly interface that:
1. Embeds documents.
2. Creates a table if it does not yet exist.
3. Adds the documents to the table.
This is intended to be a quick way to get started.
"""
instance = cls(
connection=connection,
embedding=embedding,
distance_strategy=distance_strategy,
table_name=table_name,
content_column=content_column,
metadata_column=metadata_column,
vector_column=vector_column,
vector_column_length=vector_column_length, # -1 means dynamic length
)
instance.add_texts(texts, metadatas)
return instance
def similarity_search( # type: ignore[override]
self, query: str, k: int = 4, filter: Optional[dict] = None
) -> List[Document]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: A dictionary of metadata fields and values to filter by.
Defaults to None.
Returns:
List of Documents most similar to the query
"""
docs_and_scores = self.similarity_search_with_score(
query=query, k=k, filter=filter
)
return [doc for doc, _ in docs_and_scores]
def similarity_search_with_score(
self, query: str, k: int = 4, filter: Optional[dict] = None
) -> List[Tuple[Document, float]]:
"""Return documents and score values most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: A dictionary of metadata fields and values to filter by.
Defaults to None.
Returns:
List of tuples (containing a Document and a score) that are
most similar to the query
"""
embedding = self.embedding.embed_query(query)
return self.similarity_search_with_score_by_vector(
embedding=embedding, k=k, filter=filter
)
def similarity_search_with_score_and_vector_by_vector(
self, embedding: List[float], k: int = 4, filter: Optional[dict] = None
) -> List[Tuple[Document, float, List[float]]]:
"""Return docs most similar to the given embedding.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: A dictionary of metadata fields and values to filter by.
Defaults to None.
Returns:
List of Documents most similar to the query and
score and the document's embedding vector for each
"""
result = []
k = HanaDB._sanitize_int(k)
embedding = HanaDB._sanitize_list_float(embedding)
distance_func_name = HANA_DISTANCE_FUNCTION[self.distance_strategy][0]
embedding_as_str = ",".join(map(str, embedding))
sql_str = (
f"SELECT TOP {k}"
f" {self.content_column}, " # row[0]
f" {self.metadata_column}, " # row[1]
f" TO_NVARCHAR({self.vector_column}), " # row[2]
f" {distance_func_name}({self.vector_column}, TO_REAL_VECTOR "
f" (ARRAY({embedding_as_str}))) AS CS " # row[3]
f"FROM {self.table_name}"
)
order_str = f" order by CS {HANA_DISTANCE_FUNCTION[self.distance_strategy][1]}"
where_str, query_tuple = self._create_where_by_filter(filter)
sql_str = sql_str + where_str
sql_str = sql_str + order_str
try:
cur = self.connection.cursor()
cur.execute(sql_str, query_tuple)
if cur.has_result_set():
rows = cur.fetchall()
for row in rows:
js = json.loads(row[1])
doc = Document(page_content=row[0], metadata=js)
result_vector = HanaDB._parse_float_array_from_string(row[2])
result.append((doc, row[3], result_vector))
finally:
cur.close()
return result
def similarity_search_with_score_by_vector(
self, embedding: List[float], k: int = 4, filter: Optional[dict] = None
) -> List[Tuple[Document, float]]:
"""Return docs most similar to the given embedding.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: A dictionary of metadata fields and values to filter by.
Defaults to None.
Returns:
List of Documents most similar to the query and score for each
"""
whole_result = self.similarity_search_with_score_and_vector_by_vector(
embedding=embedding, k=k, filter=filter
)
return [(result_item[0], result_item[1]) for result_item in whole_result]
def similarity_search_by_vector( # type: ignore[override]
self, embedding: List[float], k: int = 4, filter: Optional[dict] = None
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: A dictionary of metadata fields and values to filter by.
Defaults to None.
Returns:
List of Documents most similar to the query vector.
"""
docs_and_scores = self.similarity_search_with_score_by_vector(
embedding=embedding, k=k, filter=filter
)
return [doc for doc, _ in docs_and_scores]
def _create_where_by_filter(self, filter): # type: ignore[no-untyped-def]
query_tuple = []
where_str = ""
if filter:
for i, key in enumerate(filter.keys()):
if i == 0:
where_str += " WHERE "
else:
where_str += " AND "
where_str += f" JSON_VALUE({self.metadata_column}, '$.{key}') = ?"
if isinstance(filter[key], bool):
if filter[key]:
query_tuple.append("true")
else:
query_tuple.append("false")
elif isinstance(filter[key], int) or isinstance(filter[key], str):
query_tuple.append(filter[key])
else:
raise ValueError(
f"Unsupported filter data-type: {type(filter[key])}"
)
return where_str, query_tuple
def delete( # type: ignore[override]
self, ids: Optional[List[str]] = None, filter: Optional[dict] = None
) -> Optional[bool]:
"""Delete entries by filter with metadata values
Args:
ids: Deletion with ids is not supported! A ValueError will be raised.
filter: A dictionary of metadata fields and values to filter by.
An empty filter ({}) will delete all entries in the table.
Returns:
Optional[bool]: True, if deletion is technically successful.
Deletion of zero entries, due to non-matching filters is a success.
"""
if ids is not None:
raise ValueError("Deletion via ids is not supported")
if filter is None:
raise ValueError("Parameter 'filter' is required when calling 'delete'")
where_str, query_tuple = self._create_where_by_filter(filter)
sql_str = f"DELETE FROM {self.table_name} {where_str}"
try:
cur = self.connection.cursor()
cur.execute(sql_str, query_tuple)
finally:
cur.close()
return True
async def adelete( # type: ignore[override]
self, ids: Optional[List[str]] = None, filter: Optional[dict] = None
) -> Optional[bool]:
"""Delete by vector ID or other criteria.
Args:
ids: List of ids to delete.
Returns:
Optional[bool]: True if deletion is successful,
False otherwise, None if not implemented.
"""
return await run_in_executor(None, self.delete, ids=ids, filter=filter)
def max_marginal_relevance_search( # type: ignore[override]
self,
query: str,
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[dict] = None,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
query: search query text.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter: Filter on metadata properties, e.g.
{
"str_property": "foo",
"int_property": 123
}
Returns:
List of Documents selected by maximal marginal relevance.
"""
embedding = self.embedding.embed_query(query)
return self.max_marginal_relevance_search_by_vector(
embedding=embedding,
k=k,
fetch_k=fetch_k,
lambda_mult=lambda_mult,
filter=filter,
)
def _parse_float_array_from_string(array_as_string: str) -> List[float]: # type: ignore[misc]
array_wo_brackets = array_as_string[1:-1]
return [float(x) for x in array_wo_brackets.split(",")]
def max_marginal_relevance_search_by_vector( # type: ignore[override]
self,
embedding: List[float],
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[dict] = None,
) -> List[Document]:
whole_result = self.similarity_search_with_score_and_vector_by_vector(
embedding=embedding, k=fetch_k, filter=filter
)
embeddings = [result_item[2] for result_item in whole_result]
mmr_doc_indexes = maximal_marginal_relevance(
np.array(embedding), embeddings, lambda_mult=lambda_mult, k=k
)
return [whole_result[i][0] for i in mmr_doc_indexes]
async def amax_marginal_relevance_search_by_vector( # type: ignore[override]
self,
embedding: List[float],
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance."""
return await run_in_executor(
None,
self.max_marginal_relevance_search_by_vector,
embedding=embedding,
k=k,
fetch_k=fetch_k,
lambda_mult=lambda_mult,
)
@staticmethod
def _cosine_relevance_score_fn(distance: float) -> float:
return distance
def _select_relevance_score_fn(self) -> Callable[[float], float]:
"""
The 'correct' relevance function
may differ depending on a few things, including:
- the distance / similarity metric used by the VectorStore
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
- embedding dimensionality
- etc.
Vectorstores should define their own selection based method of relevance.
"""
if self.distance_strategy == DistanceStrategy.COSINE:
return HanaDB._cosine_relevance_score_fn
elif self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
return HanaDB._euclidean_relevance_score_fn
else:
raise ValueError(
"Unsupported distance_strategy: {}".format(self.distance_strategy)
)