You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/langchain_community/vectorstores/lancedb.py

682 lines
24 KiB
Python

from __future__ import annotations
import base64
import os
import uuid
import warnings
from typing import Any, Callable, Dict, Iterable, List, Optional, Type
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils import guard_import
from langchain_core.vectorstores import VectorStore
from langchain_community.vectorstores.utils import maximal_marginal_relevance
DEFAULT_K = 4  # Default number of Documents to return when no ``k`` is given.
def import_lancedb() -> Any:
    """Load the ``lancedb`` package via ``guard_import`` and return the module."""
    lancedb_module = guard_import("lancedb")
    return lancedb_module
def to_lance_filter(filter: Dict[str, str]) -> str:
    """Convert a dict of equality conditions into a LanceDB filter string.

    Every ``key: value`` pair becomes ``key = 'value'`` and the resulting
    clauses are joined with ``AND``.

    Args:
        filter: Mapping of column name to the value that column must equal.

    Returns:
        The filter expression, or an empty string for an empty mapping.
    """
    clauses = []
    for column, value in filter.items():
        # Double any embedded single quote so a value such as "O'Brien"
        # cannot close the string literal early and corrupt the expression.
        escaped = str(value).replace("'", "''")
        clauses.append(f"{column} = '{escaped}'")
    return " AND ".join(clauses)
class LanceDB(VectorStore):
"""`LanceDB` vector store.
To use, you should have ``lancedb`` python package installed.
You can install it with ``pip install lancedb``.
Args:
connection: LanceDB connection to use. If not provided, a new connection
will be created.
embedding: Embedding to use for the vectorstore.
vector_key: Key to use for the vector in the database. Defaults to ``vector``.
id_key: Key to use for the id in the database. Defaults to ``id``.
text_key: Key to use for the text in the database. Defaults to ``text``.
table_name: Name of the table to use. Defaults to ``vectorstore``.
api_key: API key to use for LanceDB cloud database.
region: Region to use for LanceDB cloud database.
mode: Mode to use for adding data to the table. Defaults to ``overwrite``.
Example:
.. code-block:: python
vectorstore = LanceDB(uri='/lancedb', embedding=embedding_function)
vectorstore.add_texts(['text1', 'text2'])
result = vectorstore.similarity_search('text1')
"""
def __init__(
    self,
    connection: Optional[Any] = None,
    embedding: Optional[Embeddings] = None,
    uri: Optional[str] = "/tmp/lancedb",
    vector_key: Optional[str] = "vector",
    id_key: Optional[str] = "id",
    text_key: Optional[str] = "text",
    table_name: Optional[str] = "vectorstore",
    api_key: Optional[str] = None,
    region: Optional[str] = None,
    mode: Optional[str] = "overwrite",
    table: Optional[Any] = None,
    distance: Optional[str] = "l2",
    reranker: Optional[Any] = None,
    relevance_score_fn: Optional[Callable[[float], float]] = None,
    limit: int = DEFAULT_K,
):
    """Initialize with Lance DB vectorstore.

    Args:
        connection: Existing ``lancedb.db.LanceDBConnection``. When it is not
            one, a connection is opened from ``uri``.
        embedding: Embedding object; required.
        uri: Location handed to ``lancedb.connect`` when no connection is
            supplied. A ``db://`` uri requires an API key.
        vector_key: Column name used for vectors.
        id_key: Column name used for ids.
        text_key: Column name used for the text.
        table_name: Table opened as the default table when ``table`` is not
            given.
        api_key: API key; falls back to the ``LANCE_API_KEY`` environment
            variable. An empty string disables the key.
        region: Region passed to ``lancedb.connect`` for ``db://`` uris.
        mode: Mode passed to ``Table.add`` when no API key is in use.
        table: Existing ``LanceTable`` / ``RemoteTable`` to use directly.
        distance: Metric name used to choose the relevance score function.
        reranker: ``lancedb.rerankers.Reranker`` applied to hybrid queries.
        relevance_score_fn: Overrides the relevance score function.
        limit: Default number of results when a search is given no ``k``.

    Raises:
        ValueError: If ``reranker``, ``connection`` or ``table`` has an
            unsupported type, if ``embedding`` is missing, or if a ``db://``
            uri is used without an API key.
    """
    lancedb = guard_import("lancedb")
    self._embedding = embedding
    self._vector_key = vector_key
    self._id_key = id_key
    self._text_key = text_key
    # An explicit empty string means "no key"; otherwise use the argument and
    # fall back to the environment.
    if api_key == "":
        self.api_key = None
    else:
        self.api_key = api_key or os.getenv("LANCE_API_KEY")
    self.region = region
    self.mode = mode
    self.distance = distance
    self.override_relevance_score_fn = relevance_score_fn
    self.limit = limit
    self._fts_index = None

    if reranker is not None and not isinstance(
        reranker, lancedb.rerankers.Reranker
    ):
        raise ValueError(
            "`reranker` has to be a lancedb.rerankers.Reranker object."
        )
    self._reranker = reranker

    is_cloud_uri = isinstance(uri, str) and uri.startswith("db://")
    if is_cloud_uri and self.api_key is None:
        raise ValueError("API key is required for LanceDB cloud.")

    if self._embedding is None:
        raise ValueError("embedding object should be provided")

    if isinstance(connection, lancedb.db.LanceDBConnection):
        self._connection = connection
    elif isinstance(connection, (str, lancedb.db.LanceTable)):
        raise ValueError(
            "`connection` has to be a lancedb.db.LanceDBConnection object. "
            "`lancedb.db.LanceTable` is deprecated."
        )
    elif is_cloud_uri:
        # The guard above guarantees an API key is present here.
        self._connection = lancedb.connect(
            uri, api_key=self.api_key, region=self.region
        )
    else:
        # Always assign a connection: previously a non-string uri combined
        # with an API key left ``_connection`` unset.
        self._connection = lancedb.connect(uri)
        if self.api_key is not None:
            warnings.warn(
                "api key provided with local uri. "
                "The data will be stored locally"
            )

    if table is not None:
        # Explicit check instead of ``assert``, which is removed under -O.
        if not isinstance(
            table, (lancedb.db.LanceTable, lancedb.remote.table.RemoteTable)
        ):
            raise ValueError(
                "`table` has to be a lancedb.db.LanceTable or "
                "lancedb.remote.table.RemoteTable object."
            )
        self._table = table
        self._table_name = getattr(table, "name", "remote_table")
    else:
        self._table = self.get_table(table_name, set_default=True)
def results_to_docs(self, results: Any, score: bool = False) -> Any:
    """Turn query results into Documents, optionally paired with a score.

    Args:
        results: Query result exposing ``schema.names``, ``len()`` and
            column access by name.
        score: When true and a score column (``_distance`` or
            ``_relevance_score``) is present, return ``(Document, score)``
            tuples instead of bare Documents.

    Returns:
        A list of Documents, or a list of ``(Document, score)`` tuples.
    """
    available = results.schema.names
    score_col = None
    # ``_distance`` takes precedence over ``_relevance_score``.
    for candidate in ("_distance", "_relevance_score"):
        if candidate in available:
            score_col = candidate
            break

    def make_document(row: int) -> Document:
        return Document(
            page_content=results[self._text_key][row].as_py(),
            metadata=results["metadata"][row].as_py(),
        )

    documents = [make_document(row) for row in range(len(results))]
    if score_col is None or not score:
        return documents
    return [
        (document, results[score_col][row].as_py())
        for row, document in enumerate(documents)
    ]
@property
def embeddings(self) -> Optional[Embeddings]:
return self._embedding
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Turn texts into embedding and add it to the database
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
ids: Optional list of ids to associate with the texts.
ids: Optional list of ids to associate with the texts.
Returns:
List of ids of the added texts.
"""
docs = []
ids = ids or [str(uuid.uuid4()) for _ in texts]
embeddings = self._embedding.embed_documents(list(texts)) # type: ignore
for idx, text in enumerate(texts):
embedding = embeddings[idx]
metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
docs.append(
{
self._vector_key: embedding,
self._id_key: ids[idx],
self._text_key: text,
"metadata": metadata,
}
)
tbl = self.get_table()
if tbl is None:
tbl = self._connection.create_table(self._table_name, data=docs)
self._table = tbl
else:
if self.api_key is None:
tbl.add(docs, mode=self.mode)
else:
tbl.add(docs)
self._fts_index = None
return ids
def get_table(
self, name: Optional[str] = None, set_default: Optional[bool] = False
) -> Any:
"""
Fetches a table object from the database.
Args:
name (str, optional): The name of the table to fetch. Defaults to None
and fetches current table object.
set_default (bool, optional): Sets fetched table as the default table.
Defaults to False.
Returns:
Any: The fetched table object.
Raises:
ValueError: If the specified table is not found in the database.
"""
if name is not None:
if set_default:
self._table_name = name
_name = self._table_name
else:
_name = name
else:
_name = self._table_name
try:
return self._connection.open_table(_name)
except Exception:
return None
def create_index(
self,
col_name: Optional[str] = None,
vector_col: Optional[str] = None,
num_partitions: Optional[int] = 256,
num_sub_vectors: Optional[int] = 96,
index_cache_size: Optional[int] = None,
metric: Optional[str] = "L2",
name: Optional[str] = None,
) -> None:
"""
Create a scalar(for non-vector cols) or a vector index on a table.
Make sure your vector column has enough data before creating an index on it.
Args:
vector_col: Provide if you want to create index on a vector column.
col_name: Provide if you want to create index on a non-vector column.
metric: Provide the metric to use for vector index. Defaults to 'L2'
choice of metrics: 'L2', 'dot', 'cosine'
num_partitions: Number of partitions to use for the index. Defaults to 256.
num_sub_vectors: Number of sub-vectors to use for the index. Defaults to 96.
index_cache_size: Size of the index cache. Defaults to None.
name: Name of the table to create index on. Defaults to None.
Returns:
None
"""
tbl = self.get_table(name)
if vector_col:
tbl.create_index(
metric=metric,
vector_column_name=vector_col,
num_partitions=num_partitions,
num_sub_vectors=num_sub_vectors,
index_cache_size=index_cache_size,
)
elif col_name:
tbl.create_scalar_index(col_name)
else:
raise ValueError("Provide either vector_col or col_name")
def encode_image(self, uri: str) -> str:
    """Read the file at ``uri`` and return its bytes as a base64 string."""
    with open(uri, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def add_images(
self,
uris: List[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more images through the embeddings and add to the vectorstore.
Args:
uris List[str]: File path to the image.
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
ids (Optional[List[str]], optional): Optional list of IDs.
Returns:
List[str]: List of IDs of the added images.
"""
tbl = self.get_table()
# Map from uris to b64 encoded strings
b64_texts = [self.encode_image(uri=uri) for uri in uris]
# Populate IDs
if ids is None:
ids = [str(uuid.uuid4()) for _ in uris]
embeddings = None
# Set embeddings
if self._embedding is not None and hasattr(self._embedding, "embed_image"):
embeddings = self._embedding.embed_image(uris=uris)
else:
raise ValueError(
"embedding object should be provided and must have embed_image method."
)
data = []
for idx, emb in enumerate(embeddings):
metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
data.append(
{
self._vector_key: emb,
self._id_key: ids[idx],
self._text_key: b64_texts[idx],
"metadata": metadata,
}
)
if tbl is None:
tbl = self._connection.create_table(self._table_name, data=data)
self._table = tbl
else:
tbl.add(data)
return ids
def _query(
self,
query: Any,
k: Optional[int] = None,
filter: Optional[Any] = None,
name: Optional[str] = None,
**kwargs: Any,
) -> Any:
if k is None:
k = self.limit
tbl = self.get_table(name)
if isinstance(filter, dict):
filter = to_lance_filter(filter)
prefilter = kwargs.get("prefilter", False)
query_type = kwargs.get("query_type", "vector")
lance_query = (
tbl.search(query=query, vector_column_name=self._vector_key)
.limit(k)
.where(filter, prefilter=prefilter)
)
if query_type == "hybrid" and self._reranker is not None:
lance_query.rerank(reranker=self._reranker)
docs = lance_query.to_arrow()
if len(docs) == 0:
warnings.warn("No results found for the query.")
return docs
def _select_relevance_score_fn(self) -> Callable[[float], float]:
"""
The 'correct' relevance function
may differ depending on a few things, including:
- the distance / similarity metric used by the VectorStore
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
- embedding dimensionality
- etc.
"""
if self.override_relevance_score_fn:
return self.override_relevance_score_fn
if self.distance == "cosine":
return self._cosine_relevance_score_fn
elif self.distance == "l2":
return self._euclidean_relevance_score_fn
elif self.distance == "ip":
return self._max_inner_product_relevance_score_fn
else:
raise ValueError(
"No supported normalization function"
f" for distance metric of type: {self.distance}."
"Consider providing relevance_score_fn to Chroma constructor."
)
def similarity_search_by_vector(
self,
embedding: List[float],
k: Optional[int] = None,
filter: Optional[Dict[str, str]] = None,
name: Optional[str] = None,
**kwargs: Any,
) -> Any:
"""
Return documents most similar to the query vector.
"""
if k is None:
k = self.limit
res = self._query(embedding, k, filter=filter, name=name, **kwargs)
return self.results_to_docs(res, score=kwargs.pop("score", False))
def similarity_search_by_vector_with_relevance_scores(
self,
embedding: List[float],
k: Optional[int] = None,
filter: Optional[Dict[str, str]] = None,
name: Optional[str] = None,
**kwargs: Any,
) -> Any:
"""
Return documents most similar to the query vector with relevance scores.
"""
if k is None:
k = self.limit
relevance_score_fn = self._select_relevance_score_fn()
docs_and_scores = self.similarity_search_by_vector(
embedding, k, score=True, **kwargs
)
return [
(doc, relevance_score_fn(float(score))) for doc, score in docs_and_scores
]
def similarity_search_with_score(
self,
query: str,
k: Optional[int] = None,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> Any:
"""Return documents most similar to the query with relevance scores."""
if k is None:
k = self.limit
score = kwargs.get("score", True)
name = kwargs.get("name", None)
query_type = kwargs.get("query_type", "vector")
if self._embedding is None:
raise ValueError("search needs an emmbedding function to be specified.")
if query_type == "fts" or query_type == "hybrid":
if self.api_key is None and self._fts_index is None:
tbl = self.get_table(name)
self._fts_index = tbl.create_fts_index(self._text_key, replace=True)
if query_type == "hybrid":
embedding = self._embedding.embed_query(query)
_query = (embedding, query)
else:
_query = query # type: ignore
res = self._query(_query, k, filter=filter, name=name, **kwargs)
return self.results_to_docs(res, score=score)
else:
raise NotImplementedError(
"Full text/ Hybrid search is not supported in LanceDB Cloud yet."
)
else:
embedding = self._embedding.embed_query(query)
res = self._query(embedding, k, filter=filter, **kwargs)
return self.results_to_docs(res, score=score)
def similarity_search(
self,
query: str,
k: Optional[int] = None,
name: Optional[str] = None,
filter: Optional[Any] = None,
fts: Optional[bool] = False,
**kwargs: Any,
) -> List[Document]:
"""Return documents most similar to the query
Args:
query: String to query the vectorstore with.
k: Number of documents to return.
filter (Optional[Dict]): Optional filter arguments
sql_filter(Optional[string]): SQL filter to apply to the query.
prefilter(Optional[bool]): Whether to apply the filter prior
to the vector search.
Raises:
ValueError: If the specified table is not found in the database.
Returns:
List of documents most similar to the query.
"""
res = self.similarity_search_with_score(
query=query, k=k, name=name, filter=filter, fts=fts, score=False, **kwargs
)
return res
def max_marginal_relevance_search(
self,
query: str,
k: Optional[int] = None,
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[Dict[str, str]] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
Returns:
List of Documents selected by maximal marginal relevance.
"""
if k is None:
k = self.limit
if self._embedding is None:
raise ValueError(
"For MMR search, you must specify an embedding function on" "creation."
)
embedding = self._embedding.embed_query(query)
docs = self.max_marginal_relevance_search_by_vector(
embedding,
k,
fetch_k,
lambda_mult=lambda_mult,
filter=filter,
)
return docs
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: Optional[int] = None,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to ``self.limit``.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
        **kwargs: Forwarded to the underlying query.

    Returns:
        List of Documents selected by maximal marginal relevance, in the
        order the candidates were returned by the query.
    """
    results = self._query(
        query=embedding,
        k=fetch_k,
        filter=filter,
        **kwargs,
    )
    mmr_selected = maximal_marginal_relevance(
        np.array(embedding, dtype=np.float32),
        # Read the configured vector column rather than a hard-coded
        # "vector", so a custom ``vector_key`` works.
        results[self._vector_key].to_pylist(),
        k=k or self.limit,
        lambda_mult=lambda_mult,
    )
    # A set gives constant-time membership checks while filtering.
    chosen = set(mmr_selected)
    candidates = self.results_to_docs(results)
    return [doc for idx, doc in enumerate(candidates) if idx in chosen]
@classmethod
def from_texts(
cls: Type[LanceDB],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
connection: Optional[Any] = None,
vector_key: Optional[str] = "vector",
id_key: Optional[str] = "id",
text_key: Optional[str] = "text",
table_name: Optional[str] = "vectorstore",
api_key: Optional[str] = None,
region: Optional[str] = None,
mode: Optional[str] = "overwrite",
distance: Optional[str] = "l2",
reranker: Optional[Any] = None,
relevance_score_fn: Optional[Callable[[float], float]] = None,
**kwargs: Any,
) -> LanceDB:
instance = LanceDB(
connection=connection,
embedding=embedding,
vector_key=vector_key,
id_key=id_key,
text_key=text_key,
table_name=table_name,
api_key=api_key,
region=region,
mode=mode,
distance=distance,
reranker=reranker,
relevance_score_fn=relevance_score_fn,
**kwargs,
)
instance.add_texts(texts, metadatas=metadatas)
return instance
def delete(
self,
ids: Optional[List[str]] = None,
delete_all: Optional[bool] = None,
filter: Optional[str] = None,
drop_columns: Optional[List[str]] = None,
name: Optional[str] = None,
**kwargs: Any,
) -> None:
"""
Allows deleting rows by filtering, by ids or drop columns from the table.
Args:
filter: Provide a string SQL expression - "{col} {operation} {value}".
ids: Provide list of ids to delete from the table.
drop_columns: Provide list of columns to drop from the table.
delete_all: If True, delete all rows from the table.
"""
tbl = self.get_table(name)
if filter:
tbl.delete(filter)
elif ids:
tbl.delete("id in ('{}')".format(",".join(ids)))
elif drop_columns:
if self.api_key is not None:
raise NotImplementedError(
"Column operations currently not supported in LanceDB Cloud."
)
else:
tbl.drop_columns(drop_columns)
elif delete_all:
tbl.delete("true")
else:
raise ValueError("Provide either filter, ids, drop_columns or delete_all")