FEAT: Merge TileDB vecstore (#12811)

@@ -0,0 +1,178 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "25bce5eb-8599-40fe-947e-4932cfae8184",
"metadata": {},
"source": [
"# TileDB\n",
"\n",
"> [TileDB](https://github.com/TileDB-Inc/TileDB) is a powerful engine for indexing and querying dense and sparse multi-dimensional arrays.\n",
"\n",
"> TileDB offers ANN search capabilities using the [TileDB-Vector-Search](https://github.com/TileDB-Inc/TileDB-Vector-Search) module. It provides serverless execution of ANN queries and storage of vector indexes both on local disk and cloud object stores (i.e. AWS S3).\n",
"\n",
"More details in:\n",
"- [Why TileDB as a Vector Database](https://tiledb.com/blog/why-tiledb-as-a-vector-database)\n",
"- [TileDB 101: Vector Search](https://tiledb.com/blog/tiledb-101-vector-search)\n",
"\n",
"This notebook shows how to use the `TileDB` vector database."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f45f46f2-7229-4859-9797-30bbead1b8e0",
"metadata": {},
"outputs": [],
"source": [
"!pip install tiledb-vector-search"
]
},
{
"cell_type": "markdown",
"id": "2f65caa9-8383-409a-bccb-6e91fc8d5e8f",
"metadata": {},
"source": [
"## Basic Example"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c96d4fe0",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import TextLoader\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import TileDB\n",
"\n",
"raw_documents = TextLoader(\"../../modules/state_of_the_union.txt\").load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"documents = text_splitter.split_documents(raw_documents)\n",
"embeddings = HuggingFaceEmbeddings()\n",
"db = TileDB.from_documents(\n",
" documents, embeddings, index_uri=\"/tmp/tiledb_index\", index_type=\"FLAT\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0a6797c-2bb0-45db-a636-5d2437f7a4c0",
"metadata": {},
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = db.similarity_search(query)\n",
"docs[0].page_content"
]
},
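{
"cell_type": "markdown",
"id": "b7a4cfbc-1f1d-4a0a-9c3e-8a2e5c1d9f10",
"metadata": {},
"source": [
"Since the index is persisted at `index_uri`, it can be reopened later with `TileDB.load`. A minimal sketch, assuming the index created above still exists at `/tmp/tiledb_index`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c2e1d4f-5a6b-4c7d-8e9f-0a1b2c3d4e5f",
"metadata": {},
"outputs": [],
"source": [
"# Reopen the persisted index; the embedding function must match the one used at build time.\n",
"db = TileDB.load(index_uri=\"/tmp/tiledb_index\", embedding=embeddings)\n",
"db.similarity_search(query, k=2)"
]
},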
{
"cell_type": "markdown",
"id": "c4c4e06d-6def-44ce-ac9a-4c01673c29a2",
"metadata": {},
"source": [
"### Similarity search by vector"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1eb72610-d451-4158-880c-9f0d45fa5909",
"metadata": {},
"outputs": [],
"source": [
"embedding_vector = embeddings.embed_query(query)\n",
"docs = db.similarity_search_by_vector(embedding_vector)\n",
"docs[0].page_content"
]
},
{
"cell_type": "markdown",
"id": "d33588d4-67c2-4bd3-b251-76ae783cbafb",
"metadata": {},
"source": [
"### Similarity search with score"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a41e382-0336-4e6d-b2ef-44cc77db2696",
"metadata": {},
"outputs": [],
"source": [
"docs_and_scores = db.similarity_search_with_score(query)\n",
"docs_and_scores[0]"
]
},
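{
"cell_type": "markdown",
"id": "5e8f2a3b-4c6d-4e7f-9a0b-1c2d3e4f5a6b",
"metadata": {},
"source": [
"Scores are distances, so lower means more similar. Results can also be capped with the `score_threshold` keyword argument, which drops documents whose distance exceeds the threshold (the cutoff below is illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d0e1f2a-3b4c-4d5e-9f0a-1b2c3d4e5f6a",
"metadata": {},
"outputs": [],
"source": [
"docs_and_scores = db.similarity_search_with_score(query, k=4, score_threshold=1.5)\n",
"[(doc.page_content[:50], score) for doc, score in docs_and_scores]"
]
},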
{
"cell_type": "markdown",
"id": "57f930f2-41a0-4795-ad9e-44a33c8f88ec",
"metadata": {},
"source": [
"## Maximal Marginal Relevance Search (MMR)"
]
},
{
"cell_type": "markdown",
"id": "4790e437-3207-45cb-b121-d857ab5aabd8",
"metadata": {},
"source": [
"In addition to using similarity search in the retriever object, you can also use `mmr` as retriever."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "495754b1-5cdb-4af6-9733-f68700bb7232",
"metadata": {},
"outputs": [],
"source": [
"retriever = db.as_retriever(search_type=\"mmr\")\n",
"retriever.get_relevant_documents(query)"
]
},
{
"cell_type": "markdown",
"id": "e213d957-e439-4bd6-90f2-8909323f5f09",
"metadata": {},
"source": [
"Or use `max_marginal_relevance_search` directly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99d928d0-3b79-4588-925e-32230e12af47",
"metadata": {},
"outputs": [],
"source": [
"db.max_marginal_relevance_search(query, k=2, fetch_k=10)"
]
}
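,
{
"cell_type": "markdown",
"id": "0f1a2b3c-4d5e-4f6a-8b9c-0d1e2f3a4b5c",
"metadata": {},
"source": [
"## Updating and deleting documents\n",
"\n",
"The store also supports incremental updates: `add_texts` writes new vectors and documents, `delete` removes them by id, and `consolidate_updates` merges the accumulated changes for faster queries. A minimal sketch, assuming the `db` built above (the id is illustrative; ids must be numeric strings since they are stored as uint64):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a7b8c9d-0e1f-4a2b-8c3d-4e5f6a7b8c9d",
"metadata": {},
"outputs": [],
"source": [
"new_ids = db.add_texts([\"a brand new document\"], ids=[\"100\"])\n",
"print(db.similarity_search(\"a brand new document\", k=1))\n",
"db.delete(new_ids)\n",
"db.consolidate_updates()"
]
}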
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -350,6 +350,12 @@ def _import_tencentvectordb() -> Any:
return TencentVectorDB
def _import_tiledb() -> Any:
from langchain.vectorstores.tiledb import TileDB
return TileDB
def _import_tigris() -> Any:
from langchain.vectorstores.tigris import Tigris
@@ -517,6 +523,8 @@ def __getattr__(name: str) -> Any:
return _import_tair()
elif name == "TencentVectorDB":
return _import_tencentvectordb()
elif name == "TileDB":
return _import_tiledb()
elif name == "Tigris":
return _import_tigris()
elif name == "TimescaleVector":
@@ -594,6 +602,7 @@ __all__ = [
"StarRocks",
"SupabaseVectorStore",
"Tair",
"TileDB",
"Tigris",
"TimescaleVector",
"Typesense",

@@ -0,0 +1,789 @@
"""Wrapper around TileDB vector database."""
from __future__ import annotations
import pickle
import random
import sys
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
import numpy as np
from langchain.docstore.document import Document
from langchain.schema.embeddings import Embeddings
from langchain.schema.vectorstore import VectorStore
from langchain.vectorstores.utils import maximal_marginal_relevance
INDEX_METRICS = frozenset(["euclidean"])
DEFAULT_METRIC = "euclidean"
DOCUMENTS_ARRAY_NAME = "documents"
VECTOR_INDEX_NAME = "vectors"
MAX_UINT64 = np.iinfo(np.dtype("uint64")).max
MAX_FLOAT_32 = np.finfo(np.dtype("float32")).max
MAX_FLOAT = sys.float_info.max
def dependable_tiledb_import() -> Any:
"""Import tiledb-vector-search if available, otherwise raise error."""
try:
import tiledb
import tiledb.vector_search as tiledb_vs
except ImportError:
raise ValueError(
"Could not import tiledb-vector-search python package. "
"Please install it with `conda install -c tiledb tiledb-vector-search` "
"or `pip install tiledb-vector-search`"
)
return tiledb_vs, tiledb
def get_vector_index_uri_from_group(group: Any) -> str:
return group[VECTOR_INDEX_NAME].uri
def get_documents_array_uri_from_group(group: Any) -> str:
return group[DOCUMENTS_ARRAY_NAME].uri
def get_vector_index_uri(uri: str) -> str:
return f"{uri}/{VECTOR_INDEX_NAME}"
def get_documents_array_uri(uri: str) -> str:
return f"{uri}/{DOCUMENTS_ARRAY_NAME}"
class TileDB(VectorStore):
"""Wrapper around TileDB vector database.
To use, you should have the ``tiledb-vector-search`` python package installed.
Example:
.. code-block:: python
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import TileDB
embeddings = OpenAIEmbeddings()
db = TileDB(embeddings, index_uri, metric)
"""
def __init__(
self,
embedding: Embeddings,
index_uri: str,
metric: str,
*,
vector_index_uri: str = "",
docs_array_uri: str = "",
config: Optional[Mapping[str, Any]] = None,
timestamp: Any = None,
**kwargs: Any,
):
"""Initialize with necessary components."""
self.embedding = embedding
self.embedding_function = embedding.embed_query
self.index_uri = index_uri
self.metric = metric
self.config = config
tiledb_vs, tiledb = dependable_tiledb_import()
with tiledb.scope_ctx(ctx_or_config=config):
index_group = tiledb.Group(self.index_uri, "r")
self.vector_index_uri = (
vector_index_uri
if vector_index_uri != ""
else get_vector_index_uri_from_group(index_group)
)
self.docs_array_uri = (
docs_array_uri
if docs_array_uri != ""
else get_documents_array_uri_from_group(index_group)
)
index_group.close()
group = tiledb.Group(self.vector_index_uri, "r")
self.index_type = group.meta.get("index_type")
group.close()
self.timestamp = timestamp
if self.index_type == "FLAT":
self.vector_index = tiledb_vs.flat_index.FlatIndex(
uri=self.vector_index_uri,
config=self.config,
timestamp=self.timestamp,
**kwargs,
)
elif self.index_type == "IVF_FLAT":
self.vector_index = tiledb_vs.ivf_flat_index.IVFFlatIndex(
uri=self.vector_index_uri,
config=self.config,
timestamp=self.timestamp,
**kwargs,
)
@property
def embeddings(self) -> Optional[Embeddings]:
return self.embedding
def process_index_results(
self,
ids: List[int],
scores: List[float],
*,
k: int = 4,
filter: Optional[Dict[str, Any]] = None,
score_threshold: float = MAX_FLOAT,
) -> List[Tuple[Document, float]]:
"""Turns TileDB results into a list of documents and scores.
Args:
ids: List of indices of the documents in the index.
scores: List of distances of the documents in the index.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
score_threshold: Optional, a floating point value to filter the
resulting set of retrieved docs
Returns:
List of Documents and scores.
"""
tiledb_vs, tiledb = dependable_tiledb_import()
docs = []
docs_array = tiledb.open(
self.docs_array_uri, "r", timestamp=self.timestamp, config=self.config
)
for idx, score in zip(ids, scores):
if idx == 0 and score == 0:
continue
if idx == MAX_UINT64 and score == MAX_FLOAT_32:
continue
doc = docs_array[idx]
if doc is None or len(doc["text"]) == 0:
raise ValueError(f"Could not find document for id {idx}, got {doc}")
pickled_metadata = doc.get("metadata")
result_doc = Document(page_content=str(doc["text"][0]))
if pickled_metadata is not None:
metadata = pickle.loads(
np.array(pickled_metadata.tolist()).astype(np.uint8).tobytes()
)
result_doc.metadata = metadata
if filter is not None:
filter = {
key: [value] if not isinstance(value, list) else value
for key, value in filter.items()
}
if all(
result_doc.metadata.get(key) in value
for key, value in filter.items()
):
docs.append((result_doc, score))
else:
docs.append((result_doc, score))
docs_array.close()
docs = [(doc, score) for doc, score in docs if score <= score_threshold]
return docs[:k]
def similarity_search_with_score_by_vector(
self,
embedding: List[float],
*,
k: int = 4,
filter: Optional[Dict[str, Any]] = None,
fetch_k: int = 20,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
embedding: Embedding vector to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
Defaults to 20.
**kwargs: kwargs to be passed to similarity search. Can include:
nprobe: Optional, number of partitions to check if using IVF_FLAT index
score_threshold: Optional, a floating point value to filter the
resulting set of retrieved docs
Returns:
List of documents most similar to the query text and distance
in float for each. A lower score represents higher similarity.
"""
if "score_threshold" in kwargs:
score_threshold = kwargs.pop("score_threshold")
else:
score_threshold = MAX_FLOAT
d, i = self.vector_index.query(
np.array([embedding], dtype=np.float32),
k=k if filter is None else fetch_k,
**kwargs,
)
return self.process_index_results(
ids=i[0], scores=d[0], filter=filter, k=k, score_threshold=score_threshold
)
def similarity_search_with_score(
self,
query: str,
*,
k: int = 4,
filter: Optional[Dict[str, Any]] = None,
fetch_k: int = 20,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
Defaults to 20.
Returns:
List of documents most similar to the query text and distance
in float for each. A lower score represents higher similarity.
"""
embedding = self.embedding_function(query)
docs = self.similarity_search_with_score_by_vector(
embedding,
k=k,
filter=filter,
fetch_k=fetch_k,
**kwargs,
)
return docs
def similarity_search_by_vector(
self,
embedding: List[float],
k: int = 4,
filter: Optional[Dict[str, Any]] = None,
fetch_k: int = 20,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
Defaults to 20.
Returns:
List of Documents most similar to the embedding.
"""
docs_and_scores = self.similarity_search_with_score_by_vector(
embedding,
k=k,
filter=filter,
fetch_k=fetch_k,
**kwargs,
)
return [doc for doc, _ in docs_and_scores]
def similarity_search(
self,
query: str,
k: int = 4,
filter: Optional[Dict[str, Any]] = None,
fetch_k: int = 20,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
Defaults to 20.
Returns:
List of Documents most similar to the query.
"""
docs_and_scores = self.similarity_search_with_score(
query, k=k, filter=filter, fetch_k=fetch_k, **kwargs
)
return [doc for doc, _ in docs_and_scores]
def max_marginal_relevance_search_with_score_by_vector(
self,
embedding: List[float],
*,
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs and their similarity scores selected using the maximal marginal
relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch before filtering to
pass to MMR algorithm.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
Returns:
List of Documents and similarity scores selected by maximal marginal
relevance and score for each.
"""
if "score_threshold" in kwargs:
score_threshold = kwargs.pop("score_threshold")
else:
score_threshold = MAX_FLOAT
scores, indices = self.vector_index.query(
np.array([embedding], dtype=np.float32),
k=fetch_k if filter is None else fetch_k * 2,
**kwargs,
)
results = self.process_index_results(
ids=indices[0],
scores=scores[0],
filter=filter,
k=fetch_k if filter is None else fetch_k * 2,
score_threshold=score_threshold,
)
embeddings = [
self.embedding.embed_documents([doc.page_content])[0] for doc, _ in results
]
mmr_selected = maximal_marginal_relevance(
np.array([embedding], dtype=np.float32),
embeddings,
k=k,
lambda_mult=lambda_mult,
)
return [results[i] for i in mmr_selected]
def max_marginal_relevance_search_by_vector(
self,
embedding: List[float],
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch before filtering to
pass to MMR algorithm.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
Returns:
List of Documents selected by maximal marginal relevance.
"""
docs_and_scores = self.max_marginal_relevance_search_with_score_by_vector(
embedding,
k=k,
fetch_k=fetch_k,
lambda_mult=lambda_mult,
filter=filter,
**kwargs,
)
return [doc for doc, _ in docs_and_scores]
def max_marginal_relevance_search(
self,
query: str,
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
filter: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch before filtering (if needed) to
pass to MMR algorithm.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
Returns:
List of Documents selected by maximal marginal relevance.
"""
embedding = self.embedding_function(query)
docs = self.max_marginal_relevance_search_by_vector(
embedding,
k=k,
fetch_k=fetch_k,
lambda_mult=lambda_mult,
filter=filter,
**kwargs,
)
return docs
@classmethod
def create(
cls,
index_uri: str,
index_type: str,
dimensions: int,
vector_type: np.dtype,
*,
metadatas: bool = True,
config: Optional[Mapping[str, Any]] = None,
) -> None:
"""Create a TileDB group with a vector index and a documents array.
Args:
index_uri: The URI to create the TileDB group at.
index_type: Vector index type ("FLAT" or "IVF_FLAT").
dimensions: Number of dimensions of the vectors to be indexed.
vector_type: Numpy dtype of the vectors.
metadatas: Whether to create an attribute for Document metadata.
config: Optional, TileDB config.
"""
tiledb_vs, tiledb = dependable_tiledb_import()
with tiledb.scope_ctx(ctx_or_config=config):
tiledb.group_create(index_uri)
group = tiledb.Group(index_uri, "w")
vector_index_uri = get_vector_index_uri(group.uri)
docs_uri = get_documents_array_uri(group.uri)
if index_type == "FLAT":
tiledb_vs.flat_index.create(
uri=vector_index_uri,
dimensions=dimensions,
vector_type=vector_type,
config=config,
)
elif index_type == "IVF_FLAT":
tiledb_vs.ivf_flat_index.create(
uri=vector_index_uri,
dimensions=dimensions,
vector_type=vector_type,
config=config,
)
group.add(vector_index_uri, name=VECTOR_INDEX_NAME)
# Create TileDB array to store Documents
# TODO add a Document store API to tiledb-vector-search to allow storing
# different types of objects and metadata in a more generic way.
dim = tiledb.Dim(
name="id",
domain=(0, MAX_UINT64 - 1),
dtype=np.dtype(np.uint64),
)
dom = tiledb.Domain(dim)
text_attr = tiledb.Attr(name="text", dtype=np.dtype("U1"), var=True)
attrs = [text_attr]
if metadatas:
metadata_attr = tiledb.Attr(name="metadata", dtype=np.uint8, var=True)
attrs.append(metadata_attr)
schema = tiledb.ArraySchema(
domain=dom,
sparse=True,
allows_duplicates=False,
attrs=attrs,
)
tiledb.Array.create(docs_uri, schema)
group.add(docs_uri, name=DOCUMENTS_ARRAY_NAME)
group.close()
@classmethod
def __from(
cls,
texts: List[str],
embeddings: List[List[float]],
embedding: Embeddings,
index_uri: str,
*,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
metric: str = DEFAULT_METRIC,
index_type: str = "FLAT",
config: Optional[Mapping[str, Any]] = None,
index_timestamp: int = 0,
**kwargs: Any,
) -> TileDB:
if metric not in INDEX_METRICS:
raise ValueError(
(
f"Unsupported distance metric: {metric}. "
f"Expected one of {list(INDEX_METRICS)}"
)
)
tiledb_vs, tiledb = dependable_tiledb_import()
if not embeddings:
raise ValueError("embeddings must be provided to build a TileDB index")
input_vectors = np.array(embeddings).astype(np.float32)
cls.create(
index_uri=index_uri,
index_type=index_type,
dimensions=input_vectors.shape[1],
vector_type=input_vectors.dtype,
metadatas=metadatas is not None,
config=config,
)
with tiledb.scope_ctx(ctx_or_config=config):
vector_index_uri = get_vector_index_uri(index_uri)
docs_uri = get_documents_array_uri(index_uri)
if ids is None:
ids = [str(random.randint(0, MAX_UINT64 - 1)) for _ in texts]
external_ids = np.array(ids).astype(np.uint64)
tiledb_vs.ingestion.ingest(
index_type=index_type,
index_uri=vector_index_uri,
input_vectors=input_vectors,
external_ids=external_ids,
index_timestamp=index_timestamp if index_timestamp != 0 else None,
config=config,
**kwargs,
)
with tiledb.open(docs_uri, "w") as A:
data = {"text": np.array(texts)}
if metadatas is not None:
metadata_attr = np.empty([len(metadatas)], dtype=object)
for i, metadata in enumerate(metadatas):
metadata_attr[i] = np.frombuffer(
pickle.dumps(metadata), dtype=np.uint8
)
data["metadata"] = metadata_attr
A[external_ids] = data
return cls(
embedding=embedding,
index_uri=index_uri,
metric=metric,
config=config,
**kwargs,
)
def delete(
self, ids: Optional[List[str]] = None, timestamp: int = 0, **kwargs: Any
) -> Optional[bool]:
"""Delete by vector ID or other criteria.
Args:
ids: List of ids to delete.
timestamp: Optional timestamp to delete with.
**kwargs: Other keyword arguments that subclasses might use.
Returns:
Optional[bool]: True if deletion is successful,
False otherwise, None if not implemented.
"""
if ids is None:
raise ValueError("ids must be provided to delete")
external_ids = np.array(ids).astype(np.uint64)
self.vector_index.delete_batch(
external_ids=external_ids, timestamp=timestamp if timestamp != 0 else None
)
return True
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
timestamp: int = 0,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
ids: Optional ids of each text object.
timestamp: Optional timestamp to write new texts with.
kwargs: vectorstore specific parameters
Returns:
List of ids from adding the texts into the vectorstore.
"""
tiledb_vs, tiledb = dependable_tiledb_import()
texts = list(texts)
embeddings = self.embedding.embed_documents(texts)
if ids is None:
ids = [str(random.randint(0, MAX_UINT64 - 1)) for _ in texts]
external_ids = np.array(ids).astype(np.uint64)
vectors = np.empty((len(embeddings)), dtype="O")
for i in range(len(embeddings)):
vectors[i] = np.array(embeddings[i], dtype=np.float32)
self.vector_index.update_batch(
vectors=vectors,
external_ids=external_ids,
timestamp=timestamp if timestamp != 0 else None,
)
docs = {"text": np.array(texts)}
if metadatas is not None:
metadata_attr = np.empty([len(metadatas)], dtype=object)
for i, metadata in enumerate(metadatas):
metadata_attr[i] = np.frombuffer(pickle.dumps(metadata), dtype=np.uint8)
docs["metadata"] = metadata_attr
docs_array = tiledb.open(
self.docs_array_uri,
"w",
timestamp=timestamp if timestamp != 0 else None,
config=self.config,
)
docs_array[external_ids] = docs
docs_array.close()
return ids
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
metric: str = DEFAULT_METRIC,
index_uri: str = "/tmp/tiledb_array",
index_type: str = "FLAT",
config: Optional[Mapping[str, Any]] = None,
index_timestamp: int = 0,
**kwargs: Any,
) -> TileDB:
"""Construct a TileDB index from raw documents.
Args:
texts: List of documents to index.
embedding: Embedding function to use.
metadatas: List of metadata dictionaries to associate with documents.
ids: Optional ids of each text object.
metric: Metric to use for indexing. Defaults to "euclidean".
index_uri: The URI to write the TileDB arrays
index_type: Optional, Vector index type ("FLAT", "IVF_FLAT")
config: Optional, TileDB config
index_timestamp: Optional, timestamp to write new texts with.
Example:
.. code-block:: python
from langchain.vectorstores import TileDB
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
index = TileDB.from_texts(texts, embeddings)
"""
embeddings = embedding.embed_documents(texts)
return cls.__from(
texts=texts,
embeddings=embeddings,
embedding=embedding,
metadatas=metadatas,
ids=ids,
metric=metric,
index_uri=index_uri,
index_type=index_type,
config=config,
index_timestamp=index_timestamp,
**kwargs,
)
@classmethod
def from_embeddings(
cls,
text_embeddings: List[Tuple[str, List[float]]],
embedding: Embeddings,
index_uri: str,
*,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
metric: str = DEFAULT_METRIC,
index_type: str = "FLAT",
config: Optional[Mapping[str, Any]] = None,
index_timestamp: int = 0,
**kwargs: Any,
) -> TileDB:
"""Construct TileDB index from embeddings.
Args:
text_embeddings: List of tuples of (text, embedding)
embedding: Embedding function to use.
index_uri: The URI to write the TileDB arrays
metadatas: List of metadata dictionaries to associate with documents.
metric: Optional, Metric to use for indexing. Defaults to "euclidean".
index_type: Optional, Vector index type ("FLAT", "IVF_FLAT")
config: Optional, TileDB config
index_timestamp: Optional, timestamp to write new texts with.
Example:
.. code-block:: python
from langchain.vectorstores import TileDB
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
text_embeddings = embeddings.embed_documents(texts)
text_embedding_pairs = list(zip(texts, text_embeddings))
db = TileDB.from_embeddings(text_embedding_pairs, embeddings)
"""
texts = [t[0] for t in text_embeddings]
embeddings = [t[1] for t in text_embeddings]
return cls.__from(
texts=texts,
embeddings=embeddings,
embedding=embedding,
metadatas=metadatas,
ids=ids,
metric=metric,
index_uri=index_uri,
index_type=index_type,
config=config,
index_timestamp=index_timestamp,
**kwargs,
)
@classmethod
def load(
cls,
index_uri: str,
embedding: Embeddings,
*,
metric: str = DEFAULT_METRIC,
config: Optional[Mapping[str, Any]] = None,
timestamp: Any = None,
**kwargs: Any,
) -> TileDB:
"""Load a TileDB index from a URI.
Args:
index_uri: The URI of the TileDB vector index.
embedding: Embeddings to use when generating queries.
metric: Optional, Metric to use for indexing. Defaults to "euclidean".
config: Optional, TileDB config
timestamp: Optional, timestamp to use for opening the arrays.
"""
return cls(
embedding=embedding,
index_uri=index_uri,
metric=metric,
config=config,
timestamp=timestamp,
**kwargs,
)
def consolidate_updates(self, **kwargs: Any) -> None:
"""Consolidate index updates for better search performance."""
self.vector_index = self.vector_index.consolidate_updates(**kwargs)
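
For reference, the index created by this wrapper is a TileDB group with two members, named by
the VECTOR_INDEX_NAME and DOCUMENTS_ARRAY_NAME constants above. A minimal sketch of inspecting
such a group, assuming an index already exists at /tmp/tiledb_index:

import tiledb

# Open the group written by TileDB.create()/from_texts() and print its member URIs.
with tiledb.scope_ctx():
    group = tiledb.Group("/tmp/tiledb_index", "r")
    print(group["vectors"].uri)  # URI of the vector index
    print(group["documents"].uri)  # URI of the documents array
    group.close()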

@@ -0,0 +1,358 @@
from pathlib import Path
import numpy as np
import pytest
from langchain.docstore.document import Document
from langchain.vectorstores.tiledb import TileDB
from tests.integration_tests.vectorstores.fake_embeddings import (
ConsistentFakeEmbeddings,
FakeEmbeddings,
)
@pytest.mark.requires("tiledb-vector-search")
def test_tiledb(tmp_path: Path) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/flat",
index_type="FLAT",
)
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/ivf_flat",
index_type="IVF_FLAT",
)
output = docsearch.similarity_search(
"foo", k=1, nprobe=docsearch.vector_index.partitions
)
assert output == [Document(page_content="foo")]
@pytest.mark.requires("tiledb-vector-search")
def test_tiledb_vector_sim(tmp_path: Path) -> None:
"""Test vector similarity."""
texts = ["foo", "bar", "baz"]
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/flat",
index_type="FLAT",
)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_by_vector(query_vec, k=1)
assert output == [Document(page_content="foo")]
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/ivf_flat",
index_type="IVF_FLAT",
)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_by_vector(
query_vec, k=1, nprobe=docsearch.vector_index.partitions
)
assert output == [Document(page_content="foo")]
@pytest.mark.requires("tiledb-vector-search")
def test_tiledb_vector_sim_with_score_threshold(tmp_path: Path) -> None:
"""Test vector similarity."""
texts = ["foo", "bar", "baz"]
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/flat",
index_type="FLAT",
)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_by_vector(query_vec, k=2, score_threshold=0.2)
assert output == [Document(page_content="foo")]
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/ivf_flat",
index_type="IVF_FLAT",
)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_by_vector(
query_vec, k=2, score_threshold=0.2, nprobe=docsearch.vector_index.partitions
)
assert output == [Document(page_content="foo")]
@pytest.mark.requires("tiledb-vector-search")
def test_similarity_search_with_score_by_vector(tmp_path: Path) -> None:
"""Test vector similarity with score by vector."""
texts = ["foo", "bar", "baz"]
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/flat",
index_type="FLAT",
)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_with_score_by_vector(query_vec, k=1)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo")
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/ivf_flat",
index_type="IVF_FLAT",
)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_with_score_by_vector(
query_vec, k=1, nprobe=docsearch.vector_index.partitions
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo")
@pytest.mark.requires("tiledb-vector-search")
def test_similarity_search_with_score_by_vector_with_score_threshold(
tmp_path: Path,
) -> None:
"""Test vector similarity with score by vector."""
texts = ["foo", "bar", "baz"]
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/flat",
index_type="FLAT",
)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_with_score_by_vector(
query_vec,
k=2,
score_threshold=0.2,
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo")
assert output[0][1] < 0.2
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/ivf_flat",
index_type="IVF_FLAT",
)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_with_score_by_vector(
query_vec, k=2, score_threshold=0.2, nprobe=docsearch.vector_index.partitions
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo")
assert output[0][1] < 0.2
@pytest.mark.requires("tiledb-vector-search")
def test_tiledb_mmr(tmp_path: Path) -> None:
texts = ["foo", "foo", "fou", "foy"]
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/flat",
index_type="FLAT",
)
query_vec = ConsistentFakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=3, lambda_mult=0.1
)
assert output[0][0] == Document(page_content="foo")
assert output[0][1] == 0.0
assert output[1][0] != Document(page_content="foo")
assert output[2][0] != Document(page_content="foo")
docsearch = TileDB.from_texts(
texts=texts,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/ivf_flat",
index_type="IVF_FLAT",
)
query_vec = ConsistentFakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=3, lambda_mult=0.1, nprobe=docsearch.vector_index.partitions
)
assert output[0][0] == Document(page_content="foo")
assert output[0][1] == 0.0
assert output[1][0] != Document(page_content="foo")
assert output[2][0] != Document(page_content="foo")
@pytest.mark.requires("tiledb-vector-search")
def test_tiledb_mmr_with_metadatas_and_filter(tmp_path: Path) -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = TileDB.from_texts(
texts=texts,
metadatas=metadatas,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/flat",
index_type="FLAT",
)
query_vec = ConsistentFakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=3, lambda_mult=0.1, filter={"page": 1}
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
assert output[0][1] == 0.0
docsearch = TileDB.from_texts(
texts=texts,
metadatas=metadatas,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/ivf_flat",
index_type="IVF_FLAT",
)
query_vec = ConsistentFakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=3,
lambda_mult=0.1,
filter={"page": 1},
nprobe=docsearch.vector_index.partitions,
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
assert output[0][1] == 0.0
@pytest.mark.requires("tiledb-vector-search")
def test_tiledb_mmr_with_metadatas_and_list_filter(tmp_path: Path) -> None:
texts = ["foo", "fou", "foy", "foo"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = TileDB.from_texts(
texts=texts,
metadatas=metadatas,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/flat",
index_type="FLAT",
)
query_vec = ConsistentFakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=3, lambda_mult=0.1, filter={"page": [0, 1, 2]}
)
assert len(output) == 3
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
assert output[2][0] != Document(page_content="foo", metadata={"page": 0})
docsearch = TileDB.from_texts(
texts=texts,
metadatas=metadatas,
embedding=ConsistentFakeEmbeddings(),
index_uri=f"{str(tmp_path)}/ivf_flat",
index_type="IVF_FLAT",
)
query_vec = ConsistentFakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=3,
lambda_mult=0.1,
filter={"page": [0, 1, 2]},
nprobe=docsearch.vector_index.partitions,
)
assert len(output) == 3
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
assert output[2][0] != Document(page_content="foo", metadata={"page": 0})
@pytest.mark.requires("tiledb-vector-search")
def test_tiledb_flat_updates(tmp_path: Path) -> None:
"""Test end to end construction and search."""
dimensions = 10
index_uri = str(tmp_path)
embedding = ConsistentFakeEmbeddings(dimensionality=dimensions)
TileDB.create(
index_uri=index_uri,
index_type="FLAT",
dimensions=dimensions,
vector_type=np.dtype("float32"),
metadatas=False,
)
docsearch = TileDB.load(
index_uri=index_uri,
embedding=embedding,
)
output = docsearch.similarity_search("foo", k=2)
assert output == []
docsearch.add_texts(texts=["foo", "bar", "baz"], ids=["1", "2", "3"])
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]
docsearch.delete(["1", "3"])
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="bar")]
output = docsearch.similarity_search("baz", k=1)
assert output == [Document(page_content="bar")]
docsearch.add_texts(texts=["fooo", "bazz"], ids=["4", "5"])
output = docsearch.similarity_search("fooo", k=1)
assert output == [Document(page_content="fooo")]
output = docsearch.similarity_search("bazz", k=1)
assert output == [Document(page_content="bazz")]
docsearch.consolidate_updates()
output = docsearch.similarity_search("fooo", k=1)
assert output == [Document(page_content="fooo")]
output = docsearch.similarity_search("bazz", k=1)
assert output == [Document(page_content="bazz")]
@pytest.mark.requires("tiledb-vector-search")
def test_tiledb_ivf_flat_updates(tmp_path: Path) -> None:
"""Test end to end construction and search."""
dimensions = 10
index_uri = str(tmp_path)
embedding = ConsistentFakeEmbeddings(dimensionality=dimensions)
TileDB.create(
index_uri=index_uri,
index_type="IVF_FLAT",
dimensions=dimensions,
vector_type=np.dtype("float32"),
metadatas=False,
)
docsearch = TileDB.load(
index_uri=index_uri,
embedding=embedding,
)
output = docsearch.similarity_search("foo", k=2)
assert output == []
docsearch.add_texts(texts=["foo", "bar", "baz"], ids=["1", "2", "3"])
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]
docsearch.delete(["1", "3"])
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="bar")]
output = docsearch.similarity_search("baz", k=1)
assert output == [Document(page_content="bar")]
docsearch.add_texts(texts=["fooo", "bazz"], ids=["4", "5"])
output = docsearch.similarity_search("fooo", k=1)
assert output == [Document(page_content="fooo")]
output = docsearch.similarity_search("bazz", k=1)
assert output == [Document(page_content="bazz")]
docsearch.consolidate_updates()
output = docsearch.similarity_search("fooo", k=1)
assert output == [Document(page_content="fooo")]
output = docsearch.similarity_search("bazz", k=1)
assert output == [Document(page_content="bazz")]

@@ -1145,6 +1145,7 @@ def test_compatible_vectorstore_documentation() -> None:
"ScaNN",
"SemaDB",
"SupabaseVectorStore",
"TileDB",
"TimescaleVector",
"Vald",
"Vearch",

@@ -52,6 +52,7 @@ _EXPECTED = [
"StarRocks",
"SupabaseVectorStore",
"Tair",
"TileDB",
"Tigris",
"TimescaleVector",
"Typesense",
