feat: Support for Tigris Vector Database for vector search (#5703)

### Changes
- New vector store integration - [Tigris](https://tigrisdata.com)
- Adds [tigrisdb](https://pypi.org/project/tigrisdb/) optional
dependency
- Example notebook demonstrating usage

Fixes #5535 
Closes tigrisdata/tigris-client-python#40

#### Twitter handles
We'd love a shoutout on our
[@TigrisData](https://twitter.com/TigrisData) and
[@adilansari](https://twitter.com/adilansari) twitter handles

#### Who can review?
@dev2049

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
Adil Ansari 2023-06-05 20:39:16 -07:00 committed by GitHub
parent 38dabdbb3a
commit 233b52735e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 831 additions and 21 deletions

View File

@ -0,0 +1,199 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Tigris\n",
"\n",
"> [Tigris](htttps://tigrisdata.com) is an open source Serverless NoSQL Database and Search Platform designed to simplify building high-performance vector search applications.\n",
"> Tigris eliminates the infrastructure complexity of managing, operating, and synchronizing multiple tools, allowing you to focus on building great applications instead."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"This notebook guides you how to use Tigris as your VectorStore"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"**Pre requisites**\n",
"1. An OpenAI account. You can sign up for an account [here](https://platform.openai.com/)\n",
"2. [Sign up for a free Tigris account](https://console.preview.tigrisdata.cloud). Once you have signed up for the Tigris account, create a new project called `vectordemo`. Next, make a note of the *Uri* for the region you've created your project in, the **clientId** and **clientSecret**. You can get all this information from the **Application Keys** section of the project."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Let's first install our dependencies:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"!pip install tigrisdb openapi-schema-pydantic openai tiktoken"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"We will load the `OpenAI` api key and `Tigris` credentials in our environment"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import os\n",
"import getpass\n",
"\n",
"os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n",
"os.environ['TIGRIS_PROJECT'] = getpass.getpass('Tigris Project Name:')\n",
"os.environ['TIGRIS_CLIENT_ID'] = getpass.getpass('Tigris Client Id:')\n",
"os.environ['TIGRIS_CLIENT_SECRET'] = getpass.getpass('Tigris Client Secret:')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import Tigris\n",
"from langchain.document_loaders import TextLoader"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Initialize Tigris vector store\n",
"Let's import our test dataset:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"loader = TextLoader('../../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"vector_store = Tigris.from_documents(docs, embeddings, index_name=\"my_embeddings\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Similarity Search"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"found_docs = vector_store.similarity_search(query)\n",
"print(found_docs)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Similarity Search with score (vector distance)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"result = vector_store.similarity_search_with_score(query)\n",
"for (doc, score) in result:\n",
" print(f\"document={doc}, score={score}\")"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -20,6 +20,7 @@ from langchain.vectorstores.redis import Redis
from langchain.vectorstores.sklearn import SKLearnVectorStore from langchain.vectorstores.sklearn import SKLearnVectorStore
from langchain.vectorstores.supabase import SupabaseVectorStore from langchain.vectorstores.supabase import SupabaseVectorStore
from langchain.vectorstores.tair import Tair from langchain.vectorstores.tair import Tair
from langchain.vectorstores.tigris import Tigris
from langchain.vectorstores.typesense import Typesense from langchain.vectorstores.typesense import Typesense
from langchain.vectorstores.vectara import Vectara from langchain.vectorstores.vectara import Vectara
from langchain.vectorstores.weaviate import Weaviate from langchain.vectorstores.weaviate import Weaviate
@ -54,4 +55,5 @@ __all__ = [
"Typesense", "Typesense",
"Clickhouse", "Clickhouse",
"ClickhouseSettings", "ClickhouseSettings",
"Tigris",
] ]

View File

@ -0,0 +1,142 @@
from __future__ import annotations
import itertools
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple
from langchain.embeddings.base import Embeddings
from langchain.schema import Document
from langchain.vectorstores import VectorStore
if TYPE_CHECKING:
from tigrisdb import TigrisClient
from tigrisdb import VectorStore as TigrisVectorStore
from tigrisdb.types.filters import Filter as TigrisFilter
from tigrisdb.types.vector import Document as TigrisDocument
class Tigris(VectorStore):
def __init__(self, client: TigrisClient, embeddings: Embeddings, index_name: str):
"""Initialize Tigris vector store"""
try:
import tigrisdb # noqa: F401
except ImportError:
raise ValueError(
"Could not import tigrisdb python package. "
"Please install it with `pip install tigrisdb`"
)
self._embed_fn = embeddings
self._vector_store = TigrisVectorStore(client.get_search(), index_name)
@property
def search_index(self) -> TigrisVectorStore:
return self._vector_store
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
ids: Optional list of ids for documents.
Ids will be autogenerated if not provided.
kwargs: vectorstore specific parameters
Returns:
List of ids from adding the texts into the vectorstore.
"""
docs = self._prep_docs(texts, metadatas, ids)
result = self.search_index.add_documents(docs)
return [r.id for r in result]
def similarity_search(
self,
query: str,
k: int = 4,
filter: Optional[TigrisFilter] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to query."""
docs_with_scores = self.similarity_search_with_score(query, k, filter)
return [doc for doc, _ in docs_with_scores]
def similarity_search_with_score(
self,
query: str,
k: int = 4,
filter: Optional[TigrisFilter] = None,
) -> List[Tuple[Document, float]]:
"""Run similarity search with Chroma with distance.
Args:
query (str): Query text to search for.
k (int): Number of results to return. Defaults to 4.
filter (Optional[TigrisFilter]): Filter by metadata. Defaults to None.
Returns:
List[Tuple[Document, float]]: List of documents most similar to the query
text with distance in float.
"""
vector = self._embed_fn.embed_query(query)
result = self.search_index.similarity_search(
vector=vector, k=k, filter_by=filter
)
docs: List[Tuple[Document, float]] = []
for r in result:
docs.append(
(
Document(
page_content=r.doc["text"], metadata=r.doc.get("metadata")
),
r.score,
)
)
return docs
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
client: Optional[TigrisClient] = None,
index_name: Optional[str] = None,
**kwargs: Any,
) -> Tigris:
"""Return VectorStore initialized from texts and embeddings."""
if not index_name:
raise ValueError("`index_name` is required")
if not client:
client = TigrisClient()
store = cls(client, embedding, index_name)
store.add_texts(texts=texts, metadatas=metadatas, ids=ids)
return store
def _prep_docs(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]],
ids: Optional[List[str]],
) -> List[TigrisDocument]:
embeddings: List[List[float]] = self._embed_fn.embed_documents(list(texts))
docs: List[TigrisDocument] = []
for t, m, e, _id in itertools.zip_longest(
texts, metadatas or [], embeddings or [], ids or []
):
doc: TigrisDocument = {
"text": t,
"embeddings": e or [],
"metadata": m or {},
}
if _id:
doc["id"] = _id
docs.append(doc)
return docs

505
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -103,6 +103,7 @@ py-trello = {version = "^0.19.0", optional = true}
momento = {version = "^1.5.0", optional = true} momento = {version = "^1.5.0", optional = true}
bibtexparser = {version = "^1.4.0", optional = true} bibtexparser = {version = "^1.4.0", optional = true}
pyspark = {version = "^3.4.0", optional = true} pyspark = {version = "^3.4.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true}
[tool.poetry.group.docs.dependencies] [tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0" autodoc_pydantic = "^1.8.0"
@ -278,7 +279,8 @@ all = [
"azure-ai-formrecognizer", "azure-ai-formrecognizer",
"azure-ai-vision", "azure-ai-vision",
"azure-cognitiveservices-speech", "azure-cognitiveservices-speech",
"momento" "momento",
"tigrisdb"
] ]
# An extra used to be able to add extended testing. # An extra used to be able to add extended testing.