Mirror of https://github.com/hwchase17/langchain, synced 2024-11-18 09:25:54 +00:00
feat: Support for Tigris Vector Database for vector search (#5703)
### Changes

- New vector store integration - [Tigris](https://tigrisdata.com)
- Adds the [tigrisdb](https://pypi.org/project/tigrisdb/) optional dependency
- Example notebook demonstrating usage

Fixes #5535
Closes tigrisdata/tigris-client-python#40

#### Twitter handles

We'd love a shoutout on our [@TigrisData](https://twitter.com/TigrisData) and [@adilansari](https://twitter.com/adilansari) Twitter handles.

#### Who can review?

@dev2049

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
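For a quick sense of the API this PR adds, here is a minimal usage sketch based on the example notebook below. It assumes the `OPENAI_API_KEY`, `TIGRIS_PROJECT`, `TIGRIS_CLIENT_ID`, and `TIGRIS_CLIENT_SECRET` environment variables are already set; the index name `my_embeddings` is only illustrative.

```python
# Minimal sketch of the new integration; mirrors the example notebook in this PR.
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Tigris

texts = ["Tigris is a serverless NoSQL database and search platform."]

# Embeds the texts with OpenAI and stores them in a Tigris search index.
vector_store = Tigris.from_texts(
    texts,
    OpenAIEmbeddings(),
    index_name="my_embeddings",  # illustrative index name
)

# Embeds the query and returns the most similar stored documents.
print(vector_store.similarity_search("What is Tigris?", k=1))
```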
This commit is contained in:
parent 38dabdbb3a
commit 233b52735e
199 docs/modules/indexes/vectorstores/examples/tigris.ipynb Normal file
@ -0,0 +1,199 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Tigris\n",
    "\n",
    "> [Tigris](https://tigrisdata.com) is an open source Serverless NoSQL Database and Search Platform designed to simplify building high-performance vector search applications.\n",
    "> Tigris eliminates the infrastructure complexity of managing, operating, and synchronizing multiple tools, allowing you to focus on building great applications instead."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "This notebook guides you through using Tigris as your VectorStore."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "**Prerequisites**\n",
    "1. An OpenAI account. You can sign up for an account [here](https://platform.openai.com/).\n",
    "2. [Sign up for a free Tigris account](https://console.preview.tigrisdata.cloud). Once you have signed up for a Tigris account, create a new project called `vectordemo`. Next, make a note of the *Uri* for the region you've created your project in, the **clientId**, and the **clientSecret**. You can get all of this information from the **Application Keys** section of the project."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "Let's first install our dependencies:"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "!pip install tigrisdb openapi-schema-pydantic openai tiktoken"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "We will load the `OpenAI` API key and `Tigris` credentials into our environment:"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "import os\n",
    "import getpass\n",
    "\n",
    "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n",
    "os.environ['TIGRIS_PROJECT'] = getpass.getpass('Tigris Project Name:')\n",
    "os.environ['TIGRIS_CLIENT_ID'] = getpass.getpass('Tigris Client Id:')\n",
    "os.environ['TIGRIS_CLIENT_SECRET'] = getpass.getpass('Tigris Client Secret:')"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.vectorstores import Tigris\n",
    "from langchain.document_loaders import TextLoader"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Initialize Tigris vector store\n",
    "Let's import our test dataset:"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "loader = TextLoader('../../../state_of_the_union.txt')\n",
    "documents = loader.load()\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
    "docs = text_splitter.split_documents(documents)\n",
    "\n",
    "embeddings = OpenAIEmbeddings()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "vector_store = Tigris.from_documents(docs, embeddings, index_name=\"my_embeddings\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Similarity Search"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
    "found_docs = vector_store.similarity_search(query)\n",
    "print(found_docs)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Similarity Search with score (vector distance)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
    "result = vector_store.similarity_search_with_score(query)\n",
    "for (doc, score) in result:\n",
    "    print(f\"document={doc}, score={score}\")"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
langchain/vectorstores/__init__.py
@ -20,6 +20,7 @@ from langchain.vectorstores.redis import Redis
 from langchain.vectorstores.sklearn import SKLearnVectorStore
 from langchain.vectorstores.supabase import SupabaseVectorStore
 from langchain.vectorstores.tair import Tair
+from langchain.vectorstores.tigris import Tigris
 from langchain.vectorstores.typesense import Typesense
 from langchain.vectorstores.vectara import Vectara
 from langchain.vectorstores.weaviate import Weaviate
@ -54,4 +55,5 @@ __all__ = [
     "Typesense",
     "Clickhouse",
     "ClickhouseSettings",
+    "Tigris",
 ]
142 langchain/vectorstores/tigris.py Normal file
@ -0,0 +1,142 @@
from __future__ import annotations

import itertools
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple

from langchain.embeddings.base import Embeddings
from langchain.schema import Document
from langchain.vectorstores import VectorStore

if TYPE_CHECKING:
    from tigrisdb import TigrisClient
    from tigrisdb import VectorStore as TigrisVectorStore
    from tigrisdb.types.filters import Filter as TigrisFilter
    from tigrisdb.types.vector import Document as TigrisDocument


class Tigris(VectorStore):
    def __init__(self, client: TigrisClient, embeddings: Embeddings, index_name: str):
        """Initialize Tigris vector store."""
        try:
            import tigrisdb  # noqa: F401
        except ImportError:
            raise ValueError(
                "Could not import tigrisdb python package. "
                "Please install it with `pip install tigrisdb`"
            )

        self._embed_fn = embeddings
        self._vector_store = TigrisVectorStore(client.get_search(), index_name)

    @property
    def search_index(self) -> TigrisVectorStore:
        return self._vector_store

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids for documents.
                Ids will be autogenerated if not provided.
            kwargs: vectorstore specific parameters

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        docs = self._prep_docs(texts, metadatas, ids)
        result = self.search_index.add_documents(docs)
        return [r.id for r in result]

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[TigrisFilter] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query."""
        docs_with_scores = self.similarity_search_with_score(query, k, filter)
        return [doc for doc, _ in docs_with_scores]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[TigrisFilter] = None,
    ) -> List[Tuple[Document, float]]:
        """Run similarity search with Tigris with distance.

        Args:
            query (str): Query text to search for.
            k (int): Number of results to return. Defaults to 4.
            filter (Optional[TigrisFilter]): Filter by metadata. Defaults to None.

        Returns:
            List[Tuple[Document, float]]: List of documents most similar to the query
                text with distance in float.
        """
        vector = self._embed_fn.embed_query(query)
        result = self.search_index.similarity_search(
            vector=vector, k=k, filter_by=filter
        )
        docs: List[Tuple[Document, float]] = []
        for r in result:
            docs.append(
                (
                    Document(
                        page_content=r.doc["text"], metadata=r.doc.get("metadata")
                    ),
                    r.score,
                )
            )
        return docs

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        client: Optional[TigrisClient] = None,
        index_name: Optional[str] = None,
        **kwargs: Any,
    ) -> Tigris:
        """Return VectorStore initialized from texts and embeddings."""
        if not index_name:
            raise ValueError("`index_name` is required")

        if not client:
            client = TigrisClient()
        store = cls(client, embedding, index_name)
        store.add_texts(texts=texts, metadatas=metadatas, ids=ids)
        return store

    def _prep_docs(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]],
        ids: Optional[List[str]],
    ) -> List[TigrisDocument]:
        embeddings: List[List[float]] = self._embed_fn.embed_documents(list(texts))
        docs: List[TigrisDocument] = []
        for t, m, e, _id in itertools.zip_longest(
            texts, metadatas or [], embeddings or [], ids or []
        ):
            doc: TigrisDocument = {
                "text": t,
                "embeddings": e or [],
                "metadata": m or {},
            }
            if _id:
                doc["id"] = _id
            docs.append(doc)
        return docs
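The example notebook calls `Tigris.from_documents`, which is inherited from the `VectorStore` base class and delegates to the `from_texts` classmethod above. For reference, a rough sketch of wiring the class up directly; the assumption that `TigrisClient()` picks up credentials from the environment follows from the notebook, which only exports the `TIGRIS_*` variables before calling `from_documents`:

```python
# Rough sketch: constructing the store directly instead of via from_texts().
# Assumes TigrisClient() reads TIGRIS_PROJECT / TIGRIS_CLIENT_ID /
# TIGRIS_CLIENT_SECRET from the environment, as the notebook implies.
from tigrisdb import TigrisClient

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Tigris

store = Tigris(
    client=TigrisClient(),
    embeddings=OpenAIEmbeddings(),
    index_name="my_embeddings",  # illustrative index name
)

# add_texts() embeds the texts and writes them as documents to the search index.
ids = store.add_texts(
    ["hello tigris"],
    metadatas=[{"source": "demo"}],  # illustrative metadata
)

# similarity_search_with_score() returns (Document, score) pairs.
for doc, score in store.similarity_search_with_score("hello", k=1):
    print(doc.page_content, score)
```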
505 poetry.lock generated
File diff suppressed because it is too large.
pyproject.toml
@ -103,6 +103,7 @@ py-trello = {version = "^0.19.0", optional = true}
 momento = {version = "^1.5.0", optional = true}
 bibtexparser = {version = "^1.4.0", optional = true}
 pyspark = {version = "^3.4.0", optional = true}
+tigrisdb = {version = "^1.0.0b6", optional = true}
 
 [tool.poetry.group.docs.dependencies]
 autodoc_pydantic = "^1.8.0"
@ -278,7 +279,8 @@ all = [
     "azure-ai-formrecognizer",
     "azure-ai-vision",
     "azure-cognitiveservices-speech",
-    "momento"
+    "momento",
+    "tigrisdb"
 ]
 
 # An extra used to be able to add extended testing.