Harrison/tair (#3770)

Co-authored-by: Seth Huang <848849+seth-hg@users.noreply.github.com>
fix_agent_callbacks
Harrison Chase 1 year ago committed by GitHub
parent 502ba6a0be
commit 0c0f14407c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,22 @@
# Tair
This page covers how to use the Tair ecosystem within LangChain.
## Installation and Setup
Install Tair Python SDK with `pip install tair`.
## Wrappers
### VectorStore
There exists a wrapper around TairVector, allowing you to use it as a vectorstore,
whether for semantic search or example selection.
To import this vectorstore:
```python
from langchain.vectorstores import Tair
```
For a more detailed walkthrough of the Tair wrapper, see [this notebook](../modules/indexes/vectorstores/examples/tair.ipynb)

@ -0,0 +1,129 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tair\n",
"\n",
"This notebook shows how to use functionality related to the Tair vector database.\n",
"To run, you should have an [Tair](https://www.alibabacloud.com/help/en/tair/latest/what-is-tair) instance up and running."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings.fake import FakeEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import Tair"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = FakeEmbeddings(size=128)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Connect to Tair using the `TAIR_URL` environment variable \n",
"```\n",
"export TAIR_URL=\"redis://{username}:{password}@{tair_address}:{tair_port}\"\n",
"```\n",
"\n",
"or the keyword argument `tair_url`.\n",
"\n",
"Then store documents and embeddings into Tair."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"tair_url = \"redis://localhost:6379\"\n",
"\n",
"# drop first if index already exists\n",
"Tair.drop_index(tair_url=tair_url)\n",
"\n",
"vector_store = Tair.from_documents(\n",
" docs,\n",
" embeddings,\n",
" tair_url=tair_url\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Query similar documents."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='Were going after the criminals who stole billions in relief money meant for small businesses and millions of Americans. \\n\\nAnd tonight, Im announcing that the Justice Department will name a chief prosecutor for pandemic fraud. \\n\\nBy the end of this year, the deficit will be down to less than half what it was before I took office. \\n\\nThe only president ever to cut the deficit by more than one trillion dollars in a single year. \\n\\nLowering your costs also means demanding more competition. \\n\\nIm a capitalist, but capitalism without competition isnt capitalism. \\n\\nIts exploitation—and it drives up prices. \\n\\nWhen corporations dont have to compete, their profits go up, your prices go up, and small businesses and family farmers and ranchers go under. \\n\\nWe see it happening with ocean carriers moving goods in and out of America. \\n\\nDuring the pandemic, these foreign-owned companies raised prices by as much as 1,000% and made record profits.', metadata={'source': '../../../state_of_the_union.txt'})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = vector_store.similarity_search(query)\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

@ -15,6 +15,7 @@ from langchain.vectorstores.pinecone import Pinecone
from langchain.vectorstores.qdrant import Qdrant
from langchain.vectorstores.redis import Redis
from langchain.vectorstores.supabase import SupabaseVectorStore
from langchain.vectorstores.tair import Tair
from langchain.vectorstores.weaviate import Weaviate
from langchain.vectorstores.zilliz import Zilliz
@ -37,5 +38,6 @@ __all__ = [
"MyScaleSettings",
"SupabaseVectorStore",
"AnalyticDB",
"Tair",
"LanceDB",
]

@ -0,0 +1,286 @@
"""Wrapper around Tair Vector."""
from __future__ import annotations
import json
import logging
import uuid
from typing import Any, Iterable, List, Optional, Type
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.utils import get_from_dict_or_env
from langchain.vectorstores.base import VectorStore
logger = logging.getLogger(__name__)
def _uuid_key() -> str:
return uuid.uuid4().hex
class Tair(VectorStore):
def __init__(
self,
embedding_function: Embeddings,
url: str,
index_name: str,
content_key: str = "content",
metadata_key: str = "metadata",
search_params: Optional[dict] = None,
**kwargs: Any,
):
self.embedding_function = embedding_function
self.index_name = index_name
try:
from tair import Tair as TairClient
except ImportError:
raise ValueError(
"Could not import tair python package. "
"Please install it with `pip install tair`."
)
try:
# connect to tair from url
client = TairClient.from_url(url, **kwargs)
except ValueError as e:
raise ValueError(f"Tair failed to connect: {e}")
self.client = client
self.content_key = content_key
self.metadata_key = metadata_key
self.search_params = search_params
def create_index_if_not_exist(
self,
dim: int,
distance_type: str,
index_type: str,
data_type: str,
**kwargs: Any,
) -> bool:
index = self.client.tvs_get_index(self.index_name)
if index is not None:
logger.info("Index already exists")
return False
self.client.tvs_create_index(
self.index_name,
dim,
distance_type,
index_type,
data_type,
**kwargs,
)
return True
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> List[str]:
"""Add texts data to an existing index."""
ids = []
keys = kwargs.get("keys", None)
# Write data to tair
pipeline = self.client.pipeline(transaction=False)
embeddings = self.embedding_function.embed_documents(list(texts))
for i, text in enumerate(texts):
# Use provided key otherwise use default key
key = keys[i] if keys else _uuid_key()
metadata = metadatas[i] if metadatas else {}
pipeline.tvs_hset(
self.index_name,
key,
embeddings[i],
False,
**{
self.content_key: text,
self.metadata_key: json.dumps(metadata),
},
)
ids.append(key)
pipeline.execute()
return ids
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
"""
Returns the most similar indexed documents to the query text.
Args:
query (str): The query text for which to find similar documents.
k (int): The number of documents to return. Default is 4.
Returns:
List[Document]: A list of documents that are most similar to the query text.
"""
# Creates embedding vector from user query
embedding = self.embedding_function.embed_query(query)
keys_and_scores = self.client.tvs_knnsearch(
self.index_name, k, embedding, False, None, **kwargs
)
pipeline = self.client.pipeline(transaction=False)
for key, _ in keys_and_scores:
pipeline.tvs_hmget(
self.index_name, key, self.metadata_key, self.content_key
)
docs = pipeline.execute()
return [
Document(
page_content=d[1],
metadata=json.loads(d[0]),
)
for d in docs
]
@classmethod
def from_texts(
cls: Type[Tair],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
index_name: str = "langchain",
content_key: str = "content",
metadata_key: str = "metadata",
**kwargs: Any,
) -> Tair:
try:
from tair import tairvector
except ImportError:
raise ValueError(
"Could not import tair python package. "
"Please install it with `pip install tair`."
)
url = get_from_dict_or_env(kwargs, "tair_url", "TAIR_URL")
if "tair_url" in kwargs:
kwargs.pop("tair_url")
distance_type = tairvector.DistanceMetric.InnerProduct
if "distance_type" in kwargs:
distance_type = kwargs.pop("distance_typ")
index_type = tairvector.IndexType.HNSW
if "index_type" in kwargs:
index_type = kwargs.pop("index_type")
data_type = tairvector.DataType.Float32
if "data_type" in kwargs:
data_type = kwargs.pop("data_type")
index_params = {}
if "index_params" in kwargs:
index_params = kwargs.pop("index_params")
search_params = {}
if "search_params" in kwargs:
search_params = kwargs.pop("search_params")
keys = None
if "keys" in kwargs:
keys = kwargs.pop("keys")
try:
tair_vector_store = cls(
embedding,
url,
index_name,
content_key=content_key,
metadata_key=metadata_key,
search_params=search_params,
**kwargs,
)
except ValueError as e:
raise ValueError(f"tair failed to connect: {e}")
# Create embeddings for documents
embeddings = embedding.embed_documents(texts)
tair_vector_store.create_index_if_not_exist(
len(embeddings[0]),
distance_type,
index_type,
data_type,
**index_params,
)
tair_vector_store.add_texts(texts, metadatas, keys=keys)
return tair_vector_store
@classmethod
def from_documents(
cls,
documents: List[Document],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
index_name: str = "langchain",
content_key: str = "content",
metadata_key: str = "metadata",
**kwargs: Any,
) -> Tair:
texts = [d.page_content for d in documents]
metadatas = [d.metadata for d in documents]
return cls.from_texts(
texts, embedding, metadatas, index_name, content_key, metadata_key, **kwargs
)
@staticmethod
def drop_index(
index_name: str = "langchain",
**kwargs: Any,
) -> bool:
"""
Drop an existing index.
Args:
index_name (str): Name of the index to drop.
Returns:
bool: True if the index is dropped successfully.
"""
try:
from tair import Tair as TairClient
except ImportError:
raise ValueError(
"Could not import tair python package. "
"Please install it with `pip install tair`."
)
url = get_from_dict_or_env(kwargs, "tair_url", "TAIR_URL")
try:
if "tair_url" in kwargs:
kwargs.pop("tair_url")
client = TairClient.from_url(url=url, **kwargs)
except ValueError as e:
raise ValueError(f"Tair connection error: {e}")
# delete index
ret = client.tvs_del_index(index_name)
if ret == 0:
# index not exist
logger.info("Index does not exist")
return False
return True
@classmethod
def from_existing_index(
cls,
embedding: Embeddings,
index_name: str = "langchain",
content_key: str = "content",
metadata_key: str = "metadata",
**kwargs: Any,
) -> Tair:
"""Connect to an existing Tair index."""
url = get_from_dict_or_env(kwargs, "tair_url", "TAIR_URL")
search_params = {}
if "search_params" in kwargs:
search_params = kwargs.pop("search_params")
return cls(
embedding,
url,
index_name,
content_key=content_key,
metadata_key=metadata_key,
search_params=search_params,
**kwargs,
)

@ -126,6 +126,7 @@ python-dotenv = "^1.0.0"
sentence-transformers = "^2"
gptcache = "^0.1.9"
promptlayer = "^0.1.80"
tair = "^1.3.3"
[tool.poetry.group.lint.dependencies]
ruff = "^0.0.249"

@ -0,0 +1,15 @@
"""Test tair functionality."""
from langchain.docstore.document import Document
from langchain.vectorstores.tair import Tair
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
def test_tair() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = Tair.from_texts(
texts, FakeEmbeddings(), tair_url="redis://localhost:6379"
)
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]
Loading…
Cancel
Save