Add Typesense vector store (#1674)

Closes #931.

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
pull/3553/head^2
Jason Bosco 1 year ago committed by GitHub
parent 33929489b9
commit 9c4b43b494
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Typesense\n",
"\n",
"> [Typesense](https://typesense.org) is an open source, in-memory search engine, that you can either [self-host](https://typesense.org/docs/guide/install-typesense.html#option-2-local-machine-self-hosting) or run on [Typesense Cloud](https://cloud.typesense.org/).\n",
">\n",
"> Typesense focuses on performance by storing the entire index in RAM (with a backup on disk) and also focuses on providing an out-of-the-box developer experience by simplifying available options and setting good defaults.\n",
">\n",
"> It also lets you combine attribute-based filtering together with vector queries, to fetch the most relevant documents."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"This notebook shows you how to use Typesense as your VectorStore."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Let's first install our dependencies:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"!pip install typesense openapi-schema-pydantic openai tiktoken"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"import os\n",
"import getpass\n",
"\n",
"os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-23T22:48:02.968822Z",
"start_time": "2023-05-23T22:47:48.574094Z"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import Typesense\n",
"from langchain.document_loaders import TextLoader"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-23T22:50:34.775893Z",
"start_time": "2023-05-23T22:50:34.771889Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"Let's import our test dataset:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [],
"source": [
"loader = TextLoader('../../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-05-23T22:56:19.093489Z",
"start_time": "2023-05-23T22:56:19.089Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"docsearch = Typesense.from_documents(docs,\n",
" embeddings,\n",
" typesense_client_params={\n",
" 'host': 'localhost', # Use xxx.a1.typesense.net for Typesense Cloud\n",
" 'port': '8108', # Use 443 for Typesense Cloud\n",
" 'protocol': 'http', # Use https for Typesense Cloud\n",
" 'typesense_api_key': 'xyz',\n",
" 'typesense_collection_name': 'lang-chain'\n",
" })"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Similarity Search"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"found_docs = docsearch.similarity_search(query)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"print(found_docs[0].page_content)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Typesense as a Retriever\n",
"\n",
"Typesense, as all the other vector stores, is a LangChain Retriever, by using cosine similarity."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"retriever = docsearch.as_retriever()\n",
"retriever"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"retriever.get_relevant_documents(query)[0]"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

@ -17,6 +17,7 @@ from langchain.vectorstores.qdrant import Qdrant
from langchain.vectorstores.redis import Redis from langchain.vectorstores.redis import Redis
from langchain.vectorstores.supabase import SupabaseVectorStore from langchain.vectorstores.supabase import SupabaseVectorStore
from langchain.vectorstores.tair import Tair from langchain.vectorstores.tair import Tair
from langchain.vectorstores.typesense import Typesense
from langchain.vectorstores.weaviate import Weaviate from langchain.vectorstores.weaviate import Weaviate
from langchain.vectorstores.zilliz import Zilliz from langchain.vectorstores.zilliz import Zilliz
@ -43,4 +44,5 @@ __all__ = [
"LanceDB", "LanceDB",
"DocArrayHnswSearch", "DocArrayHnswSearch",
"DocArrayInMemorySearch", "DocArrayInMemorySearch",
"Typesense",
] ]

@ -0,0 +1,270 @@
"""Wrapper around Typesense vector search"""
from __future__ import annotations
import uuid
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.utils import get_from_env
from langchain.vectorstores.base import VectorStore
if TYPE_CHECKING:
from typesense.client import Client
from typesense.collection import Collection
class Typesense(VectorStore):
"""Wrapper around Typesense vector search.
To use, you should have the ``typesense`` python package installed.
Example:
.. code-block:: python
from langchain.embedding.openai import OpenAIEmbeddings
from langchain.vectorstores import Typesense
import typesense
node = {
"host": "localhost", # For Typesense Cloud use xxx.a1.typesense.net
"port": "8108", # For Typesense Cloud use 443
"protocol": "http" # For Typesense Cloud use https
}
typesense_client = typesense.Client(
{
"nodes": [node],
"api_key": "<API_KEY>",
"connection_timeout_seconds": 2
}
)
typesense_collection_name = "langchain-memory"
embedding = OpenAIEmbeddings()
vectorstore = Typesense(
typesense_client,
typesense_collection_name,
embedding.embed_query,
"text",
)
"""
def __init__(
self,
typesense_client: Client,
embedding: Embeddings,
*,
typesense_collection_name: Optional[str] = None,
text_key: str = "text",
):
"""Initialize with Typesense client."""
try:
from typesense import Client
except ImportError:
raise ValueError(
"Could not import typesense python package. "
"Please install it with `pip install typesense`."
)
if not isinstance(typesense_client, Client):
raise ValueError(
f"typesense_client should be an instance of typesense.Client, "
f"got {type(typesense_client)}"
)
self._typesense_client = typesense_client
self._embedding = embedding
self._typesense_collection_name = (
typesense_collection_name or f"langchain-{str(uuid.uuid4())}"
)
self._text_key = text_key
@property
def _collection(self) -> Collection:
return self._typesense_client.collections[self._typesense_collection_name]
def _prep_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]],
ids: Optional[List[str]],
) -> List[dict]:
"""Embed and create the documents"""
_ids = ids or (str(uuid.uuid4()) for _ in texts)
_metadatas: Iterable[dict] = metadatas or ({} for _ in texts)
embedded_texts = self._embedding.embed_documents(list(texts))
return [
{"id": _id, "vec": vec, f"{self._text_key}": text, "metadata": metadata}
for _id, vec, text, metadata in zip(_ids, embedded_texts, texts, _metadatas)
]
def _create_collection(self, num_dim: int) -> None:
fields = [
{"name": "vec", "type": "float[]", "num_dim": num_dim},
{"name": f"{self._text_key}", "type": "string"},
{"name": ".*", "type": "auto"},
]
self._typesense_client.collections.create(
{"name": self._typesense_collection_name, "fields": fields}
)
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embedding and add to the vectorstore.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
ids: Optional list of ids to associate with the texts.
Returns:
List of ids from adding the texts into the vectorstore.
"""
from typesense.exceptions import ObjectNotFound
docs = self._prep_texts(texts, metadatas, ids)
try:
self._collection.documents.import_(docs, {"action": "upsert"})
except ObjectNotFound:
# Create the collection if it doesn't already exist
self._create_collection(len(docs[0]["vec"]))
self._collection.documents.import_(docs, {"action": "upsert"})
return [doc["id"] for doc in docs]
def similarity_search_with_score(
self,
query: str,
k: int = 4,
filter: Optional[str] = "",
) -> List[Tuple[Document, float]]:
"""Return typesense documents most similar to query, along with scores.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: typesense filter_by expression to filter documents on
Returns:
List of Documents most similar to the query and score for each
"""
embedded_query = [str(x) for x in self._embedding.embed_query(query)]
query_obj = {
"q": "*",
"vector_query": f'vec:([{",".join(embedded_query)}], k:{k})',
"filter_by": filter,
"collection": self._typesense_collection_name,
}
docs = []
response = self._typesense_client.multi_search.perform(
{"searches": [query_obj]}, {}
)
for hit in response["results"][0]["hits"]:
document = hit["document"]
metadata = document["metadata"]
text = document[self._text_key]
score = hit["vector_distance"]
docs.append((Document(page_content=text, metadata=metadata), score))
return docs
def similarity_search(
self,
query: str,
k: int = 4,
filter: Optional[str] = "",
**kwargs: Any,
) -> List[Document]:
"""Return typesense documents most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: typesense filter_by expression to filter documents on
Returns:
List of Documents most similar to the query and score for each
"""
docs_and_score = self.similarity_search_with_score(query, k=k, filter=filter)
return [doc for doc, _ in docs_and_score]
@classmethod
def from_client_params(
cls,
embedding: Embeddings,
*,
host: str = "localhost",
port: Union[str, int] = "8108",
protocol: str = "http",
typesense_api_key: Optional[str] = None,
connection_timeout_seconds: int = 2,
**kwargs: Any,
) -> Typesense:
"""Initialize Typesense directly from client parameters.
Example:
.. code-block:: python
from langchain.embedding.openai import OpenAIEmbeddings
from langchain.vectorstores import Typesense
# Pass in typesense_api_key as kwarg or set env var "TYPESENSE_API_KEY".
vectorstore = Typesense(
OpenAIEmbeddings(),
host="localhost",
port="8108",
protocol="http",
typesense_collection_name="langchain-memory",
)
"""
try:
from typesense import Client
except ImportError:
raise ValueError(
"Could not import typesense python package. "
"Please install it with `pip install typesense`."
)
node = {
"host": host,
"port": str(port),
"protocol": protocol,
}
typesense_api_key = typesense_api_key or get_from_env(
"typesense_api_key", "TYPESENSE_API_KEY"
)
client_config = {
"nodes": [node],
"api_key": typesense_api_key,
"connection_timeout_seconds": connection_timeout_seconds,
}
return cls(Client(client_config), embedding, **kwargs)
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
typesense_client: Optional[Client] = None,
typesense_client_params: Optional[dict] = None,
typesense_collection_name: Optional[str] = None,
text_key: str = "text",
**kwargs: Any,
) -> Typesense:
"""Construct Typesense wrapper from raw text."""
if typesense_client:
vectorstore = cls(typesense_client, embedding, **kwargs)
elif typesense_client_params:
vectorstore = cls.from_client_params(
embedding, **typesense_client_params, **kwargs
)
else:
raise ValueError(
"Must specify one of typesense_client or typesense_client_params."
)
vectorstore.add_texts(texts, metadatas=metadatas, ids=ids)
return vectorstore
Loading…
Cancel
Save