mirror of https://github.com/hwchase17/langchain
Add Typesense vector store (#1674)
Closes #931. --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>pull/3553/head^2
parent
33929489b9
commit
9c4b43b494
@ -0,0 +1,229 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"# Typesense\n",
|
||||||
|
"\n",
|
||||||
|
"> [Typesense](https://typesense.org) is an open source, in-memory search engine, that you can either [self-host](https://typesense.org/docs/guide/install-typesense.html#option-2-local-machine-self-hosting) or run on [Typesense Cloud](https://cloud.typesense.org/).\n",
|
||||||
|
">\n",
|
||||||
|
"> Typesense focuses on performance by storing the entire index in RAM (with a backup on disk) and also focuses on providing an out-of-the-box developer experience by simplifying available options and setting good defaults.\n",
|
||||||
|
">\n",
|
||||||
|
"> It also lets you combine attribute-based filtering together with vector queries, to fetch the most relevant documents."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"This notebook shows you how to use Typesense as your VectorStore."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"Let's first install our dependencies:"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip install typesense openapi-schema-pydantic openai tiktoken"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import getpass\n",
|
||||||
|
"\n",
|
||||||
|
"os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-05-23T22:48:02.968822Z",
|
||||||
|
"start_time": "2023-05-23T22:47:48.574094Z"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||||
|
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||||
|
"from langchain.vectorstores import Typesense\n",
|
||||||
|
"from langchain.document_loaders import TextLoader"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-05-23T22:50:34.775893Z",
|
||||||
|
"start_time": "2023-05-23T22:50:34.771889Z"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"Let's import our test dataset:"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = TextLoader('../../../state_of_the_union.txt')\n",
|
||||||
|
"documents = loader.load()\n",
|
||||||
|
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||||
|
"docs = text_splitter.split_documents(documents)\n",
|
||||||
|
"\n",
|
||||||
|
"embeddings = OpenAIEmbeddings()"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-05-23T22:56:19.093489Z",
|
||||||
|
"start_time": "2023-05-23T22:56:19.089Z"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docsearch = Typesense.from_documents(docs,\n",
|
||||||
|
" embeddings,\n",
|
||||||
|
" typesense_client_params={\n",
|
||||||
|
" 'host': 'localhost', # Use xxx.a1.typesense.net for Typesense Cloud\n",
|
||||||
|
" 'port': '8108', # Use 443 for Typesense Cloud\n",
|
||||||
|
" 'protocol': 'http', # Use https for Typesense Cloud\n",
|
||||||
|
" 'typesense_api_key': 'xyz',\n",
|
||||||
|
" 'typesense_collection_name': 'lang-chain'\n",
|
||||||
|
" })"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Similarity Search"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||||
|
"found_docs = docsearch.similarity_search(query)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"print(found_docs[0].page_content)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Typesense as a Retriever\n",
|
||||||
|
"\n",
|
||||||
|
"Typesense, as all the other vector stores, is a LangChain Retriever, by using cosine similarity."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"retriever = docsearch.as_retriever()\n",
|
||||||
|
"retriever"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||||
|
"retriever.get_relevant_documents(query)[0]"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 2
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython2",
|
||||||
|
"version": "2.7.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
|
}
|
@ -0,0 +1,270 @@
|
|||||||
|
"""Wrapper around Typesense vector search"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.embeddings.base import Embeddings
|
||||||
|
from langchain.utils import get_from_env
|
||||||
|
from langchain.vectorstores.base import VectorStore
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from typesense.client import Client
|
||||||
|
from typesense.collection import Collection
|
||||||
|
|
||||||
|
|
||||||
|
class Typesense(VectorStore):
    """Wrapper around Typesense vector search.

    To use, you should have the ``typesense`` python package installed.

    Example:
        .. code-block:: python

            from langchain.embeddings.openai import OpenAIEmbeddings
            from langchain.vectorstores import Typesense
            import typesense

            node = {
                "host": "localhost",  # For Typesense Cloud use xxx.a1.typesense.net
                "port": "8108",       # For Typesense Cloud use 443
                "protocol": "http"    # For Typesense Cloud use https
            }
            typesense_client = typesense.Client(
                {
                  "nodes": [node],
                  "api_key": "<API_KEY>",
                  "connection_timeout_seconds": 2
                }
            )
            typesense_collection_name = "langchain-memory"

            embedding = OpenAIEmbeddings()
            vectorstore = Typesense(
                typesense_client,
                embedding,
                typesense_collection_name=typesense_collection_name,
                text_key="text",
            )
    """

    def __init__(
        self,
        typesense_client: Client,
        embedding: Embeddings,
        *,
        typesense_collection_name: Optional[str] = None,
        text_key: str = "text",
    ) -> None:
        """Initialize with Typesense client.

        Args:
            typesense_client: An already configured ``typesense.Client``.
            embedding: Embeddings object used for documents and queries.
            typesense_collection_name: Collection to read from / write to.
                When omitted, a ``langchain-<uuid4>`` name is generated.
            text_key: Name of the document field that stores the raw text.

        Raises:
            ValueError: If the ``typesense`` package cannot be imported or
                ``typesense_client`` is not a ``typesense.Client``.
        """
        try:
            from typesense import Client
        except ImportError:
            raise ValueError(
                "Could not import typesense python package. "
                "Please install it with `pip install typesense`."
            )
        if not isinstance(typesense_client, Client):
            raise ValueError(
                "typesense_client should be an instance of typesense.Client, "
                f"got {type(typesense_client)}"
            )
        self._typesense_client = typesense_client
        self._embedding = embedding
        self._typesense_collection_name = (
            typesense_collection_name or f"langchain-{str(uuid.uuid4())}"
        )
        self._text_key = text_key

    @property
    def _collection(self) -> Collection:
        """Return the client's handle for this store's collection name."""
        return self._typesense_client.collections[self._typesense_collection_name]

    def _prep_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]],
        ids: Optional[List[str]],
    ) -> List[dict]:
        """Embed the texts and build the documents to import.

        Args:
            texts: Texts to embed; any iterable, including one-shot generators.
            metadatas: Optional metadata dicts, one per text. Empty dicts are
                used when this is ``None`` or empty.
            ids: Optional document ids, one per text. Random UUID4 strings are
                generated when this is ``None`` or empty.

        Returns:
            One dict per text with the keys ``id``, ``vec``, the configured
            text key, and ``metadata``.
        """
        # Materialize once: ``texts`` is walked several times below, and a
        # one-shot iterable would otherwise be exhausted after the first pass.
        text_list = list(texts)
        _ids = ids or [str(uuid.uuid4()) for _ in text_list]
        _metadatas: Iterable[dict] = metadatas or [{} for _ in text_list]
        embedded_texts = self._embedding.embed_documents(text_list)
        return [
            {"id": _id, "vec": vec, f"{self._text_key}": text, "metadata": metadata}
            for _id, vec, text, metadata in zip(
                _ids, embedded_texts, text_list, _metadatas
            )
        ]

    def _create_collection(self, num_dim: int) -> None:
        """Create the collection with a vector field of ``num_dim`` dimensions.

        The schema declares the ``vec`` float array, the text field as a
        string, and a ``.*`` wildcard field of type ``auto`` for the rest.
        """
        fields = [
            {"name": "vec", "type": "float[]", "num_dim": num_dim},
            {"name": f"{self._text_key}", "type": "string"},
            {"name": ".*", "type": "auto"},
        ]
        self._typesense_client.collections.create(
            {"name": self._typesense_collection_name, "fields": fields}
        )

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embedding and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore. Empty when
            there was nothing to add.

        """
        from typesense.exceptions import ObjectNotFound

        docs = self._prep_texts(texts, metadatas, ids)
        if not docs:
            # Nothing to import; this also avoids indexing docs[0] below when
            # the collection does not exist yet.
            return []
        try:
            self._collection.documents.import_(docs, {"action": "upsert"})
        except ObjectNotFound:
            # Create the collection if it doesn't already exist, sizing the
            # vector field from the first embedded document.
            self._create_collection(len(docs[0]["vec"]))
            self._collection.documents.import_(docs, {"action": "upsert"})
        return [doc["id"] for doc in docs]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[str] = "",
    ) -> List[Tuple[Document, float]]:
        """Return typesense documents most similar to query, along with scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: typesense filter_by expression to filter documents on.
                ``None`` is treated like the empty-string default.

        Returns:
            List of (Document, score) tuples, where score is the
            ``vector_distance`` reported for each hit.
        """
        embedded_query = [str(x) for x in self._embedding.embed_query(query)]
        query_obj = {
            "q": "*",
            "vector_query": f'vec:([{",".join(embedded_query)}], k:{k})',
            # Normalize None so ``filter_by`` is always sent as a string.
            "filter_by": filter or "",
            "collection": self._typesense_collection_name,
        }
        response = self._typesense_client.multi_search.perform(
            {"searches": [query_obj]}, {}
        )
        docs = []
        # A single search was submitted, so only the first result set is read.
        for hit in response["results"][0]["hits"]:
            document = hit["document"]
            metadata = document["metadata"]
            text = document[self._text_key]
            score = hit["vector_distance"]
            docs.append((Document(page_content=text, metadata=metadata), score))
        return docs

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[str] = "",
        **kwargs: Any,
    ) -> List[Document]:
        """Return typesense documents most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: typesense filter_by expression to filter documents on

        Returns:
            List of Documents most similar to the query (scores are dropped).
        """
        docs_and_score = self.similarity_search_with_score(query, k=k, filter=filter)
        return [doc for doc, _ in docs_and_score]

    @classmethod
    def from_client_params(
        cls,
        embedding: Embeddings,
        *,
        host: str = "localhost",
        port: Union[str, int] = "8108",
        protocol: str = "http",
        typesense_api_key: Optional[str] = None,
        connection_timeout_seconds: int = 2,
        **kwargs: Any,
    ) -> Typesense:
        """Initialize Typesense directly from client parameters.

        Args:
            embedding: Embeddings object used for documents and queries.
            host: Host of the Typesense node.
            port: Port of the Typesense node; converted to a string.
            protocol: Protocol used to reach the node.
            typesense_api_key: API key; falls back to the
                ``TYPESENSE_API_KEY`` environment variable when not given.
            connection_timeout_seconds: Connection timeout for the client.
            **kwargs: Forwarded to the constructor (for example
                ``typesense_collection_name`` and ``text_key``).

        Raises:
            ValueError: If the ``typesense`` package cannot be imported.

        Example:
            .. code-block:: python

                from langchain.embeddings.openai import OpenAIEmbeddings
                from langchain.vectorstores import Typesense

                # Pass in typesense_api_key as kwarg or set env var "TYPESENSE_API_KEY".
                vectorstore = Typesense.from_client_params(
                    OpenAIEmbeddings(),
                    host="localhost",
                    port="8108",
                    protocol="http",
                    typesense_collection_name="langchain-memory",
                )
        """
        try:
            from typesense import Client
        except ImportError:
            raise ValueError(
                "Could not import typesense python package. "
                "Please install it with `pip install typesense`."
            )

        node = {
            "host": host,
            "port": str(port),
            "protocol": protocol,
        }
        typesense_api_key = typesense_api_key or get_from_env(
            "typesense_api_key", "TYPESENSE_API_KEY"
        )
        client_config = {
            "nodes": [node],
            "api_key": typesense_api_key,
            "connection_timeout_seconds": connection_timeout_seconds,
        }
        return cls(Client(client_config), embedding, **kwargs)

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        typesense_client: Optional[Client] = None,
        typesense_client_params: Optional[dict] = None,
        typesense_collection_name: Optional[str] = None,
        text_key: str = "text",
        **kwargs: Any,
    ) -> Typesense:
        """Construct Typesense wrapper from raw text.

        Args:
            texts: Texts to embed and store.
            embedding: Embeddings object used for documents and queries.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            typesense_client: Existing client to use; takes priority over
                ``typesense_client_params``.
            typesense_client_params: Keyword arguments for
                ``from_client_params``, used when no client is given.
            typesense_collection_name: Collection name for the new store.
            text_key: Name of the document field that stores the raw text.

        Returns:
            The vector store, after ``texts`` have been added to it.

        Raises:
            ValueError: If neither ``typesense_client`` nor
                ``typesense_client_params`` is provided.
        """
        # Forward the collection name and text key explicitly; they are named
        # parameters here, so they never show up in ``kwargs`` on their own.
        store_kwargs = {
            "typesense_collection_name": typesense_collection_name,
            "text_key": text_key,
            **kwargs,
        }
        if typesense_client:
            vectorstore = cls(typesense_client, embedding, **store_kwargs)
        elif typesense_client_params:
            # Keys given in ``typesense_client_params`` win on overlap, so
            # params dicts that carry the collection name keep working.
            vectorstore = cls.from_client_params(
                embedding, **{**store_kwargs, **typesense_client_params}
            )
        else:
            raise ValueError(
                "Must specify one of typesense_client or typesense_client_params."
            )
        vectorstore.add_texts(texts, metadatas=metadatas, ids=ids)
        return vectorstore
|
Loading…
Reference in New Issue