diff --git a/docs/modules/indexes/vectorstores/examples/typesense.ipynb b/docs/modules/indexes/vectorstores/examples/typesense.ipynb new file mode 100644 index 0000000000..f9b57fd1a8 --- /dev/null +++ b/docs/modules/indexes/vectorstores/examples/typesense.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Typesense\n", + "\n", + "> [Typesense](https://typesense.org) is an open source, in-memory search engine, that you can either [self-host](https://typesense.org/docs/guide/install-typesense.html#option-2-local-machine-self-hosting) or run on [Typesense Cloud](https://cloud.typesense.org/).\n", + ">\n", + "> Typesense focuses on performance by storing the entire index in RAM (with a backup on disk) and also focuses on providing an out-of-the-box developer experience by simplifying available options and setting good defaults.\n", + ">\n", + "> It also lets you combine attribute-based filtering together with vector queries, to fetch the most relevant documents." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "This notebook shows you how to use Typesense as your VectorStore." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's first install our dependencies:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "!pip install typesense openapi-schema-pydantic openai tiktoken" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-05-23T22:48:02.968822Z", + "start_time": "2023-05-23T22:47:48.574094Z" + } + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import Typesense\n", + "from langchain.document_loaders import TextLoader" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-05-23T22:50:34.775893Z", + "start_time": "2023-05-23T22:50:34.771889Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's import our test dataset:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 19, + "outputs": [], + "source": [ + "loader = TextLoader('../../../state_of_the_union.txt')\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-05-23T22:56:19.093489Z", + "start_time": "2023-05-23T22:56:19.089Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "docsearch = Typesense.from_documents(docs,\n", + " embeddings,\n", + " typesense_client_params={\n", + " 'host': 'localhost', # Use xxx.a1.typesense.net for Typesense Cloud\n", + " 'port': '8108', # Use 443 for Typesense Cloud\n", + " 'protocol': 'http', # Use https for Typesense Cloud\n", + " 'typesense_api_key': 'xyz',\n", + " 'typesense_collection_name': 
'lang-chain'\n", + " })" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Similarity Search" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "found_docs = docsearch.similarity_search(query)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(found_docs[0].page_content)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Typesense as a Retriever\n", + "\n", + "Typesense, like all the other vector stores, can be used as a LangChain Retriever; it ranks documents by cosine similarity." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "retriever = docsearch.as_retriever()\n", + "retriever" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "retriever.get_relevant_documents(query)[0]" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index ade924590c..0b5755af62 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -17,6 +17,7 @@ from langchain.vectorstores.qdrant
class Typesense(VectorStore):
    """Wrapper around Typesense vector search.

    To use, you should have the ``typesense`` python package installed.

    Each stored document has four fields: ``id``, ``vec`` (the embedding),
    the text under ``text_key`` and a ``metadata`` object.

    Example:
        .. code-block:: python

            from langchain.embeddings.openai import OpenAIEmbeddings
            from langchain.vectorstores import Typesense
            import typesense

            node = {
                "host": "localhost",  # For Typesense Cloud use xxx.a1.typesense.net
                "port": "8108",       # For Typesense Cloud use 443
                "protocol": "http"    # For Typesense Cloud use https
            }
            typesense_client = typesense.Client(
                {
                  "nodes": [node],
                  "api_key": "",
                  "connection_timeout_seconds": 2
                }
            )
            typesense_collection_name = "langchain-memory"

            embedding = OpenAIEmbeddings()
            vectorstore = Typesense(
                typesense_client,
                embedding,
                typesense_collection_name=typesense_collection_name,
                text_key="text",
            )
    """

    def __init__(
        self,
        typesense_client: Client,
        embedding: Embeddings,
        *,
        typesense_collection_name: Optional[str] = None,
        text_key: str = "text",
    ):
        """Initialize with Typesense client.

        Args:
            typesense_client: An already configured ``typesense.Client``.
            embedding: Embeddings implementation used for documents and queries.
            typesense_collection_name: Collection to read from and write to.
                When omitted, a unique ``langchain-<uuid4>`` name is generated.
            text_key: Name of the document field that stores the raw text.

        Raises:
            ValueError: If the ``typesense`` package is not installed, or if
                ``typesense_client`` is not a ``typesense.Client`` instance.
        """
        try:
            from typesense import Client
        except ImportError as exc:
            raise ValueError(
                "Could not import typesense python package. "
                "Please install it with `pip install typesense`."
            ) from exc
        if not isinstance(typesense_client, Client):
            raise ValueError(
                f"typesense_client should be an instance of typesense.Client, "
                f"got {type(typesense_client)}"
            )
        self._typesense_client = typesense_client
        self._embedding = embedding
        self._typesense_collection_name = (
            typesense_collection_name or f"langchain-{str(uuid.uuid4())}"
        )
        self._text_key = text_key

    @property
    def _collection(self) -> Collection:
        """Client handle for the collection this store is bound to."""
        return self._typesense_client.collections[self._typesense_collection_name]

    def _prep_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]],
        ids: Optional[List[str]],
    ) -> List[dict]:
        """Embed the texts and build the documents to import.

        Args:
            texts: Texts to embed; any iterable, including a generator.
            metadatas: Optional per-text metadata; defaults to ``{}`` for each.
            ids: Optional per-text ids; defaults to a fresh uuid4 for each.

        Returns:
            One dict per text with the keys ``id``, ``vec``, the configured
            text key and ``metadata``.

        Raises:
            ValueError: If ``ids`` or ``metadatas`` is given with a length
                different from the number of texts.
        """
        # Materialize once: the texts are needed for embedding and again for
        # building the documents, which would exhaust a one-shot iterator.
        text_list = list(texts)
        if ids and len(ids) != len(text_list):
            raise ValueError(
                f"Got {len(ids)} ids for {len(text_list)} texts; "
                "lengths must match."
            )
        if metadatas and len(metadatas) != len(text_list):
            raise ValueError(
                f"Got {len(metadatas)} metadatas for {len(text_list)} texts; "
                "lengths must match."
            )
        _ids: List[str] = ids or [str(uuid.uuid4()) for _ in text_list]
        _metadatas: List[dict] = metadatas or [{} for _ in text_list]
        embedded_texts = self._embedding.embed_documents(text_list)
        return [
            {"id": _id, "vec": vec, self._text_key: text, "metadata": metadata}
            for _id, vec, text, metadata in zip(
                _ids, embedded_texts, text_list, _metadatas
            )
        ]

    def _create_collection(self, num_dim: int) -> None:
        """Create the backing collection for vectors of ``num_dim`` dimensions."""
        fields = [
            {"name": "vec", "type": "float[]", "num_dim": num_dim},
            {"name": self._text_key, "type": "string"},
            # Catch-all so any remaining fields are typed automatically.
            {"name": ".*", "type": "auto"},
        ]
        self._typesense_client.collections.create(
            {"name": self._typesense_collection_name, "fields": fields}
        )

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embedding and add to the vectorstore.

        Documents are upserted; the collection is created on first use if it
        does not exist yet.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.

        """
        from typesense.exceptions import ObjectNotFound

        docs = self._prep_texts(texts, metadatas, ids)
        if not docs:
            # Nothing to import, and no vector to infer the dimension from.
            return []
        try:
            self._collection.documents.import_(docs, {"action": "upsert"})
        except ObjectNotFound:
            # Create the collection if it doesn't already exist
            self._create_collection(len(docs[0]["vec"]))
            self._collection.documents.import_(docs, {"action": "upsert"})
        return [doc["id"] for doc in docs]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[str] = "",
    ) -> List[Tuple[Document, float]]:
        """Return typesense documents most similar to query, along with scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: typesense filter_by expression to filter documents on

        Returns:
            List of ``(Document, score)`` tuples, where the score is the
            ``vector_distance`` value Typesense reports for the hit.
        """
        embedded_query = [str(x) for x in self._embedding.embed_query(query)]
        query_obj = {
            "q": "*",
            "vector_query": f'vec:([{",".join(embedded_query)}], k:{k})',
            # ``filter`` is Optional; send an empty expression rather than None.
            "filter_by": filter or "",
            "collection": self._typesense_collection_name,
        }
        response = self._typesense_client.multi_search.perform(
            {"searches": [query_obj]}, {}
        )
        docs = []
        for hit in response["results"][0]["hits"]:
            document = hit["document"]
            metadata = document["metadata"]
            text = document[self._text_key]
            score = hit["vector_distance"]
            docs.append((Document(page_content=text, metadata=metadata), score))
        return docs

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[str] = "",
        **kwargs: Any,
    ) -> List[Document]:
        """Return typesense documents most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: typesense filter_by expression to filter documents on

        Returns:
            List of Documents most similar to the query (scores are dropped).
        """
        docs_and_score = self.similarity_search_with_score(query, k=k, filter=filter)
        return [doc for doc, _ in docs_and_score]

    @classmethod
    def from_client_params(
        cls,
        embedding: Embeddings,
        *,
        host: str = "localhost",
        port: Union[str, int] = "8108",
        protocol: str = "http",
        typesense_api_key: Optional[str] = None,
        connection_timeout_seconds: int = 2,
        **kwargs: Any,
    ) -> Typesense:
        """Initialize Typesense directly from client parameters.

        Args:
            embedding: Embeddings implementation used for documents and queries.
            host: Typesense node host.
            port: Typesense node port; converted to ``str`` for the client.
            protocol: ``http`` or ``https``.
            typesense_api_key: API key; read from the ``TYPESENSE_API_KEY``
                environment variable when omitted.
            connection_timeout_seconds: Client connection timeout.
            **kwargs: Forwarded to the constructor
                (``typesense_collection_name``, ``text_key``).

        Raises:
            ValueError: If the ``typesense`` package is not installed.

        Example:
            .. code-block:: python

                from langchain.embeddings.openai import OpenAIEmbeddings
                from langchain.vectorstores import Typesense

                # Pass in typesense_api_key as kwarg or set env var "TYPESENSE_API_KEY".
                vectorstore = Typesense.from_client_params(
                    OpenAIEmbeddings(),
                    host="localhost",
                    port="8108",
                    protocol="http",
                    typesense_collection_name="langchain-memory",
                )
        """
        try:
            from typesense import Client
        except ImportError as exc:
            raise ValueError(
                "Could not import typesense python package. "
                "Please install it with `pip install typesense`."
            ) from exc

        node = {
            "host": host,
            "port": str(port),
            "protocol": protocol,
        }
        typesense_api_key = typesense_api_key or get_from_env(
            "typesense_api_key", "TYPESENSE_API_KEY"
        )
        client_config = {
            "nodes": [node],
            "api_key": typesense_api_key,
            "connection_timeout_seconds": connection_timeout_seconds,
        }
        return cls(Client(client_config), embedding, **kwargs)

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        typesense_client: Optional[Client] = None,
        typesense_client_params: Optional[dict] = None,
        typesense_collection_name: Optional[str] = None,
        text_key: str = "text",
        **kwargs: Any,
    ) -> Typesense:
        """Construct Typesense wrapper from raw text.

        Args:
            texts: Texts to embed and store.
            embedding: Embeddings implementation used for documents and queries.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            typesense_client: Existing client; takes precedence over
                ``typesense_client_params`` when both are given.
            typesense_client_params: Keyword arguments for
                ``from_client_params``. Keys given here (including
                ``typesense_collection_name`` and ``text_key``) override the
                same-named arguments of this method.
            typesense_collection_name: Collection to store the texts in.
            text_key: Name of the document field that stores the raw text.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            A ``Typesense`` store with the texts already added.

        Raises:
            ValueError: If neither ``typesense_client`` nor
                ``typesense_client_params`` is provided.
        """
        # Forward the explicit store options; previously they were accepted
        # but never reached the constructor.
        init_kwargs: dict = {
            "typesense_collection_name": typesense_collection_name,
            "text_key": text_key,
            **kwargs,
        }
        if typesense_client:
            vectorstore = cls(typesense_client, embedding, **init_kwargs)
        elif typesense_client_params:
            # Values inside typesense_client_params win, which keeps existing
            # callers that set the collection name there working unchanged.
            vectorstore = cls.from_client_params(
                embedding, **{**init_kwargs, **typesense_client_params}
            )
        else:
            raise ValueError(
                "Must specify one of typesense_client or typesense_client_params."
            )
        vectorstore.add_texts(texts, metadatas=metadatas, ids=ids)
        return vectorstore