From a0064330b153bc7e01e2faadd5554e4c011425e9 Mon Sep 17 00:00:00 2001 From: Karim Lalani Date: Fri, 15 Dec 2023 15:34:51 -0600 Subject: [PATCH] community[minor]: Add SurrealDB vectorstore (#13331) **Description:** Vectorstore implementation around [SurrealDB](https://www.surrealdb.com) --------- Co-authored-by: Bagatur --- .../integrations/vectorstores/surrealdb.ipynb | 288 ++++++++++++ .../vectorstores/__init__.py | 9 + .../vectorstores/surrealdb.py | 434 ++++++++++++++++++ .../vectorstores/test_public_api.py | 1 + 4 files changed, 732 insertions(+) create mode 100644 docs/docs/integrations/vectorstores/surrealdb.ipynb create mode 100644 libs/community/langchain_community/vectorstores/surrealdb.py diff --git a/docs/docs/integrations/vectorstores/surrealdb.ipynb b/docs/docs/integrations/vectorstores/surrealdb.ipynb new file mode 100644 index 0000000000..a65378697b --- /dev/null +++ b/docs/docs/integrations/vectorstores/surrealdb.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a3afefb0-7e99-4912-a222-c6b186da11af", + "metadata": {}, + "source": [ + "# SurrealDB\n", + "\n", + ">[SurrealDB](https://surrealdb.com/) is an end-to-end cloud-native database designed for modern applications, including web, mobile, serverless, Jamstack, backend, and traditional applications. 
With SurrealDB, you can simplify your database and API infrastructure, reduce development time, and build secure, performant apps quickly and cost-effectively.\n", + ">\n", + ">**Key features of SurrealDB include:**\n", + ">\n", + ">* **Reduces development time:** SurrealDB simplifies your database and API stack by removing the need for most server-side components, allowing you to build secure, performant apps faster and cheaper.\n", + ">* **Real-time collaborative API backend service:** SurrealDB functions as both a database and an API backend service, enabling real-time collaboration.\n", + ">* **Support for multiple querying languages:** SurrealDB supports SQL querying from client devices, GraphQL, ACID transactions, WebSocket connections, structured and unstructured data, graph querying, full-text indexing, and geospatial querying.\n", + ">* **Granular access control:** SurrealDB provides row-level permissions-based access control, giving you the ability to manage data access with precision.\n", + ">\n", + ">View the [features](https://surrealdb.com/features), the latest [releases](https://surrealdb.com/releases), and [documentation](https://surrealdb.com/docs).\n", + "\n", + "This notebook shows how to use functionality related to the `SurrealDBStore`." + ] + }, + { + "cell_type": "markdown", + "id": "5031a3ec", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Uncomment the below cells to install surrealdb." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cd7391f-7759-4a21-952a-2ec972d818c6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# !pip install -U surrealdb langchain langchain-community" + ] + }, + { + "cell_type": "markdown", + "id": "6e57a389-f637-4b8f-9ab2-759ae7485f78", + "metadata": {}, + "source": [ + "## Using SurrealDBStore" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e49be085-ddf1-4028-8c0c-97836ce4a873", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain_community.document_loaders import TextLoader\n", + "from langchain_community.embeddings import HuggingFaceEmbeddings\n", + "from langchain_community.vectorstores import SurrealDBStore" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "38222aee-adc5-44c2-913c-97977b394cf5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "documents = TextLoader(\"../../modules/state_of_the_union.txt\").load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = HuggingFaceEmbeddings()" + ] + }, + { + "cell_type": "markdown", + "id": "8e240306-803c-4c1a-b036-b9fc69eb6cba", + "metadata": {}, + "source": [ + "### Creating a SurrealDBStore object" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ff9d0304-1e11-4db2-9454-1350db7907e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['documents:th7j29cjsx6495wluo7e',\n", + " 'documents:qkqhhjnl7ahbhr07euky',\n", + " 'documents:8kd6xw8o7y0l171iqry0',\n", + " 'documents:33ejf42dlkmavol9si74',\n", + " 'documents:f7y4dbs7eitqz58xt1p5']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db = SurrealDBStore(\n", + " dburl=\"http://localhost:8000/rpc\", # url for the 
hosted SurrealDB database\n", + " embedding_function=embeddings,\n", + " db_user=\"root\", # SurrealDB credentials if needed: db username\n", + " db_pass=\"root\", # SurrealDB credentials if needed: db password\n", + " # ns=\"langchain\", # namespace to use for vectorstore\n", + " # db=\"database\", # database to use for vectorstore\n", + " # collection=\"documents\", #collection to use for vectorstore\n", + ")\n", + "\n", + "# this is needed to initialize the underlying async library for SurrealDB\n", + "await db.initialize()\n", + "\n", + "# delete all existing documents from the vectorstore collection\n", + "await db.adelete()\n", + "\n", + "# add documents to the vectorstore\n", + "ids = await db.aadd_documents(docs)\n", + "\n", + "# document ids of the added documents\n", + "ids[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "94a742a9-9507-4076-9cc3-616a4ed6866f", + "metadata": {}, + "source": [ + "### (alternatively) Create a SurrealDBStore object and add documents" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "73d66563-4e1f-4edf-9e95-5fc9adcfa2cb", + "metadata": {}, + "outputs": [], + "source": [ + "await db.adelete()\n", + "\n", + "db = await SurrealDBStore.afrom_documents(\n", + " dburl=\"http://localhost:8000/rpc\", # url for the hosted SurrealDB database\n", + " embedding=embeddings,\n", + " documents=docs,\n", + " db_user=\"root\", # SurrealDB credentials if needed: db username\n", + " db_pass=\"root\", # SurrealDB credentials if needed: db password\n", + " # ns=\"langchain\", # namespace to use for vectorstore\n", + " # db=\"database\", # database to use for vectorstore\n", + " # collection=\"documents\", #collection to use for vectorstore\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "efbb6684-3846-4332-a624-ddd4d75844c1", + "metadata": {}, + "source": [ + "### Similarity search" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "aa28a7f8-41d0-4299-84eb-91d1576e8a63", + "metadata": { + "tags": 
[] + }, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = await db.asimilarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1eb16d2a-b466-456a-b412-5e74bb8523dd", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], + "source": [ + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "43896697-f99e-47b6-9117-47a25e9afa9c", + "metadata": {}, + "source": [ + "### Similarity search with score" + ] + }, + { + "cell_type": "markdown", + "id": "414a9bc9", + "metadata": {}, + "source": [ + "The returned score is cosine similarity. Therefore, a higher score is better." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8e9eef05-1516-469a-ad36-880c69aef7a9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = await db.asimilarity_search_with_score(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bd5fb0e4-2a94-4bb4-af8a-27327ecb1a7f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
def _import_surrealdb() -> Any:
    """Import the SurrealDB vectorstore module on demand and return its class.

    The import statement lives inside the function body, so the
    ``surrealdb`` vectorstore module is only loaded when this function is
    called.

    Returns:
        The ``SurrealDBStore`` class.
    """
    from langchain_community.vectorstores.surrealdb import (
        SurrealDBStore as surreal_store_cls,
    )

    return surreal_store_cls
class SurrealDBStore(VectorStore):
    """SurrealDB as Vector Store.

    To use, you should have the ``surrealdb`` python package installed.

    The underlying client is asynchronous. The ``a``-prefixed methods expect
    ``initialize()`` to have been awaited first; the synchronous methods call
    ``initialize()`` themselves inside a fresh ``asyncio.run`` event loop.

    Args:
        embedding_function: Embedding function to use.
        dburl: SurrealDB connection url (default: "ws://localhost:8000/rpc")
        ns: surrealdb namespace for the vector store. (default: "langchain")
        db: surrealdb database for the vector store. (default: "database")
        collection: surrealdb collection for the vector store.
            (default: "documents")

        (optional) db_user and db_pass: surrealdb credentials

    Example:
        .. code-block:: python

            from langchain_community.embeddings import HuggingFaceEmbeddings
            from langchain_community.vectorstores import SurrealDBStore

            embedding_function = HuggingFaceEmbeddings()
            dburl = "ws://localhost:8000/rpc"
            ns = "langchain"
            db = "docstore"
            collection = "documents"
            db_user = "root"
            db_pass = "root"

            sdb = SurrealDBStore.from_texts(
                texts=texts,
                embedding=embedding_function,
                dburl=dburl,
                ns=ns,
                db=db,
                collection=collection,
                db_user=db_user,
                db_pass=db_pass,
            )
    """

    def __init__(
        self,
        embedding_function: Embeddings,
        **kwargs: Any,
    ) -> None:
        """Store the configuration and create an unconnected SurrealDB client.

        No connection is opened here; call ``initialize()`` for that.

        Args:
            embedding_function: Embedding function to use.
            **kwargs: ``collection``, ``ns``, ``db`` and ``dburl`` are consumed
                here (with the defaults listed on the class); everything else,
                e.g. ``db_user`` / ``db_pass``, is kept for ``initialize()``.
        """
        # Imported lazily so the module can be imported without ``surrealdb``.
        from surrealdb import Surreal

        self.collection = kwargs.pop("collection", "documents")
        self.ns = kwargs.pop("ns", "langchain")
        self.db = kwargs.pop("db", "database")
        self.dburl = kwargs.pop("dburl", "ws://localhost:8000/rpc")
        self.embedding_function = embedding_function
        self.sdb = Surreal()
        self.kwargs = kwargs

    async def initialize(self) -> None:
        """
        Initialize connection to surrealdb database
        and authenticate if credentials are provided
        """
        await self.sdb.connect(self.dburl)
        # Sign in only when both credentials were supplied.
        if "db_user" in self.kwargs and "db_pass" in self.kwargs:
            user = self.kwargs.get("db_user")
            password = self.kwargs.get("db_pass")
            await self.sdb.signin({"user": user, "pass": password})

        await self.sdb.use(self.ns, self.db)

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Return the embedding function if it is an ``Embeddings`` instance."""
        return (
            self.embedding_function
            if isinstance(self.embedding_function, Embeddings)
            else None
        )

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add list of text along with embeddings to the vector store asynchronously

        Args:
            texts (Iterable[str]): collection of text to add to the database
            metadatas (Optional[List[dict]]): metadata for each text, matched
                by position. Stored under the ``metadata`` field of the record.

        Returns:
            List of ids for the newly inserted documents
        """
        # Materialize once: ``texts`` may be a one-shot iterator, and it is
        # needed both for embedding and for building the records.
        text_list = list(texts)
        embeddings = self.embedding_function.embed_documents(text_list)
        ids = []
        for idx, text in enumerate(text_list):
            data = {"text": text, "embedding": embeddings[idx]}
            if metadatas is not None and idx < len(metadatas):
                data["metadata"] = metadatas[idx]
            record = await self.sdb.create(self.collection, data)
            ids.append(record[0]["id"])
        return ids

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add list of text along with embeddings to the vector store

        Args:
            texts (Iterable[str]): collection of text to add to the database
            metadatas (Optional[List[dict]]): metadata for each text, matched
                by position.

        Returns:
            List of ids for the newly inserted documents
        """

        async def _add_texts() -> List[str]:
            # Connect inside this event loop, like the other sync wrappers do.
            await self.initialize()
            return await self.aadd_texts(texts, metadatas, **kwargs)

        return asyncio.run(_add_texts())

    async def adelete(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Optional[bool]:
        """Delete by document ID asynchronously.

        Args:
            ids: List of ids to delete. ``None`` deletes the whole collection;
                a single id string is also accepted.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            Optional[bool]: True if deletion is successful,
            False otherwise (e.g. an empty list of ids).
        """
        if ids is None:
            await self.sdb.delete(self.collection)
            return True
        if isinstance(ids, str):
            await self.sdb.delete(ids)
            return True
        if isinstance(ids, list) and len(ids) > 0:
            for doc_id in ids:
                await self.sdb.delete(doc_id)
            return True
        return False

    def delete(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Optional[bool]:
        """Delete by document ID.

        Args:
            ids: List of ids to delete.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            Optional[bool]: True if deletion is successful,
            False otherwise.
        """

        async def _delete(ids: Optional[List[str]], **kwargs: Any) -> Optional[bool]:
            await self.initialize()
            return await self.adelete(ids=ids, **kwargs)

        return asyncio.run(_delete(ids, **kwargs))

    async def _asimilarity_search_by_vector_with_score(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search for query embedding asynchronously
        and return documents and scores

        Args:
            embedding (List[float]): Query embedding.
            k (int): Number of results to return. Defaults to 4.
            score_threshold (float, keyword): minimum cosine similarity a
                record must reach to be returned. Defaults to 0.

        Returns:
            List of Documents most similar along with their cosine similarity,
            ordered from most to least similar
        """
        # Values are passed as bound query variables rather than formatted
        # into the statement, so caller-supplied input cannot alter the query.
        args = {
            "embedding": embedding,
            "k": k,
            "score_threshold": kwargs.get("score_threshold", 0),
        }
        # The collection name comes from the store's own configuration and is
        # interpolated as the table name.
        query = """select id, text, metadata,
        vector::similarity::cosine(embedding, $embedding) as similarity
        from {collection}
        where vector::similarity::cosine(embedding, $embedding) >= $score_threshold
        order by similarity desc LIMIT $k
        """.format(collection=self.collection)

        results = await self.sdb.query(query, args)

        if not results:
            return []

        documents = []
        for result in results[0]["result"]:
            # Records written without metadata have no usable ``metadata``
            # field; the record id always wins over a stored "id" key.
            stored = result.get("metadata")
            metadata = dict(stored) if isinstance(stored, dict) else {}
            metadata["id"] = result["id"]
            documents.append(
                (
                    Document(page_content=result["text"], metadata=metadata),
                    result["similarity"],
                )
            )
        return documents

    async def asimilarity_search_with_relevance_scores(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search asynchronously and return relevance scores

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar along with relevance scores
        """
        query_embedding = self.embedding_function.embed_query(query)
        return await self._asimilarity_search_by_vector_with_score(
            query_embedding, k, **kwargs
        )

    def similarity_search_with_relevance_scores(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search synchronously and return relevance scores

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar along with relevance scores
        """

        async def _similarity_search_with_relevance_scores() -> (
            List[Tuple[Document, float]]
        ):
            await self.initialize()
            return await self.asimilarity_search_with_relevance_scores(
                query, k, **kwargs
            )

        return asyncio.run(_similarity_search_with_relevance_scores())

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search asynchronously and return similarity scores

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar along with their cosine similarity
            (a higher score means a closer match)
        """
        query_embedding = self.embedding_function.embed_query(query)
        return await self._asimilarity_search_by_vector_with_score(
            query_embedding, k, **kwargs
        )

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search synchronously and return similarity scores

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar along with their cosine similarity
            (a higher score means a closer match)
        """

        async def _similarity_search_with_score() -> List[Tuple[Document, float]]:
            await self.initialize()
            return await self.asimilarity_search_with_score(query, k, **kwargs)

        return asyncio.run(_similarity_search_with_score())

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Run similarity search on query embedding asynchronously

        Args:
            embedding (List[float]): Query embedding
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query
        """
        scored = await self._asimilarity_search_by_vector_with_score(
            embedding, k, **kwargs
        )
        return [document for document, _ in scored]

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Run similarity search on query embedding

        Args:
            embedding (List[float]): Query embedding
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query
        """

        async def _similarity_search_by_vector() -> List[Document]:
            await self.initialize()
            return await self.asimilarity_search_by_vector(embedding, k, **kwargs)

        return asyncio.run(_similarity_search_by_vector())

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Run similarity search on query asynchronously

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query
        """
        query_embedding = self.embedding_function.embed_query(query)
        return await self.asimilarity_search_by_vector(query_embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Run similarity search on query

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query
        """

        async def _similarity_search() -> List[Document]:
            await self.initialize()
            return await self.asimilarity_search(query, k, **kwargs)

        return asyncio.run(_similarity_search())

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "SurrealDBStore":
        """Create SurrealDBStore from list of text asynchronously

        Args:
            texts (List[str]): list of text to vectorize and store
            embedding (Optional[Embeddings]): Embedding function.
            metadatas (Optional[List[dict]]): metadata for each text, matched
                by position.
            dburl (str): SurrealDB connection url
                (default: "ws://localhost:8000/rpc")
            ns (str): surrealdb namespace for the vector store.
                (default: "langchain")
            db (str): surrealdb database for the vector store.
                (default: "database")
            collection (str): surrealdb collection for the vector store.
                (default: "documents")

            (optional) db_user and db_pass: surrealdb credentials

        Returns:
            SurrealDBStore object initialized and ready for use."""

        sdb = cls(embedding, **kwargs)
        await sdb.initialize()
        await sdb.aadd_texts(texts, metadatas)
        return sdb

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "SurrealDBStore":
        """Create SurrealDBStore from list of text

        Args:
            texts (List[str]): list of text to vectorize and store
            embedding (Optional[Embeddings]): Embedding function.
            metadatas (Optional[List[dict]]): metadata for each text, matched
                by position.
            dburl (str): SurrealDB connection url
                (default: "ws://localhost:8000/rpc")
            ns (str): surrealdb namespace for the vector store.
                (default: "langchain")
            db (str): surrealdb database for the vector store.
                (default: "database")
            collection (str): surrealdb collection for the vector store.
                (default: "documents")

            (optional) db_user and db_pass: surrealdb credentials

        Returns:
            SurrealDBStore object initialized and ready for use."""
        sdb = asyncio.run(cls.afrom_texts(texts, embedding, metadatas, **kwargs))
        return sdb