community[minor]: Add SurrealDB vectorstore (#13331)

**Description:** Vectorstore implementation around
[SurrealDB](https://www.surrealdb.com)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/14680/head
Karim Lalani 7 months ago committed by GitHub
parent c5296fd42c
commit a0064330b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,288 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a3afefb0-7e99-4912-a222-c6b186da11af",
"metadata": {},
"source": [
"# SurrealDB\n",
"\n",
">[SurrealDB](https://surrealdb.com/) is an end-to-end cloud-native database designed for modern applications, including web, mobile, serverless, Jamstack, backend, and traditional applications. With SurrealDB, you can simplify your database and API infrastructure, reduce development time, and build secure, performant apps quickly and cost-effectively.\n",
">\n",
">**Key features of SurrealDB include:**\n",
">\n",
">* **Reduces development time:** SurrealDB simplifies your database and API stack by removing the need for most server-side components, allowing you to build secure, performant apps faster and cheaper.\n",
">* **Real-time collaborative API backend service:** SurrealDB functions as both a database and an API backend service, enabling real-time collaboration.\n",
">* **Support for multiple querying languages:** SurrealDB supports SQL querying from client devices, GraphQL, ACID transactions, WebSocket connections, structured and unstructured data, graph querying, full-text indexing, and geospatial querying.\n",
">* **Granular access control:** SurrealDB provides row-level permissions-based access control, giving you the ability to manage data access with precision.\n",
">\n",
">View the [features](https://surrealdb.com/features), the latest [releases](https://surrealdb.com/releases), and [documentation](https://surrealdb.com/docs).\n",
"\n",
"This notebook shows how to use functionality related to the `SurrealDBStore`."
]
},
{
"cell_type": "markdown",
"id": "5031a3ec",
"metadata": {},
"source": [
"## Setup\n",
"\n",
"Uncomment the cell below to install surrealdb."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cd7391f-7759-4a21-952a-2ec972d818c6",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# !pip install -U surrealdb langchain langchain-community"
]
},
{
"cell_type": "markdown",
"id": "6e57a389-f637-4b8f-9ab2-759ae7485f78",
"metadata": {},
"source": [
"## Using SurrealDBStore"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e49be085-ddf1-4028-8c0c-97836ce4a873",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain_community.document_loaders import TextLoader\n",
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
"from langchain_community.vectorstores import SurrealDBStore"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "38222aee-adc5-44c2-913c-97977b394cf5",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"documents = TextLoader(\"../../modules/state_of_the_union.txt\").load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = HuggingFaceEmbeddings()"
]
},
{
"cell_type": "markdown",
"id": "8e240306-803c-4c1a-b036-b9fc69eb6cba",
"metadata": {},
"source": [
"### Creating a SurrealDBStore object"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ff9d0304-1e11-4db2-9454-1350db7907e6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['documents:th7j29cjsx6495wluo7e',\n",
" 'documents:qkqhhjnl7ahbhr07euky',\n",
" 'documents:8kd6xw8o7y0l171iqry0',\n",
" 'documents:33ejf42dlkmavol9si74',\n",
" 'documents:f7y4dbs7eitqz58xt1p5']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"db = SurrealDBStore(\n",
" dburl=\"http://localhost:8000/rpc\", # url for the hosted SurrealDB database\n",
" embedding_function=embeddings,\n",
" db_user=\"root\", # SurrealDB credentials if needed: db username\n",
" db_pass=\"root\", # SurrealDB credentials if needed: db password\n",
" # ns=\"langchain\", # namespace to use for vectorstore\n",
" # db=\"database\", # database to use for vectorstore\n",
" # collection=\"documents\", #collection to use for vectorstore\n",
")\n",
"\n",
"# this is needed to initialize the underlying async library for SurrealDB\n",
"await db.initialize()\n",
"\n",
"# delete all existing documents from the vectorstore collection\n",
"await db.adelete()\n",
"\n",
"# add documents to the vectorstore\n",
"ids = await db.aadd_documents(docs)\n",
"\n",
"# document ids of the added documents\n",
"ids[:5]"
]
},
{
"cell_type": "markdown",
"id": "94a742a9-9507-4076-9cc3-616a4ed6866f",
"metadata": {},
"source": [
"### (alternatively) Create a SurrealDBStore object and add documents"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "73d66563-4e1f-4edf-9e95-5fc9adcfa2cb",
"metadata": {},
"outputs": [],
"source": [
"await db.adelete()\n",
"\n",
"db = await SurrealDBStore.afrom_documents(\n",
" dburl=\"http://localhost:8000/rpc\", # url for the hosted SurrealDB database\n",
" embedding=embeddings,\n",
" documents=docs,\n",
" db_user=\"root\", # SurrealDB credentials if needed: db username\n",
" db_pass=\"root\", # SurrealDB credentials if needed: db password\n",
" # ns=\"langchain\", # namespace to use for vectorstore\n",
" # db=\"database\", # database to use for vectorstore\n",
" # collection=\"documents\", #collection to use for vectorstore\n",
")"
]
},
{
"cell_type": "markdown",
"id": "efbb6684-3846-4332-a624-ddd4d75844c1",
"metadata": {},
"source": [
"### Similarity search"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "aa28a7f8-41d0-4299-84eb-91d1576e8a63",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = await db.asimilarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1eb16d2a-b466-456a-b412-5e74bb8523dd",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while youre at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n"
]
}
],
"source": [
"print(docs[0].page_content)"
]
},
{
"cell_type": "markdown",
"id": "43896697-f99e-47b6-9117-47a25e9afa9c",
"metadata": {},
"source": [
"### Similarity search with score"
]
},
{
"cell_type": "markdown",
"id": "414a9bc9",
"metadata": {},
"source": [
"The returned score is cosine similarity. Therefore, a higher score is better."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8e9eef05-1516-469a-ad36-880c69aef7a9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"docs = await db.asimilarity_search_with_score(query)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bd5fb0e4-2a94-4bb4-af8a-27327ecb1a7f",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while youre at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.', metadata={'id': 'documents:639m99rzwqlm9imcwg13'}),\n",
" 0.39839545290036454)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -360,6 +360,12 @@ def _import_supabase() -> Any:
return SupabaseVectorStore
def _import_surrealdb() -> Any:
    """Import ``SurrealDBStore`` on demand and return the class.

    The import happens inside the function so that the ``surrealdb`` module
    is only loaded when this function is actually called.
    """
    from langchain_community.vectorstores.surrealdb import (
        SurrealDBStore as _SurrealDBStore,
    )

    return _SurrealDBStore
def _import_tair() -> Any:
from langchain_community.vectorstores.tair import Tair
@ -551,6 +557,8 @@ def __getattr__(name: str) -> Any:
return _import_starrocks()
elif name == "SupabaseVectorStore":
return _import_supabase()
elif name == "SurrealDBStore":
return _import_surrealdb()
elif name == "Tair":
return _import_tair()
elif name == "TencentVectorDB":
@ -637,6 +645,7 @@ __all__ = [
"SQLiteVSS",
"StarRocks",
"SupabaseVectorStore",
"SurrealDBStore",
"Tair",
"TileDB",
"Tigris",

@ -0,0 +1,434 @@
import asyncio
from typing import (
Any,
Iterable,
List,
Optional,
Tuple,
)
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
class SurrealDBStore(VectorStore):
    """SurrealDB as a vector store.

    To use, you should have the ``surrealdb`` python package installed.

    Each text is stored as a record with a ``text`` field and an
    ``embedding`` field. Searches rank records by the cosine similarity
    between the stored embedding and the query embedding, so a higher score
    means a closer match.

    Args:
        embedding_function: Embedding function to use.
        dburl: SurrealDB connection url. (default: "ws://localhost:8000/rpc")
        ns: surrealdb namespace for the vector store. (default: "langchain")
        db: surrealdb database for the vector store. (default: "database")
        collection: surrealdb collection for the vector store.
            (default: "documents")
        (optional) db_user and db_pass: surrealdb credentials

    Example:
        .. code-block:: python

            from langchain_community.vectorstores.surrealdb import SurrealDBStore
            from langchain_community.embeddings import HuggingFaceEmbeddings

            embedding_function = HuggingFaceEmbeddings()
            dburl = "ws://localhost:8000/rpc"
            ns = "langchain"
            db = "docstore"
            collection = "documents"
            db_user = "root"
            db_pass = "root"

            sdb = SurrealDBStore.from_texts(
                texts=texts,
                embedding=embedding_function,
                dburl=dburl,
                ns=ns,
                db=db,
                collection=collection,
                db_user=db_user,
                db_pass=db_pass,
            )
    """

    def __init__(
        self,
        embedding_function: Embeddings,
        **kwargs: Any,
    ) -> None:
        """Store the connection settings and create the SurrealDB client.

        No connection is opened here; call ``initialize`` before using the
        async API directly.

        Args:
            embedding_function: Embedding function to use.
            **kwargs: Optional ``collection``, ``ns``, ``db`` and ``dburl``
                settings. Anything left over (for example ``db_user`` and
                ``db_pass``) is kept and consulted by ``initialize``.
        """
        # Imported lazily so the module can be imported without ``surrealdb``.
        from surrealdb import Surreal

        self.collection = kwargs.pop("collection", "documents")
        self.ns = kwargs.pop("ns", "langchain")
        self.db = kwargs.pop("db", "database")
        self.dburl = kwargs.pop("dburl", "ws://localhost:8000/rpc")
        self.embedding_function = embedding_function
        self.sdb = Surreal()
        self.kwargs = kwargs

    async def initialize(self) -> None:
        """
        Initialize connection to surrealdb database
        and authenticate if credentials are provided
        """
        await self.sdb.connect(self.dburl)
        # Sign in only when both credentials were supplied.
        if "db_user" in self.kwargs and "db_pass" in self.kwargs:
            user = self.kwargs.get("db_user")
            password = self.kwargs.get("db_pass")
            await self.sdb.signin({"user": user, "pass": password})

        await self.sdb.use(self.ns, self.db)

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Return the embedding function if it is an ``Embeddings`` instance,
        otherwise ``None``."""
        return (
            self.embedding_function
            if isinstance(self.embedding_function, Embeddings)
            else None
        )

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add list of text along with embeddings to the vector store asynchronously

        Args:
            texts (Iterable[str]): collection of text to add to the database
            metadatas (Optional[List[dict]]): accepted for interface
                compatibility; metadata is not written to the database.

        Returns:
            List of ids for the newly inserted documents
        """
        # Materialize once: ``texts`` may be a one-shot iterator, and it is
        # needed both for embedding and for building the records.
        text_list = list(texts)
        embeddings = self.embedding_function.embed_documents(text_list)
        ids = []
        for text, embedding in zip(text_list, embeddings):
            record = await self.sdb.create(
                self.collection, {"text": text, "embedding": embedding}
            )
            ids.append(record[0]["id"])
        return ids

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add list of text along with embeddings to the vector store

        Args:
            texts (Iterable[str]): collection of text to add to the database
            metadatas (Optional[List[dict]]): accepted for interface
                compatibility; metadata is not written to the database.

        Returns:
            List of ids for the newly inserted documents
        """

        async def _add_texts(
            texts: Iterable[str],
            metadatas: Optional[List[dict]] = None,
            **kwargs: Any,
        ) -> List[str]:
            # Connect inside the event loop that runs the operation, matching
            # the other synchronous wrappers of this class.
            await self.initialize()
            return await self.aadd_texts(texts, metadatas, **kwargs)

        return asyncio.run(_add_texts(texts, metadatas, **kwargs))

    async def adelete(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Optional[bool]:
        """Delete by document ID asynchronously.

        Args:
            ids: List of ids to delete. ``None`` deletes every record in the
                collection; a single id string is also accepted.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            Optional[bool]: True if a deletion was issued, False when ``ids``
            is an empty list or an unsupported type.
        """
        if ids is None:
            # No ids given: clear the whole collection.
            await self.sdb.delete(self.collection)
            return True

        if isinstance(ids, str):
            await self.sdb.delete(ids)
            return True

        if isinstance(ids, list) and len(ids) > 0:
            for record_id in ids:
                await self.sdb.delete(record_id)
            return True

        return False

    def delete(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Optional[bool]:
        """Delete by document ID.

        Args:
            ids: List of ids to delete. ``None`` deletes every record in the
                collection; a single id string is also accepted.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            Optional[bool]: True if a deletion was issued, False when ``ids``
            is an empty list or an unsupported type.
        """

        async def _delete(ids: Optional[List[str]], **kwargs: Any) -> Optional[bool]:
            await self.initialize()
            return await self.adelete(ids=ids, **kwargs)

        return asyncio.run(_delete(ids, **kwargs))

    async def _asimilarity_search_by_vector_with_score(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search for query embedding asynchronously
        and return documents and scores

        Args:
            embedding (List[float]): Query embedding.
            k (int): Number of results to return. Defaults to 4.
            score_threshold (optional keyword): minimum cosine similarity a
                record must reach to be returned. Defaults to 0.

        Returns:
            List of (Document, cosine similarity) pairs, most similar first.
            Each Document carries the record id in ``metadata["id"]``.
        """
        args = {
            "collection": self.collection,
            "embedding": embedding,
            "k": k,
            "score_threshold": kwargs.get("score_threshold", 0),
        }
        # NOTE(review): the statement is assembled with str.format, so
        # ``collection`` and ``score_threshold`` end up verbatim in the query
        # text. Do not feed untrusted values into them.
        query = """select id, text,
        vector::similarity::cosine(embedding,{embedding}) as similarity
        from {collection}
        where vector::similarity::cosine(embedding,{embedding}) >= {score_threshold}
        order by similarity desc LIMIT {k}
        """.format(**args)

        results = await self.sdb.query(query)

        if not results:
            return []

        # Only the first statement result is read; the query above contains
        # a single statement.
        return [
            (
                Document(page_content=result["text"], metadata={"id": result["id"]}),
                result["similarity"],
            )
            for result in results[0]["result"]
        ]

    async def asimilarity_search_with_relevance_scores(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search asynchronously and return relevance scores

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar along with relevance scores
            (cosine similarity, higher is more similar)
        """
        query_embedding = self.embedding_function.embed_query(query)
        return await self._asimilarity_search_by_vector_with_score(
            query_embedding, k, **kwargs
        )

    def similarity_search_with_relevance_scores(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search synchronously and return relevance scores

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar along with relevance scores
            (cosine similarity, higher is more similar)
        """

        async def _similarity_search_with_relevance_scores() -> (
            List[Tuple[Document, float]]
        ):
            await self.initialize()
            return await self.asimilarity_search_with_relevance_scores(
                query, k, **kwargs
            )

        return asyncio.run(_similarity_search_with_relevance_scores())

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search asynchronously and return similarity scores

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar along with their cosine similarity
            scores (higher is more similar)
        """
        query_embedding = self.embedding_function.embed_query(query)
        return await self._asimilarity_search_by_vector_with_score(
            query_embedding, k, **kwargs
        )

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Run similarity search synchronously and return similarity scores

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar along with their cosine similarity
            scores (higher is more similar)
        """

        async def _similarity_search_with_score() -> List[Tuple[Document, float]]:
            await self.initialize()
            return await self.asimilarity_search_with_score(query, k, **kwargs)

        return asyncio.run(_similarity_search_with_score())

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Run similarity search on query embedding asynchronously

        Args:
            embedding (List[float]): Query embedding
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query
        """
        scored = await self._asimilarity_search_by_vector_with_score(
            embedding, k, **kwargs
        )
        return [document for document, _ in scored]

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Run similarity search on query embedding

        Args:
            embedding (List[float]): Query embedding
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query
        """

        async def _similarity_search_by_vector() -> List[Document]:
            await self.initialize()
            return await self.asimilarity_search_by_vector(embedding, k, **kwargs)

        return asyncio.run(_similarity_search_by_vector())

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Run similarity search on query asynchronously

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query
        """
        query_embedding = self.embedding_function.embed_query(query)
        return await self.asimilarity_search_by_vector(query_embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Run similarity search on query

        Args:
            query (str): Query
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query
        """

        async def _similarity_search() -> List[Document]:
            await self.initialize()
            return await self.asimilarity_search(query, k, **kwargs)

        return asyncio.run(_similarity_search())

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "SurrealDBStore":
        """Create SurrealDBStore from list of text asynchronously

        Args:
            texts (List[str]): list of text to vectorize and store
            embedding (Optional[Embeddings]): Embedding function.
            metadatas (Optional[List[dict]]): forwarded to ``aadd_texts``,
                which does not write metadata to the database.
            dburl (str): SurrealDB connection url
                (default: "ws://localhost:8000/rpc")
            ns (str): surrealdb namespace for the vector store.
                (default: "langchain")
            db (str): surrealdb database for the vector store.
                (default: "database")
            collection (str): surrealdb collection for the vector store.
                (default: "documents")

            (optional) db_user and db_pass: surrealdb credentials

        Returns:
            SurrealDBStore object initialized and ready for use."""

        sdb = cls(embedding, **kwargs)
        await sdb.initialize()
        await sdb.aadd_texts(texts, metadatas)
        return sdb

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "SurrealDBStore":
        """Create SurrealDBStore from list of text

        Args:
            texts (List[str]): list of text to vectorize and store
            embedding (Optional[Embeddings]): Embedding function.
            metadatas (Optional[List[dict]]): forwarded to ``afrom_texts``;
                metadata is not written to the database.
            dburl (str): SurrealDB connection url
                (default: "ws://localhost:8000/rpc")
            ns (str): surrealdb namespace for the vector store.
                (default: "langchain")
            db (str): surrealdb database for the vector store.
                (default: "database")
            collection (str): surrealdb collection for the vector store.
                (default: "documents")

            (optional) db_user and db_pass: surrealdb credentials

        Returns:
            SurrealDBStore object initialized and ready for use."""
        sdb = asyncio.run(cls.afrom_texts(texts, embedding, metadatas, **kwargs))
        return sdb

@ -53,6 +53,7 @@ _EXPECTED = [
"SQLiteVSS",
"StarRocks",
"SupabaseVectorStore",
"SurrealDBStore",
"Tair",
"TileDB",
"Tigris",

Loading…
Cancel
Save