2023-12-15 21:34:51 +00:00
|
|
|
import asyncio
|
|
|
|
from typing import (
|
|
|
|
Any,
|
2024-05-22 22:53:55 +00:00
|
|
|
Dict,
|
2023-12-15 21:34:51 +00:00
|
|
|
Iterable,
|
|
|
|
List,
|
|
|
|
Optional,
|
|
|
|
Tuple,
|
|
|
|
)
|
|
|
|
|
2024-05-22 22:53:55 +00:00
|
|
|
import numpy as np
|
2023-12-15 21:34:51 +00:00
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_core.embeddings import Embeddings
|
|
|
|
from langchain_core.vectorstores import VectorStore
|
|
|
|
|
2024-05-22 22:53:55 +00:00
|
|
|
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
|
|
|
|
|
|
|
DEFAULT_K = 4  # Default number of Documents returned by the search methods.
|
|
|
|
|
2023-12-15 21:34:51 +00:00
|
|
|
|
|
|
|
class SurrealDBStore(VectorStore):
|
|
|
|
"""
|
|
|
|
SurrealDB as Vector Store.
|
|
|
|
|
|
|
|
To use, you should have the ``surrealdb`` python package installed.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
embedding_function: Embedding function to use.
|
|
|
|
dburl: SurrealDB connection url
|
|
|
|
ns: surrealdb namespace for the vector store. (default: "langchain")
|
|
|
|
db: surrealdb database for the vector store. (default: "database")
|
|
|
|
collection: surrealdb collection for the vector store.
|
|
|
|
(default: "documents")
|
|
|
|
|
|
|
|
(optional) db_user and db_pass: surrealdb credentials
|
|
|
|
|
|
|
|
Example:
|
|
|
|
.. code-block:: python
|
|
|
|
|
2024-01-02 21:47:11 +00:00
|
|
|
from langchain_community.vectorstores.surrealdb import SurrealDBStore
|
2024-01-02 20:32:16 +00:00
|
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
2023-12-15 21:34:51 +00:00
|
|
|
|
|
|
|
embedding_function = HuggingFaceEmbeddings()
|
|
|
|
dburl = "ws://localhost:8000/rpc"
|
|
|
|
ns = "langchain"
|
|
|
|
db = "docstore"
|
|
|
|
collection = "documents"
|
|
|
|
db_user = "root"
|
|
|
|
db_pass = "root"
|
|
|
|
|
|
|
|
sdb = SurrealDBStore.from_texts(
    texts=texts,
    embedding=embedding_function,
    dburl=dburl,
    ns=ns,
    db=db,
    collection=collection,
    db_user=db_user,
    db_pass=db_pass,
)
|
|
|
|
"""
|
|
|
|
|
|
|
|
def __init__(
    self,
    embedding_function: Embeddings,
    **kwargs: Any,
) -> None:
    """Store the configuration and construct the SurrealDB client.

    The client is only constructed here; ``initialize`` is what calls
    ``connect()`` on it.

    Args:
        embedding_function: Embedding function to use.
        **kwargs: Optional settings. ``dburl`` (default
            "ws://localhost:8000/rpc"), ``ns`` (default "langchain"),
            ``db`` (default "database") and ``collection`` (default
            "documents") are consumed here; whatever is left (for example
            ``db_user`` / ``db_pass``) is kept on ``self.kwargs``.

    Raises:
        ImportError: If the ``surrealdb`` package cannot be imported.
        ValueError: If ``dburl`` does not start with "ws".
    """
    try:
        from surrealdb import Surreal
    except ImportError as exc:
        raise ImportError(
            """Cannot import from surrealdb.
                please install with `pip install surrealdb`."""
        ) from exc

    url = kwargs.pop("dburl", "ws://localhost:8000/rpc")
    self.dburl = url

    # Guard clause: anything that is not a websocket URL is rejected.
    if url[0:2] != "ws":
        raise ValueError("Only websocket connections are supported at this time.")
    self.sdb = Surreal(url)

    self.ns = kwargs.pop("ns", "langchain")
    self.db = kwargs.pop("db", "database")
    self.collection = kwargs.pop("collection", "documents")
    self.embedding_function = embedding_function
    # Remaining options (e.g. credentials) are read later by ``initialize``.
    self.kwargs = kwargs
|
|
|
|
|
|
|
|
async def initialize(self) -> None:
    """Connect to SurrealDB and select the configured namespace/database.

    If both ``db_user`` and ``db_pass`` are present in the keyword arguments
    kept from construction, signs in with them before selecting the
    namespace and database.
    """
    await self.sdb.connect()

    options = self.kwargs
    has_credentials = all(name in options for name in ("db_user", "db_pass"))
    if has_credentials:
        credentials = {
            "user": options.get("db_user"),
            "pass": options.get("db_pass"),
        }
        await self.sdb.signin(credentials)

    await self.sdb.use(self.ns, self.db)
|
|
|
|
|
|
|
|
@property
def embeddings(self) -> Optional[Embeddings]:
    """Return the embedding function if it is an ``Embeddings`` instance, else ``None``."""
    candidate = self.embedding_function
    if isinstance(candidate, Embeddings):
        return candidate
    return None
|
|
|
|
|
|
|
|
async def aadd_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Add list of text along with embeddings to the vector store asynchronously

    Args:
        texts (Iterable[str]): collection of text to add to the database
        metadatas (Optional[List[dict]]): metadata matched to ``texts`` by
            position. Texts without a matching entry are stored with an
            empty list as metadata.

    Returns:
        List of ids for the newly inserted documents
    """
    # ``texts`` may be a one-shot iterator (e.g. a generator). Materialize it
    # once so that embedding and insertion both see every item; iterating the
    # original iterable a second time would yield nothing.
    text_list = list(texts)
    embeddings = self.embedding_function.embed_documents(text_list)

    ids = []
    for idx, text in enumerate(text_list):
        data = {"text": text, "embedding": embeddings[idx]}
        if metadatas is not None and idx < len(metadatas):
            data["metadata"] = metadatas[idx]  # type: ignore[assignment]
        else:
            data["metadata"] = []
        record = await self.sdb.create(
            self.collection,
            data,
        )
        # ``create`` is indexed as a list of records; keep the first one's id.
        ids.append(record[0]["id"])
    return ids
|
|
|
|
|
|
|
|
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> List[str]:
    """Add list of text along with embeddings to the vector store

    Synchronous wrapper: connects with ``initialize`` and then awaits
    ``aadd_texts`` inside ``asyncio.run``.

    Args:
        texts (Iterable[str]): collection of text to add to the database
        metadatas (Optional[List[dict]]): metadata forwarded to
            ``aadd_texts``.

    Returns:
        List of ids for the newly inserted documents
    """

    async def _connect_and_add() -> List[str]:
        await self.initialize()
        inserted = await self.aadd_texts(texts, metadatas, **kwargs)
        return inserted

    return asyncio.run(_connect_and_add())
|
2023-12-15 21:34:51 +00:00
|
|
|
|
|
|
|
async def adelete(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> Optional[bool]:
    """Delete by document ID asynchronously.

    Args:
        ids: List of ids to delete. A single id string is also accepted.
            When ``None``, the collection name itself is passed to the
            client's ``delete``, i.e. the whole collection is targeted.
        **kwargs: Other keyword arguments that subclasses might use.

    Returns:
        Optional[bool]: True if a delete was issued, False when ``ids`` is
        an empty list or neither ``None``, a string nor a list.
    """
    if ids is None:
        await self.sdb.delete(self.collection)
        return True

    if isinstance(ids, str):
        await self.sdb.delete(ids)
        return True

    if isinstance(ids, list) and ids:
        # Plain loop: the deletes are awaited one by one for their side
        # effect only, so there is nothing worth collecting into a list.
        for record_id in ids:
            await self.sdb.delete(record_id)
        return True

    return False
|
|
|
|
|
|
|
|
def delete(
    self,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> Optional[bool]:
    """Delete by document ID.

    Synchronous wrapper: connects with ``initialize`` and then awaits
    ``adelete`` inside ``asyncio.run``.

    Args:
        ids: List of ids to delete.
        **kwargs: Other keyword arguments that subclasses might use.

    Returns:
        Optional[bool]: True if deletion is successful,
        False otherwise.
    """

    async def _connect_and_delete() -> Optional[bool]:
        await self.initialize()
        outcome = await self.adelete(ids=ids, **kwargs)
        return outcome

    return asyncio.run(_connect_and_delete())
|
|
|
|
|
|
|
|
async def _asimilarity_search_by_vector_with_score(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float, Any]]:
    """Run similarity search for query embedding asynchronously
    and return documents and scores

    Args:
        embedding (List[float]): Query embedding.
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Each entry
            adds an ``and metadata.<key> = <value>`` condition.
            Defaults to None.
        **kwargs: ``score_threshold`` (default 0) sets the minimum cosine
            similarity a record must reach to be returned.

    Returns:
        List of ``(Document, similarity, embedding)`` tuples ordered by
        descending cosine similarity, where ``embedding`` is the vector
        stored with the record.

    Raises:
        SurrealException: If the query result status is not "OK".
    """
    args = {
        "collection": self.collection,
        "embedding": embedding,
        "k": k,
        "score_threshold": kwargs.get("score_threshold", 0),
    }

    # Build additional filter criteria. Values are bound as query variables
    # instead of being spliced into the statement, so quotes or SurrealQL
    # fragments inside a value cannot break or alter the query.
    # NOTE: keys are still interpolated verbatim as field paths and must
    # therefore come from trusted code, not from end users.
    conditions = []
    for position, (key, value) in enumerate((filter or {}).items()):
        variable = f"filter_{position}"
        if isinstance(value, (str, bool)):
            # str and bool values have always been compared as quoted
            # strings (True -> 'True'); keep that for compatibility.
            # NOTE(review): confirm whether bools should compare natively.
            args[variable] = str(value)
        else:
            args[variable] = value
        conditions.append(f"and metadata.{key} = ${variable} ")
    custom_filter = "".join(conditions)

    query = f"""
    select
        id,
        text,
        metadata,
        embedding,
        vector::similarity::cosine(embedding, $embedding) as similarity
    from ⟨{self.collection}⟩
    where vector::similarity::cosine(embedding, $embedding) >= $score_threshold
      {custom_filter}
    order by similarity desc LIMIT $k;
    """
    results = await self.sdb.query(query, args)

    if not results:
        return []

    # Only one statement was sent, so only the first result set is relevant.
    result = results[0]

    if result["status"] != "OK":
        from surrealdb.ws import SurrealException

        err = result.get("result", "Unknown Error")
        raise SurrealException(err)

    return [
        (
            Document(
                page_content=doc["text"],
                # The record id is exposed alongside any stored metadata;
                # a missing or empty metadata value is treated as no metadata.
                metadata={"id": doc["id"], **(doc.get("metadata") or {})},
            ),
            doc["similarity"],
            doc["embedding"],
        )
        for doc in result["result"]
    ]
|
|
|
|
|
|
|
|
async def asimilarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = DEFAULT_K,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Run similarity search asynchronously and return relevance scores

    Args:
        query (str): Query
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents most similar along with relevance scores
    """
    query_vector = self.embedding_function.embed_query(query)
    hits = await self._asimilarity_search_by_vector_with_score(
        query_vector, k, filter=filter, **kwargs
    )
    # Drop the stored embedding that comes back as the third tuple element.
    return [(doc, score) for doc, score, _stored in hits]
|
|
|
|
|
|
|
|
def similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = DEFAULT_K,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Run similarity search synchronously and return relevance scores

    Connects with ``initialize`` and then awaits the async counterpart
    inside ``asyncio.run``.

    Args:
        query (str): Query
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents most similar along with relevance scores
    """

    async def _connect_and_search() -> List[Tuple[Document, float]]:
        await self.initialize()
        scored = await self.asimilarity_search_with_relevance_scores(
            query, k, filter=filter, **kwargs
        )
        return scored

    return asyncio.run(_connect_and_search())
|
|
|
|
|
|
|
|
async def asimilarity_search_with_score(
    self,
    query: str,
    k: int = DEFAULT_K,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Run similarity search asynchronously and return scores

    The score paired with each document is the ``similarity`` value produced
    by ``_asimilarity_search_by_vector_with_score`` (cosine similarity, so
    higher means more similar).

    Args:
        query (str): Query
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents most similar along with their similarity scores
    """
    query_vector = self.embedding_function.embed_query(query)
    hits = await self._asimilarity_search_by_vector_with_score(
        query_vector, k, filter=filter, **kwargs
    )
    # Drop the stored embedding that comes back as the third tuple element.
    return [(doc, score) for doc, score, _stored in hits]
|
|
|
|
|
|
|
|
def similarity_search_with_score(
    self,
    query: str,
    k: int = DEFAULT_K,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Run similarity search synchronously and return scores

    Connects with ``initialize`` and then awaits
    ``asimilarity_search_with_score`` inside ``asyncio.run``.

    Args:
        query (str): Query
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents most similar along with their scores
    """

    async def _connect_and_search() -> List[Tuple[Document, float]]:
        await self.initialize()
        scored = await self.asimilarity_search_with_score(
            query, k, filter=filter, **kwargs
        )
        return scored

    return asyncio.run(_connect_and_search())
|
|
|
|
|
|
|
|
async def asimilarity_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Run similarity search on query embedding asynchronously

    Args:
        embedding (List[float]): Query embedding
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents most similar to the query
    """
    hits = await self._asimilarity_search_by_vector_with_score(
        embedding, k, filter=filter, **kwargs
    )
    # Each hit is a 3-tuple; only the Document is wanted here.
    return [doc for doc, _score, _stored in hits]
|
|
|
|
|
|
|
|
def similarity_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Run similarity search on query embedding

    Connects with ``initialize`` and then awaits
    ``asimilarity_search_by_vector`` inside ``asyncio.run``.

    Args:
        embedding (List[float]): Query embedding
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents most similar to the query
    """

    async def _connect_and_search() -> List[Document]:
        await self.initialize()
        found = await self.asimilarity_search_by_vector(
            embedding, k, filter=filter, **kwargs
        )
        return found

    return asyncio.run(_connect_and_search())
|
|
|
|
|
|
|
|
async def asimilarity_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Run similarity search on query asynchronously

    Embeds the query text and delegates to ``asimilarity_search_by_vector``.

    Args:
        query (str): Query
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents most similar to the query
    """
    query_vector = self.embedding_function.embed_query(query)
    found = await self.asimilarity_search_by_vector(
        query_vector, k, filter=filter, **kwargs
    )
    return found
|
2023-12-15 21:34:51 +00:00
|
|
|
|
|
|
|
def similarity_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Run similarity search on query

    Connects with ``initialize`` and then awaits ``asimilarity_search``
    inside ``asyncio.run``.

    Args:
        query (str): Query
        k (int): Number of results to return. Defaults to 4.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents most similar to the query
    """

    async def _connect_and_search() -> List[Document]:
        await self.initialize()
        found = await self.asimilarity_search(query, k, filter=filter, **kwargs)
        return found

    return asyncio.run(_connect_and_search())
|
|
|
|
|
2024-05-22 22:53:55 +00:00
|
|
|
async def amax_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.
    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
                    of diversity among the results with 0 corresponding
                    to maximum diversity and 1 to minimum diversity.
                    Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    # Fetch the ``fetch_k`` candidates together with their stored embeddings.
    candidates = await self._asimilarity_search_by_vector_with_score(
        embedding, fetch_k, filter=filter, **kwargs
    )

    documents = []
    vectors = []
    for entry in candidates:
        documents.append(entry[0])  # first element: the Document
        vectors.append(entry[-1])  # last element: its stored embedding

    query_vector = np.array(embedding, dtype=np.float32)
    chosen = maximal_marginal_relevance(
        query_vector,
        vectors,
        k=k,
        lambda_mult=lambda_mult,
    )

    # ``chosen`` holds indices into the candidate list.
    return [documents[index] for index in chosen]
|
|
|
|
|
|
|
|
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents. Connects with ``initialize`` and then awaits
    ``amax_marginal_relevance_search_by_vector`` inside ``asyncio.run``.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
                    of diversity among the results with 0 corresponding
                    to maximum diversity and 1 to minimum diversity.
                    Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """

    async def _connect_and_search() -> List[Document]:
        await self.initialize()
        selected = await self.amax_marginal_relevance_search_by_vector(
            embedding, k, fetch_k, lambda_mult, filter=filter, **kwargs
        )
        return selected

    return asyncio.run(_connect_and_search())
|
|
|
|
|
|
|
|
async def amax_marginal_relevance_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents. The query text is embedded and the work is
    delegated to ``amax_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
                    of diversity among the results with 0 corresponding
                    to maximum diversity and 1 to minimum diversity.
                    Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    # The default for ``k`` comes from DEFAULT_K (same value, 4) so that it
    # stays in step with the other search methods.
    embedding = self.embedding_function.embed_query(query)
    return await self.amax_marginal_relevance_search_by_vector(
        embedding, k, fetch_k, lambda_mult, filter=filter, **kwargs
    )
|
|
|
|
|
|
|
|
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = DEFAULT_K,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    *,
    filter: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.
    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents. Connects with ``initialize`` and then awaits
    ``amax_marginal_relevance_search`` inside ``asyncio.run``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
                    of diversity among the results with 0 corresponding
                    to maximum diversity and 1 to minimum diversity.
                    Defaults to 0.5.
        filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """

    async def _connect_and_search() -> List[Document]:
        await self.initialize()
        selected = await self.amax_marginal_relevance_search(
            query, k, fetch_k, lambda_mult, filter=filter, **kwargs
        )
        return selected

    return asyncio.run(_connect_and_search())
|
|
|
|
|
2023-12-15 21:34:51 +00:00
|
|
|
@classmethod
async def afrom_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> "SurrealDBStore":
    """Create SurrealDBStore from list of text asynchronously

    Builds the store, connects with ``initialize`` and inserts the texts
    with ``aadd_texts``.

    Args:
        texts (List[str]): list of text to vectorize and store
        embedding (Embeddings): Embedding function.
        metadatas (Optional[List[dict]]): metadata forwarded to
            ``aadd_texts``.
        dburl (str): SurrealDB connection url
            (default: "ws://localhost:8000/rpc")
        ns (str): surrealdb namespace for the vector store.
            (default: "langchain")
        db (str): surrealdb database for the vector store.
            (default: "database")
        collection (str): surrealdb collection for the vector store.
            (default: "documents")

        (optional) db_user and db_pass: surrealdb credentials

    Returns:
        SurrealDBStore object initialized and ready for use."""
    store = cls(embedding, **kwargs)
    await store.initialize()
    # The same keyword arguments are forwarded to the insert call as well.
    await store.aadd_texts(texts, metadatas, **kwargs)
    return store
|
|
|
|
|
|
|
|
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    **kwargs: Any,
) -> "SurrealDBStore":
    """Create SurrealDBStore from list of text

    Synchronous wrapper that runs ``afrom_texts`` inside ``asyncio.run``.

    Args:
        texts (List[str]): list of text to vectorize and store
        embedding (Embeddings): Embedding function.
        metadatas (Optional[List[dict]]): metadata forwarded to
            ``afrom_texts``.
        dburl (str): SurrealDB connection url
        ns (str): surrealdb namespace for the vector store.
            (default: "langchain")
        db (str): surrealdb database for the vector store.
            (default: "database")
        collection (str): surrealdb collection for the vector store.
            (default: "documents")

        (optional) db_user and db_pass: surrealdb credentials

    Returns:
        SurrealDBStore object initialized and ready for use."""
    build = cls.afrom_texts(texts, embedding, metadatas, **kwargs)
    return asyncio.run(build)
|