diff --git a/docs/modules/indexes/vectorstores/examples/singlestoredb.ipynb b/docs/modules/indexes/vectorstores/examples/singlestoredb.ipynb new file mode 100644 index 00000000..90a594f3 --- /dev/null +++ b/docs/modules/indexes/vectorstores/examples/singlestoredb.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2b9582dc", + "metadata": {}, + "source": [ + "# SingleStoreDB vector search\n", + "[SingleStoreDB](https://singlestore.com) is a high-performance distributed database that can be deployed both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. It has long supported vector functions such as [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html), which makes it well suited to AI applications that require text similarity matching.\n", + "This tutorial shows how to use the SingleStoreDB vector store." ] }, { "cell_type": "code", "execution_count": null, "id": "e4a61a4d", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Connections to the database are established through the singlestoredb Python connector.\n", "# Please ensure that this connector is installed in your working environment.\n", "!pip install singlestoredb" ] }, { "cell_type": "code", "execution_count": null, "id": "39a0132a", "metadata": {}, "outputs": [], "source": [ "import os\n", "import getpass\n", "\n", "# We want to use OpenAIEmbeddings, so we have to get the OpenAI API key.\n", "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "6104fde8", "metadata": {}, "outputs": [], "source": [ "from langchain.embeddings.openai import OpenAIEmbeddings\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.vectorstores import SingleStoreDB\n", "from langchain.document_loaders import TextLoader" ] }, { "cell_type": "code", "execution_count": null, "id": "7b45113c", "metadata": {}, "outputs": [], "source": [ "# Load text samples\n", "from langchain.document_loaders import TextLoader\n", "loader = TextLoader('../../../state_of_the_union.txt')\n", "documents = loader.load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "docs = text_splitter.split_documents(documents)\n", "\n", "embeddings = OpenAIEmbeddings()" ] }, { "cell_type": "markdown", "id": "535b2687", "metadata": {}, "source": [ "There are several ways to establish a [connection](https://singlestoredb-python.labs.singlestore.com/generated/singlestoredb.connect.html) to the database. You can either set up environment variables or pass named parameters to the `SingleStoreDB` constructor. Alternatively, you may provide these parameters to the `from_documents` and `from_texts` methods."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0b316bf", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up the connection URL as an environment variable\n", + "os.environ['SINGLESTOREDB_URL'] = 'root:pass@localhost:3306/db'\n", + "\n", + "# Load documents into the store\n", + "docsearch = SingleStoreDB.from_documents(\n", + " docs,\n", + " embeddings,\n", + " table_name=\"notebook\", # use a table with a custom name\n", + ")" ] }, { "cell_type": "code", "execution_count": null, "id": "0eaa4297", "metadata": {}, "outputs": [], "source": [ "query = \"What did the president say about Ketanji Brown Jackson\"\n", "docs = docsearch.similarity_search(query) # Find documents that correspond to the query\n", "print(docs[0].page_content)" ] }, { "cell_type": "code", "execution_count": null, "id": "86efff90", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 +} diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index 7ba8f73c..c2c37933 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -18,6 +18,7 @@ from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSear from langchain.vectorstores.pinecone import Pinecone from langchain.vectorstores.qdrant import Qdrant from langchain.vectorstores.redis import Redis +from langchain.vectorstores.singlestoredb import SingleStoreDB from langchain.vectorstores.sklearn import SKLearnVectorStore from langchain.vectorstores.supabase import SupabaseVectorStore from langchain.vectorstores.tair import Tair @@ -37,6 +38,7 @@ __all__ = [ "Qdrant", "Milvus", "Zilliz", + "SingleStoreDB", "Chroma", "OpenSearchVectorSearch", "AtlasDB", diff --git a/langchain/vectorstores/base.py b/langchain/vectorstores/base.py index a789d358..7a708596 100644 --- a/langchain/vectorstores/base.py +++ b/langchain/vectorstores/base.py @@ -5,7 +5,18 @@ import asyncio import warnings from abc import ABC, abstractmethod from functools import partial -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar +from typing import ( + Any, + ClassVar, + Collection, + Dict, + Iterable, + List, + Optional, + Tuple, + Type, + TypeVar, +) from pydantic import BaseModel, Field, root_validator @@ -347,6 +358,11 @@ class VectorStoreRetriever(BaseRetriever, BaseModel): vectorstore: VectorStore search_type: str = "similarity" search_kwargs: dict = Field(default_factory=dict) + allowed_search_types: ClassVar[Collection[str]] = ( + "similarity", + "similarity_score_threshold", + "mmr", + ) class Config: """Configuration for this pydantic object.""" @@ -356,19 +372,19 @@ class VectorStoreRetriever(BaseRetriever, BaseModel): @root_validator() def validate_search_type(cls, values: Dict) -> Dict: """Validate search type.""" - if "search_type" in values: - search_type = values["search_type"] - if search_type not in ("similarity", "similarity_score_threshold", "mmr"): - raise ValueError(f"search_type of {search_type} not allowed.") - if search_type == "similarity_score_threshold": - score_threshold = 
values["search_kwargs"].get("score_threshold") - if (score_threshold is None) or ( - not isinstance(score_threshold, float) - ): - raise ValueError( - "`score_threshold` is not specified with a float value(0~1) " - "in `search_kwargs`." - ) + search_type = values["search_type"] + if search_type not in cls.allowed_search_types: + raise ValueError( + f"search_type of {search_type} not allowed. Valid values are: " + f"{cls.allowed_search_types}" + ) + if search_type == "similarity_score_threshold": + score_threshold = values["search_kwargs"].get("score_threshold") + if (score_threshold is None) or (not isinstance(score_threshold, float)): + raise ValueError( + "`score_threshold` is not specified with a float value(0~1) " + "in `search_kwargs`." + ) return values def get_relevant_documents(self, query: str) -> List[Document]: diff --git a/langchain/vectorstores/singlestoredb.py b/langchain/vectorstores/singlestoredb.py new file mode 100644 index 00000000..b5a6ad77 --- /dev/null +++ b/langchain/vectorstores/singlestoredb.py @@ -0,0 +1,372 @@ +"""Wrapper around SingleStore DB.""" +from __future__ import annotations + +import json +from typing import ( + Any, + ClassVar, + Collection, + Iterable, + List, + Optional, + Tuple, + Type, +) + +from sqlalchemy.pool import QueuePool + +from langchain.docstore.document import Document +from langchain.embeddings.base import Embeddings +from langchain.vectorstores.base import VectorStore, VectorStoreRetriever + + +class SingleStoreDB(VectorStore): + """ + This class serves as a Pythonic interface to the SingleStore DB database. + The prerequisite for using this class is the installation of the ``singlestoredb`` + Python package. + + The SingleStoreDB vectorstore can be created by providing an embedding function and + the relevant parameters for the database connection, connection pool, and + optionally, the names of the table and the fields to use. + """ + + def _get_connection(self: SingleStoreDB) -> Any: + try: + import singlestoredb as s2 + except ImportError: + raise ImportError( + "Could not import singlestoredb python package. " + "Please install it with `pip install singlestoredb`." + ) + return s2.connect(**self.connection_kwargs) + + def __init__( + self, + embedding: Embeddings, + *, + table_name: str = "embeddings", + content_field: str = "content", + metadata_field: str = "metadata", + vector_field: str = "vector", + pool_size: int = 5, + max_overflow: int = 10, + timeout: float = 30, + **kwargs: Any, + ): + """Initialize with necessary components. + + Args: + embedding (Embeddings): A text embedding model. + + table_name (str, optional): Specifies the name of the table in use. + Defaults to "embeddings". + content_field (str, optional): Specifies the field to store the content. + Defaults to "content". + metadata_field (str, optional): Specifies the field to store metadata. + Defaults to "metadata". + vector_field (str, optional): Specifies the field to store the vector. + Defaults to "vector". + + Following arguments pertain to the connection pool: + + pool_size (int, optional): Determines the number of active connections in + the pool. Defaults to 5. + max_overflow (int, optional): Determines the maximum number of connections + allowed beyond the pool_size. Defaults to 10. + timeout (float, optional): Specifies the maximum wait time in seconds for + establishing a connection. Defaults to 30. 
+ + The following arguments pertain to the database connection: + + host (str, optional): Specifies the hostname, IP address, or URL for the + database connection. The default scheme is "mysql". + user (str, optional): Database username. + password (str, optional): Database password. + port (int, optional): Database port. Defaults to 3306 for non-HTTP + connections, 80 for HTTP connections, and 443 for HTTPS connections. + database (str, optional): Database name. + + Additional optional arguments provide further customization of the + database connection: + + pure_python (bool, optional): Toggles the connector mode. If True, + operates in pure Python mode. + local_infile (bool, optional): Allows local file uploads. + charset (str, optional): Specifies the character set for string values. + ssl_key (str, optional): Specifies the path of the file containing the SSL + key. + ssl_cert (str, optional): Specifies the path of the file containing the SSL + certificate. + ssl_ca (str, optional): Specifies the path of the file containing the SSL + certificate authority. + ssl_cipher (str, optional): Sets the SSL cipher list. + ssl_disabled (bool, optional): Disables SSL usage. + ssl_verify_cert (bool, optional): Verifies the server's certificate. + Automatically enabled if ``ssl_ca`` is specified. + ssl_verify_identity (bool, optional): Verifies the server's identity. + conv (dict[int, Callable], optional): A dictionary of data conversion + functions. + credential_type (str, optional): Specifies the type of authentication to + use: auth.PASSWORD, auth.JWT, or auth.BROWSER_SSO. + autocommit (bool, optional): Enables autocommits. + results_type (str, optional): Determines the structure of the query results: + tuples, namedtuples, dicts. + results_format (str, optional): Deprecated. This option has been renamed to + results_type. + + Examples: + Basic Usage: + + .. code-block:: python + + from langchain.embeddings import OpenAIEmbeddings + from langchain.vectorstores import SingleStoreDB + + vectorstore = SingleStoreDB( + OpenAIEmbeddings(), + host="https://user:password@127.0.0.1:3306/database" + ) + + Advanced Usage: + + .. code-block:: python + + from langchain.embeddings import OpenAIEmbeddings + from langchain.vectorstores import SingleStoreDB + + vectorstore = SingleStoreDB( + OpenAIEmbeddings(), + host="127.0.0.1", + port=3306, + user="user", + password="password", + database="db", + table_name="my_custom_table", + pool_size=10, + timeout=60, + ) + + Using environment variables: + + ..
code-block:: python + + import os + + from langchain.embeddings import OpenAIEmbeddings + from langchain.vectorstores import SingleStoreDB + + os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db' + vectorstore = SingleStoreDB(OpenAIEmbeddings()) + """ + + self.embedding = embedding + self.table_name = table_name + self.content_field = content_field + self.metadata_field = metadata_field + self.vector_field = vector_field + + # Pass the rest of the kwargs to the connection. + self.connection_kwargs = kwargs + + # Create the connection pool. + self.connection_pool = QueuePool( + self._get_connection, + max_overflow=max_overflow, + pool_size=pool_size, + timeout=timeout, + ) + self._create_table() + + def _create_table(self: SingleStoreDB) -> None: + """Create table if it doesn't exist.""" + conn = self.connection_pool.connect() + try: + cur = conn.cursor() + try: + cur.execute( + """CREATE TABLE IF NOT EXISTS {} + ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci, + {} BLOB, {} JSON);""".format( + self.table_name, + self.content_field, + self.vector_field, + self.metadata_field, + ), + ) + finally: + cur.close() + finally: + conn.close() + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + embeddings: Optional[List[List[float]]] = None, + **kwargs: Any, + ) -> List[str]: + """Add more texts to the vectorstore. + + Args: + texts (Iterable[str]): Iterable of strings/text to add to the vectorstore. + metadatas (Optional[List[dict]], optional): Optional list of metadatas. + Defaults to None. + embeddings (Optional[List[List[float]]], optional): Optional pre-generated + embeddings. Defaults to None. + + Returns: + List[str]: An empty list. + """ + conn = self.connection_pool.connect() + try: + cur = conn.cursor() + try: + # Write data to SingleStoreDB. + for i, text in enumerate(texts): + # Use provided values when given, otherwise fall back to defaults. + metadata = metadatas[i] if metadatas else {} + embedding = ( + embeddings[i] + if embeddings + else self.embedding.embed_documents([text])[0] + ) + cur.execute( + "INSERT INTO {} VALUES (%s, JSON_ARRAY_PACK(%s), %s)".format( + self.table_name + ), + ( + text, + "[{}]".format(",".join(map(str, embedding))), + json.dumps(metadata), + ), + ) + finally: + cur.close() + finally: + conn.close() + return [] + + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + """Returns the most similar indexed documents to the query text. + + Uses cosine similarity. + + Args: + query (str): The query text for which to find similar documents. + k (int): The number of documents to return. Default is 4. + + Returns: + List[Document]: A list of documents that are most similar to the query text. + """ + docs_and_scores = self.similarity_search_with_score(query, k=k) + return [doc for doc, _ in docs_and_scores] + + def similarity_search_with_score( + self, query: str, k: int = 4 + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query. Uses cosine similarity. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4.
+ + Returns: + List of Documents most similar to the query, with a score for each. + """ + # Create the embedding vector from the user query. + embedding = self.embedding.embed_query(query) + conn = self.connection_pool.connect() + result = [] + try: + cur = conn.cursor() + try: + cur.execute( + """SELECT {}, {}, DOT_PRODUCT({}, JSON_ARRAY_PACK(%s)) as __score + FROM {} ORDER BY __score DESC LIMIT %s""".format( + self.content_field, + self.metadata_field, + self.vector_field, + self.table_name, + ), + ( + "[{}]".format(",".join(map(str, embedding))), + k, + ), + ) + + for row in cur.fetchall(): + doc = Document(page_content=row[0], metadata=row[1]) + result.append((doc, float(row[2]))) + finally: + cur.close() + finally: + conn.close() + return result + + @classmethod + def from_texts( + cls: Type[SingleStoreDB], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + table_name: str = "embeddings", + content_field: str = "content", + metadata_field: str = "metadata", + vector_field: str = "vector", + pool_size: int = 5, + max_overflow: int = 10, + timeout: float = 30, + **kwargs: Any, + ) -> SingleStoreDB: + """Create a SingleStoreDB vectorstore from raw documents. + This is a user-friendly interface that: + 1. Embeds documents. + 2. Creates a new table for the embeddings in SingleStoreDB. + 3. Adds the documents to the newly created table. + This is intended to be a quick way to get started. + Example: + .. code-block:: python + from langchain.vectorstores import SingleStoreDB + from langchain.embeddings import OpenAIEmbeddings + s2 = SingleStoreDB.from_texts( + texts, + OpenAIEmbeddings(), + host="username:password@localhost:3306/database" + ) + """ + + instance = cls( + embedding, + table_name=table_name, + content_field=content_field, + metadata_field=metadata_field, + vector_field=vector_field, + pool_size=pool_size, + max_overflow=max_overflow, + timeout=timeout, + **kwargs, + ) + instance.add_texts(texts, metadatas, embedding.embed_documents(texts), **kwargs) + return instance + + def as_retriever(self, **kwargs: Any) -> SingleStoreDBRetriever: + return SingleStoreDBRetriever(vectorstore=self, **kwargs) + + +class SingleStoreDBRetriever(VectorStoreRetriever): + vectorstore: SingleStoreDB + k: int = 4 + allowed_search_types: ClassVar[Collection[str]] = ("similarity",) + + def get_relevant_documents(self, query: str) -> List[Document]: + if self.search_type == "similarity": + docs = self.vectorstore.similarity_search(query, k=self.k) + else: + raise ValueError(f"search_type of {self.search_type} not allowed.") + return docs + + async def aget_relevant_documents(self, query: str) -> List[Document]: + raise NotImplementedError( + "SingleStoreDBRetriever does not support async" + ) diff --git a/poetry.lock b/poetry.lock index 7806d55f..de89033b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand.
[[package]] name = "absl-py" @@ -948,6 +948,30 @@ urllib3 = ">=1.25.4,<1.27" [package.extras] crt = ["awscrt (==0.16.9)"] +[[package]] +name = "build" +version = "0.10.0" +description = "A simple, correct Python build frontend" +category = "main" +optional = true +python-versions = ">= 3.7" +files = [ + {file = "build-0.10.0-py3-none-any.whl", hash = "sha256:af266720050a66c893a6096a2f410989eeac74ff9a68ba194b3f6473e8e26171"}, + {file = "build-0.10.0.tar.gz", hash = "sha256:d5b71264afdb5951d6704482aac78de887c80691c52b88a9ad195983ca2c9269"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "os_name == \"nt\""} +packaging = ">=19.0" +pyproject_hooks = "*" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} + +[package.extras] +docs = ["furo (>=2021.08.31)", "sphinx (>=4.0,<5.0)", "sphinx-argparse-cli (>=1.5)", "sphinx-autodoc-typehints (>=1.10)"] +test = ["filelock (>=3)", "pytest (>=6.2.4)", "pytest-cov (>=2.12)", "pytest-mock (>=2)", "pytest-rerunfailures (>=9.1)", "pytest-xdist (>=1.34)", "setuptools (>=42.0.0)", "setuptools (>=56.0.0)", "toml (>=0.10.0)", "wheel (>=0.36.0)"] +typing = ["importlib-metadata (>=5.1)", "mypy (==0.991)", "tomli", "typing-extensions (>=3.7.4.3)"] +virtualenv = ["virtualenv (>=20.0.35)"] + [[package]] name = "cachetools" version = "5.3.1" @@ -7409,6 +7433,21 @@ files = [ doc = ["sphinx", "sphinx_rtd_theme"] test = ["flake8", "isort", "pytest"] +[[package]] +name = "pyproject-hooks" +version = "1.0.0" +description = "Wrappers to call pyproject.toml-based build backend hooks." +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "pyproject_hooks-1.0.0-py3-none-any.whl", hash = "sha256:283c11acd6b928d2f6a7c73fa0d01cb2bdc5f07c57a2eeb6e83d5e56b97976f8"}, + {file = "pyproject_hooks-1.0.0.tar.gz", hash = "sha256:f271b298b97f5955d53fb12b72c1fb1948c22c1a6b70b315c54cedaca0264ef5"}, +] + +[package.dependencies] +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} + [[package]] name = "pyreadline3" version = "3.4.1" @@ -8603,6 +8642,39 @@ files = [ {file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"}, ] +[[package]] +name = "singlestoredb" +version = "0.6.1" +description = "Interface to the SingleStore database and cluster management APIs" +category = "main" +optional = true +python-versions = ">=3.6" +files = [ + {file = "singlestoredb-0.6.1-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf1769e53993981420650a02c59ba367913d9f0256948cc98f6f9d464f74852a"}, + {file = "singlestoredb-0.6.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e90fa1dfde1e31f7abe011f75d9dc8cccbc35b968ed8381bd44c0b7dd4026b"}, + {file = "singlestoredb-0.6.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44d361c3fa4de6228b525d0b1d22db75790d8e6fb84c3d0b2213bf41774d4323"}, + {file = "singlestoredb-0.6.1-cp36-abi3-win32.whl", hash = "sha256:ad9543c41286a2095718ad7e133cc8b3b5de938f731157fbb2d4d2b0d1623aff"}, + {file = "singlestoredb-0.6.1-cp36-abi3-win_amd64.whl", hash = "sha256:f9f9feda947b9fe9182863758118c8961ebb74281098b42894c99b58d30b2526"}, + {file = "singlestoredb-0.6.1.tar.gz", hash = "sha256:2e00f4cd869dc1ecf33df853c521ebd6ce913af2bf3b2f98675ffa3dc6911636"}, +] + +[package.dependencies] +build = "*" +PyJWT = "*" +requests = "*" +sqlparams = "*" +wheel = "*" + +[package.extras] +dataframe = ["ibis-singlestoredb"] +dbt = ["dbt-singlestore"] +ed22519 = ["PyNaCl (>=1.4.0)"] +gssapi = ["gssapi"] 
+ibis = ["ibis-singlestoredb"] +kerberos = ["gssapi"] +rsa = ["cryptography"] +sqlalchemy = ["sqlalchemy-singlestoredb"] + [[package]] name = "six" version = "1.16.0" @@ -9137,6 +9209,18 @@ files = [ {file = "sqlitedict-2.1.0.tar.gz", hash = "sha256:03d9cfb96d602996f1d4c2db2856f1224b96a9c431bdd16e78032a72940f9e8c"}, ] +[[package]] +name = "sqlparams" +version = "5.1.0" +description = "Convert between various DB API 2.0 parameter styles." +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "sqlparams-5.1.0-py3-none-any.whl", hash = "sha256:ee4ef620a5197535e5ebb9217e2f453f08b044634b3d890f3d6701e4f838c85c"}, + {file = "sqlparams-5.1.0.tar.gz", hash = "sha256:1abe87a0684567265b2b86f5a482d5c37db237c0268d4c81774ffedce4300199"}, +] + [[package]] name = "srsly" version = "2.4.6" @@ -9842,7 +9926,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -11227,13 +11311,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "lxml", "manifest-ml", "momento", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "requests-toolbelt", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] -azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "openai"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb"] +azure = ["azure-identity", "azure-cosmos", "openai", "azure-core", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["atlassian-python-api", "beautifulsoup4", 
"beautifulsoup4", "bibtexparser", "chardet", "gql", "html2text", "jq", "lxml", "pandas", "pdfminer-six", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "telethon", "tqdm", "zep-python"] -llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] +extended-testing = ["beautifulsoup4", "bibtexparser", "chardet", "jq", "pdfminer-six", "pypdf", "pymupdf", "pypdfium2", "tqdm", "lxml", "atlassian-python-api", "beautifulsoup4", "pandas", "telethon", "psychicapi", "zep-python", "gql", "requests-toolbelt", "html2text", "py-trello", "scikit-learn", "pyspark"] +llms = ["anthropic", "cohere", "openai", "openlm", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] text-helpers = ["chardet"] @@ -11241,4 +11325,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "8c0ab1bdc8b506e38e6fa4cba40dcf2df47473212d47fa1086c6aae8ddf2c021" +content-hash = "faeb3cc6feb059096a66ba8b1fd2271cd91e3a9553cb4f05e5ea493610ac3763" diff --git a/pyproject.toml b/pyproject.toml index 7e06caa9..8221eeaa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,10 +101,12 @@ azure-cognitiveservices-speech = {version = "^1.28.0", optional = true} py-trello = {version = "^0.19.0", optional = true} momento = {version = "^1.5.0", optional = true} bibtexparser = {version = "^1.4.0", optional = true} +singlestoredb = {version = "^0.6.1", optional = true} pyspark = {version = "^3.4.0", optional = true} tigrisdb = {version = "^1.0.0b6", optional = true} langchainplus-sdk = ">=0.0.6" + [tool.poetry.group.docs.dependencies] autodoc_pydantic = "^1.8.0" myst_parser = "^0.18.1" @@ -280,6 +282,7 @@ all = [ "azure-ai-vision", "azure-cognitiveservices-speech", "momento", + "singlestoredb", "tigrisdb" ] diff --git a/tests/integration_tests/vectorstores/test_singlestoredb.py b/tests/integration_tests/vectorstores/test_singlestoredb.py new file mode 100644 index 00000000..87bfce82 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_singlestoredb.py @@ -0,0 +1,142 @@ +"""Test SingleStoreDB functionality.""" +from typing import List + +import numpy as np +import pytest + +from langchain.docstore.document import Document +from langchain.vectorstores.singlestoredb import SingleStoreDB +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + +TEST_SINGLESTOREDB_URL = "root:pass@localhost:3306/db" +TEST_SINGLE_RESULT = [Document(page_content="foo")] +TEST_SINGLE_WITH_METADATA_RESULT = [Document(page_content="foo", metadata={"a": "b"})] +TEST_RESULT = [Document(page_content="foo"), Document(page_content="foo")] + +try: + import singlestoredb as s2 + + singlestoredb_installed = True +except ImportError: + singlestoredb_installed = False + + +def drop(table_name: str) -> None: + with s2.connect(TEST_SINGLESTOREDB_URL) as conn: + conn.autocommit(True) + with conn.cursor() as cursor: + cursor.execute(f"DROP TABLE IF EXISTS {table_name};") + + +class NormilizedFakeEmbeddings(FakeEmbeddings): + """Fake embeddings with normalization. 
For testing purposes.""" + + def normalize(self, vector: List[float]) -> List[float]: + """Normalize vector.""" + return [float(v / np.linalg.norm(vector)) for v in vector] + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + return [self.normalize(v) for v in super().embed_documents(texts)] + + def embed_query(self, text: str) -> List[float]: + return self.normalize(super().embed_query(text)) + + +@pytest.fixture +def texts() -> List[str]: + return ["foo", "bar", "baz"] + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb(texts: List[str]) -> None: + """Test end-to-end construction and search.""" + table_name = "test_singlestoredb" + drop(table_name) + docsearch = SingleStoreDB.from_texts( + texts, + NormalizedFakeEmbeddings(), + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == TEST_SINGLE_RESULT + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_new_vector(texts: List[str]) -> None: + """Test adding a new document.""" + table_name = "test_singlestoredb_new_vector" + drop(table_name) + docsearch = SingleStoreDB.from_texts( + texts, + NormalizedFakeEmbeddings(), + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + docsearch.add_texts(["foo"]) + output = docsearch.similarity_search("foo", k=2) + assert output == TEST_RESULT + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_from_existing(texts: List[str]) -> None: + """Test creating a vectorstore from an existing table.""" + table_name = "test_singlestoredb_from_existing" + drop(table_name) + SingleStoreDB.from_texts( + texts, + NormalizedFakeEmbeddings(), + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + # Create a second vectorstore on top of the existing table + docsearch2 = SingleStoreDB( + NormalizedFakeEmbeddings(), + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch2.similarity_search("foo", k=1) + assert output == TEST_SINGLE_RESULT + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_from_documents(texts: List[str]) -> None: + """Test from_documents constructor.""" + table_name = "test_singlestoredb_from_documents" + drop(table_name) + docs = [Document(page_content=t, metadata={"a": "b"}) for t in texts] + docsearch = SingleStoreDB.from_documents( + docs, + NormalizedFakeEmbeddings(), + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == TEST_SINGLE_WITH_METADATA_RESULT + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_add_texts_to_existing(texts: List[str]) -> None: + """Test adding texts to an existing table.""" + table_name = "test_singlestoredb_add_texts_to_existing" + drop(table_name) + # Populate the table first + SingleStoreDB.from_texts( + texts, + NormalizedFakeEmbeddings(), + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + docsearch = SingleStoreDB( + NormalizedFakeEmbeddings(), + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + docsearch.add_texts(["foo"]) + output = docsearch.similarity_search("foo", k=2) + assert output == TEST_RESULT + drop(table_name)
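Taken together, the pieces above compose into a short end-to-end flow. The following is a minimal usage sketch, not a normative example: it assumes a reachable SingleStoreDB instance behind the placeholder URL and an OpenAI API key already exported, mirroring the notebook at the top of this diff.

import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import SingleStoreDB

# Placeholder credentials; connection parameters can also be passed as named
# arguments to the constructor, from_texts, or from_documents.
os.environ["SINGLESTOREDB_URL"] = "root:pass@localhost:3306/db"

# from_texts embeds the texts, creates the table if needed, and inserts rows.
vectorstore = SingleStoreDB.from_texts(
    ["foo", "bar", "baz"],
    OpenAIEmbeddings(),
    table_name="notebook",
)

# Rows are ranked by DOT_PRODUCT between the stored and query embeddings.
docs = vectorstore.similarity_search("foo", k=2)

# as_retriever returns a SingleStoreDBRetriever; "similarity" is the only
# entry in its allowed_search_types.
retriever = vectorstore.as_retriever(k=2)
relevant = retriever.get_relevant_documents("foo")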
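One detail of the tests worth spelling out: they wrap FakeEmbeddings in a normalizing subclass because similarity_search_with_score ranks rows with SQL DOT_PRODUCT while documenting the result as cosine similarity, and the two coincide only for unit-length vectors. A small self-contained illustration of that identity, using hypothetical vectors and NumPy only:

import numpy as np

v1 = np.array([1.0, 2.0, 2.0])
v2 = np.array([2.0, 1.0, 2.0])

# Cosine similarity computed directly from the raw vectors.
cosine = v1 @ v2 / (np.linalg.norm(v1) * np.linalg.norm(v2))

# After scaling each vector to unit length, a plain dot product yields the
# same value, which is why the tests normalize their fake embeddings.
u1 = v1 / np.linalg.norm(v1)
u2 = v2 / np.linalg.norm(v2)
assert np.isclose(cosine, u1 @ u2)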