From 0118706fd61985c9b0d5ad80d531a64fd61e5bf3 Mon Sep 17 00:00:00 2001 From: Naveen Tatikonda Date: Mon, 20 Feb 2023 20:39:34 -0600 Subject: [PATCH] Add Support for OpenSearch Vector database (#1191) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description This PR adds a wrapper that provides support for the OpenSearch vector database. Using the opensearch-py client, we ingest the embeddings of the given text into an OpenSearch cluster using the Bulk API. We can perform `similarity_search` on the index using the three popular search methods of the OpenSearch k-NN plugin: - `Approximate k-NN Search` uses approximate nearest neighbor (ANN) algorithms from the [nmslib](https://github.com/nmslib/nmslib), [faiss](https://github.com/facebookresearch/faiss), and [Lucene](https://lucene.apache.org/) libraries to power k-NN search. - `Script Scoring` extends OpenSearch’s script scoring functionality to execute a brute-force, exact k-NN search. - `Painless Scripting` adds the distance functions as painless extensions that can be used in more complex combinations. It also supports brute-force, exact k-NN search, like Script Scoring. 
### Issues Resolved https://github.com/hwchase17/langchain/issues/1054 --------- Signed-off-by: Naveen Tatikonda --- docs/ecosystem/opensearch.md | 21 + .../chain_examples/qa_with_sources.ipynb | 2 +- .../vector_db_qa_with_sources.ipynb | 2 +- .../vectorstore_examples/opensearch.ipynb | 220 ++++++++++ docs/reference/integrations.md | 4 + langchain/vectorstores/__init__.py | 2 + .../vectorstores/opensearch_vector_search.py | 382 ++++++++++++++++++ poetry.lock | 27 +- pyproject.toml | 3 +- .../vectorstores/test_opensearch.py | 128 ++++++ 10 files changed, 786 insertions(+), 5 deletions(-) create mode 100644 docs/ecosystem/opensearch.md create mode 100644 docs/modules/indexes/vectorstore_examples/opensearch.ipynb create mode 100644 langchain/vectorstores/opensearch_vector_search.py create mode 100644 tests/integration_tests/vectorstores/test_opensearch.py diff --git a/docs/ecosystem/opensearch.md b/docs/ecosystem/opensearch.md new file mode 100644 index 00000000..f1376fe7 --- /dev/null +++ b/docs/ecosystem/opensearch.md @@ -0,0 +1,21 @@ +# OpenSearch + +This page covers how to use the OpenSearch ecosystem within LangChain. +It is broken into two parts: installation and setup, and then references to specific OpenSearch wrappers. + +## Installation and Setup +- Install the Python package with `pip install opensearch-py` +## Wrappers + +### VectorStore + +There exists a wrapper around OpenSearch vector databases, allowing you to use it as a vectorstore +for semantic search using approximate vector search powered by the lucene, nmslib and faiss engines, +or using painless scripting and script scoring functions for brute-force vector search. 
+ +To import this vectorstore: +```python +from langchain.vectorstores import OpenSearchVectorSearch +``` + +For a more detailed walkthrough of the OpenSearch wrapper, see [this notebook](../modules/indexes/vectorstore_examples/opensearch.ipynb) diff --git a/docs/modules/indexes/chain_examples/qa_with_sources.ipynb b/docs/modules/indexes/chain_examples/qa_with_sources.ipynb index f9b97a5b..29c0d7c8 100644 --- a/docs/modules/indexes/chain_examples/qa_with_sources.ipynb +++ b/docs/modules/indexes/chain_examples/qa_with_sources.ipynb @@ -732,4 +732,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/modules/indexes/chain_examples/vector_db_qa_with_sources.ipynb b/docs/modules/indexes/chain_examples/vector_db_qa_with_sources.ipynb index daacb4ad..f212a340 100644 --- a/docs/modules/indexes/chain_examples/vector_db_qa_with_sources.ipynb +++ b/docs/modules/indexes/chain_examples/vector_db_qa_with_sources.ipynb @@ -215,4 +215,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/modules/indexes/vectorstore_examples/opensearch.ipynb b/docs/modules/indexes/vectorstore_examples/opensearch.ipynb new file mode 100644 index 00000000..226090a2 --- /dev/null +++ b/docs/modules/indexes/vectorstore_examples/opensearch.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "683953b3", + "metadata": {}, + "source": [ + "# OpenSearch\n", + "\n", + "This notebook shows how to use functionality related to the OpenSearch database.\n", + "\n", + "To run, you should have the opensearch instance up and running: [here](https://opensearch.org/docs/latest/install-and-configure/install-opensearch/index/)\n", + "`similarity_search` by default performs the Approximate k-NN Search which uses one of the several algorithms like lucene, nmslib, faiss recommended for\n", + "large datasets. 
To perform brute force search we have other search methods known as Script Scoring and Painless Scripting.\n", + "Check [this](https://opensearch.org/docs/latest/search-plugins/knn/index/) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "aac9563e", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import OpenSearchVectorSearch\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a3c3999a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "loader = TextLoader('../../state_of_the_union.txt')\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\")\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = docsearch.similarity_search(query)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(docs[0].page_content)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### similarity_search using Approximate k-NN Search with Custom Parameters" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "docsearch = OpenSearchVectorSearch.from_texts(texts, 
embeddings, opensearch_url=\"http://localhost:9200\", engine=\"faiss\", space_type=\"innerproduct\", ef_construction=256, m=48)\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = docsearch.similarity_search(query)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(docs[0].page_content)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### similarity_search using Script Scoring with Custom Parameters" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", k=1, search_type=\"script_scoring\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(docs[0].page_content)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### similarity_search using Painless Scripting with Custom Parameters" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "docsearch = OpenSearchVectorSearch.from_texts(texts, embeddings, opensearch_url=\"http://localhost:9200\", is_appx_search=False)\n", + "filter = {\"bool\": {\"filter\": {\"term\": {\"text\": \"smuggling\"}}}}\n", + "query = \"What did the president say about Ketanji Brown 
 Jackson\"\n", + "docs = docsearch.similarity_search(\"What did the president say about Ketanji Brown Jackson\", search_type=\"painless_scripting\", space_type=\"cosineSimilarity\", pre_filter=filter)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(docs[0].page_content)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/docs/reference/integrations.md b/docs/reference/integrations.md index 099d316b..8d7e6b2d 100644 --- a/docs/reference/integrations.md +++ b/docs/reference/integrations.md @@ -47,5 +47,9 @@ The following use cases require specific installs and api keys: - Install requirements with `pip install faiss` for Python 3.7 and `pip install faiss-cpu` for Python 3.10+. - _Manifest_: - Install requirements with `pip install manifest-ml` (Note: this is only available in Python 3.8+ currently). +- _OpenSearch_: + - Install requirements with `pip install opensearch-py` + - If you want to set up OpenSearch locally, see the instructions [here](https://opensearch.org/docs/latest/) + If you are using the `NLTKTextSplitter` or the `SpacyTextSplitter`, you will also need to install the appropriate models. For example, if you want to use the `SpacyTextSplitter`, you will need to install the `en_core_web_sm` model with `python -m spacy download en_core_web_sm`. 
Similarly, if you want to use the `NLTKTextSplitter`, you will need to install the `punkt` model with `python -m nltk.downloader punkt`. diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index 43d819e5..d6c81f2a 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -4,6 +4,7 @@ from langchain.vectorstores.chroma import Chroma from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch from langchain.vectorstores.faiss import FAISS from langchain.vectorstores.milvus import Milvus +from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch from langchain.vectorstores.pinecone import Pinecone from langchain.vectorstores.qdrant import Qdrant from langchain.vectorstores.weaviate import Weaviate @@ -17,4 +18,5 @@ __all__ = [ "Qdrant", "Milvus", "Chroma", + "OpenSearchVectorSearch", ] diff --git a/langchain/vectorstores/opensearch_vector_search.py b/langchain/vectorstores/opensearch_vector_search.py new file mode 100644 index 00000000..6e4d2545 --- /dev/null +++ b/langchain/vectorstores/opensearch_vector_search.py @@ -0,0 +1,382 @@ +"""Wrapper around OpenSearch vector database.""" +from __future__ import annotations + +import uuid +from typing import Any, Dict, Iterable, List, Optional + +from langchain.docstore.document import Document +from langchain.embeddings.base import Embeddings +from langchain.utils import get_from_dict_or_env +from langchain.vectorstores.base import VectorStore + +IMPORT_OPENSEARCH_PY_ERROR = ( + "Could not import OpenSearch. Please install it with `pip install opensearch-py`." 
+) +SCRIPT_SCORING_SEARCH = "script_scoring" +PAINLESS_SCRIPTING_SEARCH = "painless_scripting" +MATCH_ALL_QUERY = {"match_all": {}} # type: Dict + + +def _import_opensearch() -> Any: + """Import OpenSearch if available, otherwise raise error.""" + try: + from opensearchpy import OpenSearch + except ImportError: + raise ValueError(IMPORT_OPENSEARCH_PY_ERROR) + return OpenSearch + + +def _import_bulk() -> Any: + """Import bulk if available, otherwise raise error.""" + try: + from opensearchpy.helpers import bulk + except ImportError: + raise ValueError(IMPORT_OPENSEARCH_PY_ERROR) + return bulk + + +def _get_opensearch_client(opensearch_url: str) -> Any: + """Get OpenSearch client from the opensearch_url, otherwise raise error.""" + try: + opensearch = _import_opensearch() + client = opensearch(opensearch_url) + except ValueError as e: + raise ValueError( + f"OpenSearch client string provided is not in proper format. " + f"Got error: {e} " + ) + return client + + +def _validate_embeddings_and_bulk_size(embeddings_length: int, bulk_size: int) -> None: + """Validate Embeddings Length and Bulk Size.""" + if embeddings_length == 0: + raise RuntimeError("Embeddings size is zero") + if bulk_size < embeddings_length: + raise RuntimeError( + f"The embeddings count, {embeddings_length} is more than the " + f"[bulk_size], {bulk_size}. Increase the value of [bulk_size]." 
+ ) + + +def _bulk_ingest_embeddings( + client: Any, + index_name: str, + embeddings: List[List[float]], + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, +) -> List[str]: + """Bulk Ingest Embeddings into given index.""" + bulk = _import_bulk() + requests = [] + ids = [] + for i, text in enumerate(texts): + metadata = metadatas[i] if metadatas else {} + _id = str(uuid.uuid4()) + request = { + "_op_type": "index", + "_index": index_name, + "vector_field": embeddings[i], + "text": text, + "metadata": metadata, + "_id": _id, + } + requests.append(request) + ids.append(_id) + bulk(client, requests) + client.indices.refresh(index=index_name) + return ids + + +def _default_scripting_text_mapping(dim: int) -> Dict: + """For Painless Scripting or Script Scoring,the default mapping to create index.""" + return { + "mappings": { + "properties": { + "vector_field": {"type": "knn_vector", "dimension": dim}, + } + } + } + + +def _default_text_mapping( + dim: int, + engine: str = "nmslib", + space_type: str = "l2", + ef_search: int = 512, + ef_construction: int = 512, + m: int = 16, +) -> Dict: + """For Approximate k-NN Search, this is the default mapping to create index.""" + return { + "settings": {"index": {"knn": True, "knn.algo_param.ef_search": ef_search}}, + "mappings": { + "properties": { + "vector_field": { + "type": "knn_vector", + "dimension": dim, + "method": { + "name": "hnsw", + "space_type": space_type, + "engine": engine, + "parameters": {"ef_construction": ef_construction, "m": m}, + }, + } + } + }, + } + + +def _default_approximate_search_query( + query_vector: List[float], size: int = 4, k: int = 4 +) -> Dict: + """For Approximate k-NN Search, this is the default query.""" + return { + "size": size, + "query": {"knn": {"vector_field": {"vector": query_vector, "k": k}}}, + } + + +def _default_script_query( + query_vector: List[float], + space_type: str = "l2", + pre_filter: Dict = MATCH_ALL_QUERY, +) -> Dict: + """For Script Scoring Search, this 
is the default query.""" + return { + "query": { + "script_score": { + "query": pre_filter, + "script": { + "source": "knn_score", + "lang": "knn", + "params": { + "field": "vector_field", + "query_value": query_vector, + "space_type": space_type, + }, + }, + } + } + } + + +def __get_painless_scripting_source(space_type: str, query_vector: List[float]) -> str: + """For Painless Scripting, it returns the script source based on space type.""" + source_value = ( + "(1.0 + " + space_type + "(" + str(query_vector) + ", doc['vector_field']))" + ) + if space_type == "cosineSimilarity": + return source_value + else: + return "1/" + source_value + + +def _default_painless_scripting_query( + query_vector: List[float], + space_type: str = "l2Squared", + pre_filter: Dict = MATCH_ALL_QUERY, +) -> Dict: + """For Painless Scripting Search, this is the default query.""" + source = __get_painless_scripting_source(space_type, query_vector) + return { + "query": { + "script_score": { + "query": pre_filter, + "script": { + "source": source, + "params": { + "field": "vector_field", + "query_value": query_vector, + }, + }, + } + } + } + + +def _get_kwargs_value(kwargs: Any, key: str, default_value: Any) -> Any: + """Get the value of the key if present. Else get the default_value.""" + if key in kwargs: + return kwargs.get(key) + return default_value + + +class OpenSearchVectorSearch(VectorStore): + """Wrapper around OpenSearch as a vector database. + + Example: + .. 
code-block:: python + + from langchain import OpenSearchVectorSearch + opensearch_vector_search = OpenSearchVectorSearch( + "http://localhost:9200", + "embeddings", + embedding_function + ) + + """ + + def __init__( + self, opensearch_url: str, index_name: str, embedding_function: Embeddings + ): + """Initialize with necessary components.""" + self.embedding_function = embedding_function + self.index_name = index_name + self.client = _get_opensearch_client(opensearch_url) + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + bulk_size: int = 500, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + bulk_size: Bulk API request count; Default: 500 + + Returns: + List of ids from adding the texts into the vectorstore. + """ + embeddings = [ + self.embedding_function.embed_documents(list(text))[0] for text in texts + ] + _validate_embeddings_and_bulk_size(len(embeddings), bulk_size) + return _bulk_ingest_embeddings( + self.client, self.index_name, embeddings, texts, metadatas + ) + + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to query. + + By default supports Approximate Search. + Also supports Script Scoring and Painless Scripting. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query. 
+ + Optional Args for Approximate Search: + search_type: "approximate_search"; default: "approximate_search" + size: number of results the query actually returns; default: 4 + + Optional Args for Script Scoring Search: + search_type: "script_scoring"; default: "approximate_search" + + space_type: "l2", "l1", "linf", "cosinesimil", "innerproduct", + "hammingbit"; default: "l2" + + pre_filter: script_score query to pre-filter documents before identifying + nearest neighbors; default: {"match_all": {}} + + Optional Args for Painless Scripting Search: + search_type: "painless_scripting"; default: "approximate_search" + space_type: "l2Squared", "l1Norm", "cosineSimilarity"; default: "l2Squared" + + pre_filter: script_score query to pre-filter documents before identifying + nearest neighbors; default: {"match_all": {}} + """ + embedding = self.embedding_function.embed_query(query) + search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search") + if search_type == "approximate_search": + size = _get_kwargs_value(kwargs, "size", 4) + search_query = _default_approximate_search_query(embedding, size, k) + elif search_type == SCRIPT_SCORING_SEARCH: + space_type = _get_kwargs_value(kwargs, "space_type", "l2") + pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY) + search_query = _default_script_query(embedding, space_type, pre_filter) + elif search_type == PAINLESS_SCRIPTING_SEARCH: + space_type = _get_kwargs_value(kwargs, "space_type", "l2Squared") + pre_filter = _get_kwargs_value(kwargs, "pre_filter", MATCH_ALL_QUERY) + search_query = _default_painless_scripting_query( + embedding, space_type, pre_filter + ) + else: + raise ValueError("Invalid `search_type` provided as an argument") + + response = self.client.search(index=self.index_name, body=search_query) + hits = [hit["_source"] for hit in response["hits"]["hits"][:k]] + documents = [ + Document(page_content=hit["text"], metadata=hit["metadata"]) for hit in hits + ] + return documents + + 
@classmethod + def from_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + bulk_size: int = 500, + **kwargs: Any, + ) -> OpenSearchVectorSearch: + """Construct OpenSearchVectorSearch wrapper from raw documents. + + Example: + .. code-block:: python + + from langchain import OpenSearchVectorSearch + from langchain.embeddings import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + opensearch_vector_search = OpenSearchVectorSearch.from_texts( + texts, + embeddings, + opensearch_url="http://localhost:9200" + ) + + OpenSearch by default supports Approximate Search powered by nmslib, faiss + and lucene engines recommended for large datasets. Also supports brute force + search through Script Scoring and Painless Scripting. + + Optional Keyword Args for Approximate Search: + engine: "nmslib", "faiss", "hnsw"; default: "nmslib" + + space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2" + + ef_search: Size of the dynamic list used during k-NN searches. Higher values + lead to more accurate but slower searches; default: 512 + + ef_construction: Size of the dynamic list used during k-NN graph creation. + Higher values lead to more accurate graph but slower indexing speed; + default: 512 + + m: Number of bidirectional links created for each new element. Large impact + on memory consumption. 
Between 2 and 100; default: 16 + + Keyword Args for Script Scoring or Painless Scripting: + is_appx_search: False + + """ + opensearch_url = get_from_dict_or_env( + kwargs, "opensearch_url", "OPENSEARCH_URL" + ) + client = _get_opensearch_client(opensearch_url) + embeddings = embedding.embed_documents(texts) + _validate_embeddings_and_bulk_size(len(embeddings), bulk_size) + dim = len(embeddings[0]) + index_name = uuid.uuid4().hex + is_appx_search = _get_kwargs_value(kwargs, "is_appx_search", True) + if is_appx_search: + engine = _get_kwargs_value(kwargs, "engine", "nmslib") + space_type = _get_kwargs_value(kwargs, "space_type", "l2") + ef_search = _get_kwargs_value(kwargs, "ef_search", 512) + ef_construction = _get_kwargs_value(kwargs, "ef_construction", 512) + m = _get_kwargs_value(kwargs, "m", 16) + + mapping = _default_text_mapping( + dim, engine, space_type, ef_search, ef_construction, m + ) + else: + mapping = _default_scripting_text_mapping(dim) + + client.indices.create(index=index_name, body=mapping) + _bulk_ingest_embeddings(client, index_name, embeddings, texts, metadatas) + return cls(opensearch_url, index_name, embedding) diff --git a/poetry.lock b/poetry.lock index 2726fcf9..d35ba0db 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3552,6 +3552,29 @@ dev = ["black (>=21.6b0,<22.0)", "pytest (>=6.0.0,<7.0.0)", "pytest-asyncio", "p embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "sklearn", "tenacity (>=8.0.1)"] wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] +[[package]] +name = "opensearch-py" +version = "2.1.1" +description = "Python low-level client for OpenSearch" +category = "main" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" +files = [ + {file = "opensearch-py-2.1.1.tar.gz", hash = "sha256:dd54a50c6771bc2582741bfdcf629b8d7eed409ae7fc2722249e53f9a10de0d8"}, + {file = 
"opensearch_py-2.1.1-py2.py3-none-any.whl", hash = "sha256:3e7085bf25487979581416f4ab195c2fe62e90f1f07f393091f8233cbea032eb"}, +] + +[package.dependencies] +certifi = "*" +requests = ">=2.4.0,<3.0.0" +urllib3 = ">=1.21.1,<2" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +develop = ["black", "botocore", "coverage", "jinja2", "mock", "myst-parser", "pytest", "pytest-cov", "pyyaml", "requests (>=2.0.0,<3.0.0)", "sphinx", "sphinx-copybutton", "sphinx-rtd-theme"] +docs = ["myst-parser", "sphinx", "sphinx-copybutton", "sphinx-rtd-theme"] +kerberos = ["requests-kerberos"] + [[package]] name = "opt-einsum" version = "3.3.0" @@ -7039,10 +7062,10 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [extras] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx"] llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] [metadata] lock-version = "2.0" python-versions = 
">=3.8.1,<4.0" -content-hash = "690fdd08a207a73cb343cfdf25f7ae7d4177dc39b704d8655f3a4f26a881c2fc" +content-hash = "7997201f64373247d8799baed84a5ad11ab3d92e26cc2114b26e734cfb9664a4" diff --git a/pyproject.toml b/pyproject.toml index 19b807c6..b675cfd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ numpy = "^1" faiss-cpu = {version = "^1", optional = true} wikipedia = {version = "^1", optional = true} elasticsearch = {version = "^8", optional = true} +opensearch-py = {version = "^2.0.0", optional = true} redis = {version = "^4", optional = true} manifest-ml = {version = "^0.0.1", optional = true} spacy = {version = "^3", optional = true} @@ -94,7 +95,7 @@ playwright = "^1.28.0" [tool.poetry.extras] llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx"] [tool.isort] profile = "black" diff --git a/tests/integration_tests/vectorstores/test_opensearch.py b/tests/integration_tests/vectorstores/test_opensearch.py new file mode 100644 index 00000000..efa1d9d7 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_opensearch.py @@ -0,0 +1,128 @@ +"""Test OpenSearch 
functionality.""" + +import pytest + +from langchain.docstore.document import Document +from langchain.vectorstores.opensearch_vector_search import ( + PAINLESS_SCRIPTING_SEARCH, + SCRIPT_SCORING_SEARCH, + OpenSearchVectorSearch, +) +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + +DEFAULT_OPENSEARCH_URL = "http://localhost:9200" +texts = ["foo", "bar", "baz"] + + +def test_opensearch() -> None: + """Test end to end indexing and search using Approximate Search.""" + docsearch = OpenSearchVectorSearch.from_texts( + texts, FakeEmbeddings(), opensearch_url=DEFAULT_OPENSEARCH_URL + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_opensearch_with_metadatas() -> None: + """Test end to end indexing and search with metadata.""" + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = OpenSearchVectorSearch.from_texts( + texts, + FakeEmbeddings(), + metadatas=metadatas, + opensearch_url=DEFAULT_OPENSEARCH_URL, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": 0})] + + +def test_add_text() -> None: + """Test adding additional text elements to existing index.""" + text_input = ["test", "add", "text", "method"] + metadatas = [{"page": i} for i in range(len(text_input))] + docsearch = OpenSearchVectorSearch.from_texts( + texts, FakeEmbeddings(), opensearch_url=DEFAULT_OPENSEARCH_URL + ) + docids = OpenSearchVectorSearch.add_texts(docsearch, text_input, metadatas) + assert len(docids) == len(text_input) + + +def test_opensearch_script_scoring() -> None: + """Test end to end indexing and search using Script Scoring Search.""" + pre_filter_val = {"bool": {"filter": {"term": {"text": "bar"}}}} + docsearch = OpenSearchVectorSearch.from_texts( + texts, + FakeEmbeddings(), + opensearch_url=DEFAULT_OPENSEARCH_URL, + is_appx_search=False, + ) + output = docsearch.similarity_search( + "foo", k=1, 
search_type=SCRIPT_SCORING_SEARCH, pre_filter=pre_filter_val + ) + assert output == [Document(page_content="bar")] + + +def test_add_text_script_scoring() -> None: + """Test adding additional text elements and validating using Script Scoring.""" + text_input = ["test", "add", "text", "method"] + metadatas = [{"page": i} for i in range(len(text_input))] + docsearch = OpenSearchVectorSearch.from_texts( + text_input, + FakeEmbeddings(), + opensearch_url=DEFAULT_OPENSEARCH_URL, + is_appx_search=False, + ) + OpenSearchVectorSearch.add_texts(docsearch, texts, metadatas) + output = docsearch.similarity_search( + "add", k=1, search_type=SCRIPT_SCORING_SEARCH, space_type="innerproduct" + ) + assert output == [Document(page_content="test")] + + +def test_opensearch_painless_scripting() -> None: + """Test end to end indexing and search using Painless Scripting Search.""" + pre_filter_val = {"bool": {"filter": {"term": {"text": "baz"}}}} + docsearch = OpenSearchVectorSearch.from_texts( + texts, + FakeEmbeddings(), + opensearch_url=DEFAULT_OPENSEARCH_URL, + is_appx_search=False, + ) + output = docsearch.similarity_search( + "foo", k=1, search_type=PAINLESS_SCRIPTING_SEARCH, pre_filter=pre_filter_val + ) + assert output == [Document(page_content="baz")] + + +def test_add_text_painless_scripting() -> None: + """Test adding additional text elements and validating using Painless Scripting.""" + text_input = ["test", "add", "text", "method"] + metadatas = [{"page": i} for i in range(len(text_input))] + docsearch = OpenSearchVectorSearch.from_texts( + text_input, + FakeEmbeddings(), + opensearch_url=DEFAULT_OPENSEARCH_URL, + is_appx_search=False, + ) + OpenSearchVectorSearch.add_texts(docsearch, texts, metadatas) + output = docsearch.similarity_search( + "add", k=1, search_type=PAINLESS_SCRIPTING_SEARCH, space_type="cosineSimilarity" + ) + assert output == [Document(page_content="test")] + + +def test_opensearch_invalid_search_type() -> None: + """Test to validate similarity_search 
by providing invalid search_type.""" + docsearch = OpenSearchVectorSearch.from_texts( + texts, FakeEmbeddings(), opensearch_url=DEFAULT_OPENSEARCH_URL + ) + with pytest.raises(ValueError): + docsearch.similarity_search("foo", k=1, search_type="invalid_search_type") + + +def test_opensearch_embedding_size_zero() -> None: + """Test to validate indexing when embedding size is zero.""" + with pytest.raises(RuntimeError): + OpenSearchVectorSearch.from_texts( + [], FakeEmbeddings(), opensearch_url=DEFAULT_OPENSEARCH_URL + )