From a61b7f7e7c76ae8667e40cd29cfe30a3868d7dd8 Mon Sep 17 00:00:00 2001
From: Paul-Emile Brotons <38215315+P-E-B@users.noreply.github.com>
Date: Tue, 30 May 2023 16:59:01 +0200
Subject: [PATCH] adding MongoDBAtlasVectorSearch (#5338)

# Add MongoDBAtlasVectorSearch for the Python library
Fixes #5337
---------
Co-authored-by: Dev 2049
---
 .../mongodb_atlas_vector_search.ipynb | 170 +++++++++++
 langchain/vectorstores/__init__.py | 2 +
 langchain/vectorstores/mongodb_atlas.py | 270 ++++++++++++++++++
 poetry.lock | 6 +-
 pyproject.toml | 4 +-
 tests/integration_tests/.env.example | 6 +-
 .../vectorstores/test_mongodb_atlas.py | 135 +++++++++
 7 files changed, 588 insertions(+), 5 deletions(-)
 create mode 100644 docs/modules/indexes/vectorstores/examples/mongodb_atlas_vector_search.ipynb
 create mode 100644 langchain/vectorstores/mongodb_atlas.py
 create mode 100644 tests/integration_tests/vectorstores/test_mongodb_atlas.py

diff --git a/docs/modules/indexes/vectorstores/examples/mongodb_atlas_vector_search.ipynb b/docs/modules/indexes/vectorstores/examples/mongodb_atlas_vector_search.ipynb
new file mode 100644
index 00000000..4af84211
--- /dev/null
+++ b/docs/modules/indexes/vectorstores/examples/mongodb_atlas_vector_search.ipynb
@@ -0,0 +1,170 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "683953b3",
+ "metadata": {},
+ "source": [
+ "# MongoDB Atlas Vector Search\n",
+ "\n",
+ ">[MongoDB Atlas](https://www.mongodb.com/docs/atlas/) is a document database managed in the cloud. It also offers Lucene-based search, including a vector search feature.\n",
+ "\n",
+ "This notebook shows how to use `MongoDB Atlas Vector Search`: you store your embeddings in MongoDB documents and create a Lucene vector index to perform a KNN search.\n",
+ "\n",
+ "It uses the [knnBeta Operator](https://www.mongodb.com/docs/atlas/atlas-search/knn-beta) available in MongoDB Atlas Search. This feature is in early access and available only for evaluation purposes, to validate functionality, and to gather feedback from a small closed group of early access users. It is not recommended for production deployments as we may introduce breaking changes.\n",
+ "\n",
+ "To use MongoDB Atlas, you must first deploy a cluster. Free clusters are available.\n",
+ "Here is the MongoDB Atlas [quick start](https://www.mongodb.com/docs/atlas/getting-started/)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4c41cad-08ef-4f72-a545-2151e4598efe",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "!pip install pymongo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1e38361-c1fe-4ac6-86e9-c90ebaf7ae87",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "MONGODB_ATLAS_URI = os.environ['MONGODB_ATLAS_URI']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "320af802-9271-46ee-948f-d2453933d44b",
+ "metadata": {},
+ "source": [
+ "We want to use `OpenAIEmbeddings`, so we need an OpenAI API key. Make sure the environment variable `OPENAI_API_KEY` is set before proceeding."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1f3ecc42",
+ "metadata": {},
+ "source": [
+ "Now, let's create a Lucene vector index on your cluster. In the example below, `embedding` is the name of the field that contains the embedding vector. Please refer to the [documentation](https://www.mongodb.com/docs/atlas/atlas-search/define-field-mappings-for-vector-search) for more details on how to define an Atlas Search index.\n",
+ "You can name the index `langchain_demo` and create it on the namespace `langchain_db.langchain_col`. Finally, write the following definition in the JSON editor:\n",
+ "\n",
+ "```json\n",
+ "{\n",
+ " \"mappings\": {\n",
+ " \"dynamic\": true,\n",
+ " \"fields\": {\n",
+ " \"embedding\": {\n",
+ " \"dimensions\": 1536,\n",
+ " \"similarity\": \"cosine\",\n",
+ " \"type\": \"knnVector\"\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ "}\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "aac9563e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+ "from langchain.text_splitter import CharacterTextSplitter\n",
+ "from langchain.vectorstores import MongoDBAtlasVectorSearch\n",
+ "from langchain.document_loaders import TextLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a3c3999a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = TextLoader('../../../state_of_the_union.txt')\n",
+ "documents = loader.load()\n",
+ "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
+ "docs = text_splitter.split_documents(documents)\n",
+ "\n",
+ "embeddings = OpenAIEmbeddings()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6e104aee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pymongo import MongoClient\n",
+ "\n",
+ "# initialize MongoDB python client\n",
+ "client = MongoClient(MONGODB_ATLAS_URI)\n",
+ "\n",
+ "db_name = \"langchain_db\"\n",
+ "collection_name = \"langchain_col\"\n",
+ "namespace = f\"{db_name}.{collection_name}\"\n",
+ "index_name = \"langchain_demo\"\n",
+ "\n",
+ "# insert the documents in MongoDB Atlas with their embedding\n",
+ "docsearch = MongoDBAtlasVectorSearch.from_documents(\n",
+ " docs,\n",
+ " embeddings,\n",
+ " client=client,\n",
+ " namespace=namespace,\n",
+ " index_name=index_name\n",
+ ")\n",
+ "\n",
+ "# perform a similarity search between the embedding of the query and the embeddings of the documents\n",
+ "query = \"What did the president say about Ketanji Brown Jackson\"\n",
+ "docs = docsearch.similarity_search(query)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9c608226",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(docs[0].page_content)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py
index 23fe02ac..cef7d9fb 100644
--- a/langchain/vectorstores/__init__.py
+++ b/langchain/vectorstores/__init__.py
@@ -10,6 +10,7 @@ from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
 from langchain.vectorstores.faiss import FAISS
 from langchain.vectorstores.lancedb import LanceDB
 from langchain.vectorstores.milvus import Milvus
+from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
 from langchain.vectorstores.myscale import MyScale, MyScaleSettings
 from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch
 from langchain.vectorstores.pinecone import Pinecone
@@ -38,6 +39,7 @@ __all__ = [
     "AtlasDB",
     "DeepLake",
     "Annoy",
+    "MongoDBAtlasVectorSearch",
     "MyScale",
     "MyScaleSettings",
     "SKLearnVectorStore",
diff --git a/langchain/vectorstores/mongodb_atlas.py b/langchain/vectorstores/mongodb_atlas.py
new file mode 100644
index 00000000..3bd9abc3
--- /dev/null
+++ b/langchain/vectorstores/mongodb_atlas.py
@@ -0,0 +1,270 @@
+from __future__ import annotations
+
+import logging
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from langchain.docstore.document import Document
+from langchain.embeddings.base import Embeddings
+from langchain.vectorstores.base import VectorStore
+
+if TYPE_CHECKING:
+    from pymongo import MongoClient
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_INSERT_BATCH_SIZE = 100
+
+
+class MongoDBAtlasVectorSearch(VectorStore):
+    """Wrapper around MongoDB Atlas Vector Search.
+
+    To use, you should have both:
+    - the ``pymongo`` python package installed
+    - a connection string associated with a MongoDB Atlas Cluster having deployed an
+      Atlas Search index
+
+    Example:
+        .. code-block:: python
+
+            from langchain.vectorstores import MongoDBAtlasVectorSearch
+            from langchain.embeddings.openai import OpenAIEmbeddings
+            from pymongo import MongoClient
+
+            mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
+            namespace = "<db_name>.<collection_name>"
+            embeddings = OpenAIEmbeddings()
+            vectorstore = MongoDBAtlasVectorSearch(mongo_client, namespace, embeddings)
+    """
+
+    def __init__(
+        self,
+        client: MongoClient,
+        namespace: str,
+        embedding: Embeddings,
+        *,
+        index_name: str = "default",
+        text_key: str = "text",
+        embedding_key: str = "embedding",
+    ):
+        """
+        Args:
+            client: MongoDB client.
+            namespace: MongoDB namespace to add the texts to.
+            embedding: Text embedding model to use.
+            index_name: Name of the Atlas Search index to query.
+            text_key: MongoDB field that will contain the text for each
+                document.
+            embedding_key: MongoDB field that will contain the embedding for
+                each document.
+        """
+        self._client = client
+        db_name, collection_name = namespace.split(".")
+        self._collection = client[db_name][collection_name]
+        self._embedding = embedding
+        self._index_name = index_name
+        self._text_key = text_key
+        self._embedding_key = embedding_key
+
+    @classmethod
+    def from_connection_string(
+        cls,
+        connection_string: str,
+        namespace: str,
+        embedding: Embeddings,
+        **kwargs: Any,
+    ) -> MongoDBAtlasVectorSearch:
+        try:
+            from pymongo import MongoClient
+        except ImportError:
+            raise ImportError(
+                "Could not import pymongo, please install it with "
+                "`pip install pymongo`."
+            )
+        client: MongoClient = MongoClient(connection_string)
+        return cls(client, namespace, embedding, **kwargs)
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[Dict[str, Any]]] = None,
+        **kwargs: Any,
+    ) -> List:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+ """ + batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE) + _metadatas: Union[List, Generator] = metadatas or ({} for _ in texts) + texts_batch = [] + metadatas_batch = [] + result_ids = [] + for i, (text, metadata) in enumerate(zip(texts, _metadatas)): + texts_batch.append(text) + metadatas_batch.append(metadata) + if (i + 1) % batch_size == 0: + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + texts_batch = [] + metadatas_batch = [] + if texts_batch: + result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) + return result_ids + + def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List: + if not texts: + return [] + # Embed and create the documents + embeddings = self._embedding.embed_documents(texts) + to_insert = [ + {self._text_key: t, self._embedding_key: embedding, **m} + for t, m, embedding in zip(texts, metadatas, embeddings) + ] + # insert the documents in MongoDB Atlas + insert_result = self._collection.insert_many(to_insert) + return insert_result.inserted_ids + + def similarity_search_with_score( + self, + query: str, + *, + k: int = 4, + pre_filter: Optional[dict] = None, + post_filter_pipeline: Optional[List[Dict]] = None, + ) -> List[Tuple[Document, float]]: + """Return MongoDB documents most similar to query, along with scores. + + Use the knnBeta Operator available in MongoDB Atlas Search + This feature is in early access and available only for evaluation purposes, to + validate functionality, and to gather feedback from a small closed group of + early access users. It is not recommended for production deployments as we + may introduce breaking changes. + For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta + + Args: + query: Text to look up documents similar to. + k: Optional Number of Documents to return. Defaults to 4. + pre_filter: Optional Dictionary of argument(s) to prefilter on document + fields. + post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages + following the knnBeta search. + + Returns: + List of Documents most similar to the query and score for each + """ + knn_beta = { + "vector": self._embedding.embed_query(query), + "path": self._embedding_key, + "k": k, + } + if pre_filter: + knn_beta["filter"] = pre_filter + pipeline = [ + { + "$search": { + "index": self._index_name, + "knnBeta": knn_beta, + } + }, + {"$project": {"score": {"$meta": "searchScore"}, self._embedding_key: 0}}, + ] + if post_filter_pipeline is not None: + pipeline.extend(post_filter_pipeline) + cursor = self._collection.aggregate(pipeline) + docs = [] + for res in cursor: + text = res.pop(self._text_key) + score = res.pop("score") + docs.append((Document(page_content=text, metadata=res), score)) + return docs + + def similarity_search( + self, + query: str, + k: int = 4, + pre_filter: Optional[dict] = None, + post_filter_pipeline: Optional[List[Dict]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return MongoDB documents most similar to query. + + Use the knnBeta Operator available in MongoDB Atlas Search + This feature is in early access and available only for evaluation purposes, to + validate functionality, and to gather feedback from a small closed group of + early access users. It is not recommended for production deployments as we may + introduce breaking changes. + For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta + + Args: + query: Text to look up documents similar to. + k: Optional Number of Documents to return. Defaults to 4. 
+            pre_filter: Optional dictionary of argument(s) to prefilter document
+                fields on.
+            post_filter_pipeline: Optional pipeline of MongoDB aggregation stages
+                applied after the knnBeta search.
+
+        Returns:
+            List of documents most similar to the query.
+        """
+        docs_and_scores = self.similarity_search_with_score(
+            query,
+            k=k,
+            pre_filter=pre_filter,
+            post_filter_pipeline=post_filter_pipeline,
+        )
+        return [doc for doc, _ in docs_and_scores]
+
+    @classmethod
+    def from_texts(
+        cls,
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        client: Optional[MongoClient] = None,
+        namespace: Optional[str] = None,
+        **kwargs: Any,
+    ) -> MongoDBAtlasVectorSearch:
+        """Construct MongoDBAtlasVectorSearch wrapper from raw documents.
+
+        This is a user-friendly interface that:
+            1. Embeds documents.
+            2. Adds the documents to a provided MongoDB Atlas Vector Search index
+               (Lucene).
+
+        This is intended to be a quick way to get started.
+
+        Example:
+            .. code-block:: python
+
+                from pymongo import MongoClient
+
+                from langchain.vectorstores import MongoDBAtlasVectorSearch
+                from langchain.embeddings import OpenAIEmbeddings
+
+                client = MongoClient("<YOUR-CONNECTION-STRING>")
+                namespace = "<db_name>.<collection_name>"
+                embeddings = OpenAIEmbeddings()
+                vectorstore = MongoDBAtlasVectorSearch.from_texts(
+                    texts,
+                    embeddings,
+                    metadatas=metadatas,
+                    client=client,
+                    namespace=namespace
+                )
+        """
+        if not client or not namespace:
+            raise ValueError("Must provide 'client' and 'namespace' named parameters.")
+        vecstore = cls(client, namespace, embedding, **kwargs)
+        vecstore.add_texts(texts, metadatas=metadatas)
+        return vecstore
diff --git a/poetry.lock b/poetry.lock
index 633bc6cd..0887bcee 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -6965,7 +6965,7 @@ tests = ["duckdb", "polars[pandas,pyarrow]", "pytest"]
 name = "pymongo"
 version = "4.3.3"
 description = "Python driver for MongoDB "
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -10948,7 +10948,7 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]

 [extras]
-all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "lxml", "manifest-ml", "momento", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "requests-toolbelt", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
+all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "lxml",
"manifest-ml", "momento", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "requests-toolbelt", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] docarray = ["docarray"] @@ -10962,4 +10962,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "b3dc23f376de141d22b729d038144a1e6d66983a910160c3500fe0d79f8e5917" +content-hash = "937d2f0165f6aa381ea1e26002272a92b189ab18607bd05895e36d23f56978f4" diff --git a/pyproject.toml b/pyproject.toml index 950a614c..d2ff1a40 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ jinja2 = {version = "^3", optional = true} tiktoken = {version = "^0.3.2", optional = true, python="^3.9"} pinecone-client = {version = "^2", optional = true} pinecone-text = {version = "^0.4.2", optional = true} +pymongo = {version = "^4.3.3", optional = true} clickhouse-connect = {version="^0.5.14", optional=true} weaviate-client = {version = "^3", optional = true} google-api-python-client = {version = "2.70.0", optional = true} @@ -159,6 +160,7 @@ elasticsearch = {extras = ["async"], version = "^8.6.2"} redis = "^4.5.4" pinecone-client = "^2.2.1" pinecone-text = "^0.4.2" +pymongo = "^4.3.3" clickhouse-connect = "^0.5.14" pgvector = "^0.1.6" transformers = "^4.27.4" @@ -174,7 +176,6 @@ gptcache = "^0.1.9" promptlayer = "^0.1.80" tair = "^1.3.3" wikipedia = "^1" -pymongo = "^4.3.3" cassandra-driver = "^3.27.0" arxiv = "^1.4" mastodon-py = "^1.8.1" @@ -234,6 +235,7 @@ all = [ "jinja2", "pinecone-client", "pinecone-text", + "pymongo", "weaviate-client", "redis", "google-api-python-client", diff --git a/tests/integration_tests/.env.example b/tests/integration_tests/.env.example index 64153788..1e79818d 100644 --- a/tests/integration_tests/.env.example +++ b/tests/integration_tests/.env.example @@ -22,4 +22,8 @@ PINECONE_ENVIRONMENT=us-west4-gcp # details here https://learn.microsoft.com/en-us/dotnet/api/azure.identity.defaultazurecredential?view=azure-dotnet POWERBI_DATASET_ID=_powerbi_dataset_id_here POWERBI_TABLE_NAME=_test_table_name_here -POWERBI_NUMROWS=_num_rows_in_your_test_table \ No newline at end of file +POWERBI_NUMROWS=_num_rows_in_your_test_table + + +# MongoDB Atlas Vector Search +MONGODB_ATLAS_URI=your_mongodb_atlas_connection_string \ No newline at end of file diff --git a/tests/integration_tests/vectorstores/test_mongodb_atlas.py b/tests/integration_tests/vectorstores/test_mongodb_atlas.py new file mode 100644 index 00000000..715b5a30 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_mongodb_atlas.py @@ -0,0 +1,135 @@ +"""Test MongoDB Atlas Vector Search functionality.""" +from __future__ import annotations + +import os +from time import sleep +from typing import TYPE_CHECKING, Optional + +import pytest + +from langchain.docstore.document import Document +from langchain.embeddings.base import Embeddings +from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch + +if TYPE_CHECKING: + from pymongo import MongoClient + +INDEX_NAME = "langchain-test-index" +NAMESPACE = 
"langchain_test_db.langchain_test_collection" +CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI") +DB_NAME, COLLECTION_NAME = NAMESPACE.split(".") + + +def get_test_client() -> Optional[MongoClient]: + try: + from pymongo import MongoClient + + client: MongoClient = MongoClient(CONNECTION_STRING) + return client + except: # noqa: E722 + return None + + +# Instantiate as constant instead of pytest fixture to prevent needing to make multiple +# connections. +TEST_CLIENT = get_test_client() + + +class TestMongoDBAtlasVectorSearch: + @classmethod + def setup_class(cls) -> None: + # insure the test collection is empty + assert TEST_CLIENT[DB_NAME][COLLECTION_NAME].count_documents({}) == 0 # type: ignore[index] # noqa: E501 + + @classmethod + def teardown_class(cls) -> None: + # delete all the documents in the collection + TEST_CLIENT[DB_NAME][COLLECTION_NAME].delete_many({}) # type: ignore[index] + + @pytest.fixture(autouse=True) + def setup(self) -> None: + # delete all the documents in the collection + TEST_CLIENT[DB_NAME][COLLECTION_NAME].delete_many({}) # type: ignore[index] + + def test_from_documents(self, embedding_openai: Embeddings) -> None: + """Test end to end construction and search.""" + documents = [ + Document(page_content="Dogs are tough.", metadata={"a": 1}), + Document(page_content="Cats have fluff.", metadata={"b": 1}), + Document(page_content="What is a sandwich?", metadata={"c": 1}), + Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}), + ] + vectorstore = MongoDBAtlasVectorSearch.from_documents( + documents, + embedding_openai, + client=TEST_CLIENT, + namespace=NAMESPACE, + index_name=INDEX_NAME, + ) + sleep(1) # waits for mongot to update Lucene's index + output = vectorstore.similarity_search("Sandwich", k=1) + assert output[0].page_content == "What is a sandwich?" + assert output[0].metadata["c"] == 1 + + def test_from_texts(self, embedding_openai: Embeddings) -> None: + texts = [ + "Dogs are tough.", + "Cats have fluff.", + "What is a sandwich?", + "That fence is purple.", + ] + vectorstore = MongoDBAtlasVectorSearch.from_texts( + texts, + embedding_openai, + client=TEST_CLIENT, + namespace=NAMESPACE, + index_name=INDEX_NAME, + ) + sleep(1) # waits for mongot to update Lucene's index + output = vectorstore.similarity_search("Sandwich", k=1) + assert output[0].page_content == "What is a sandwich?" + + def test_from_texts_with_metadatas(self, embedding_openai: Embeddings) -> None: + texts = [ + "Dogs are tough.", + "Cats have fluff.", + "What is a sandwich?", + "The fence is purple.", + ] + metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}] + vectorstore = MongoDBAtlasVectorSearch.from_texts( + texts, + embedding_openai, + metadatas=metadatas, + client=TEST_CLIENT, + namespace=NAMESPACE, + index_name=INDEX_NAME, + ) + sleep(1) # waits for mongot to update Lucene's index + output = vectorstore.similarity_search("Sandwich", k=1) + assert output[0].page_content == "What is a sandwich?" 
+ assert output[0].metadata["c"] == 1 + + def test_from_texts_with_metadatas_and_pre_filter( + self, embedding_openai: Embeddings + ) -> None: + texts = [ + "Dogs are tough.", + "Cats have fluff.", + "What is a sandwich?", + "The fence is purple.", + ] + metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}] + vectorstore = MongoDBAtlasVectorSearch.from_texts( + texts, + embedding_openai, + metadatas=metadatas, + client=TEST_CLIENT, + namespace=NAMESPACE, + index_name=INDEX_NAME, + ) + sleep(1) # waits for mongot to update Lucene's index + output = vectorstore.similarity_search( + "Sandwich", k=1, pre_filter={"range": {"lte": 0, "path": "c"}} + ) + assert output == []
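
## Usage sketches (not part of the diff)

A minimal sketch of the `from_connection_string` constructor this patch adds, reusing the environment variable and the namespace/index names from the notebook above:

```python
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch

# Assumes MONGODB_ATLAS_URI and OPENAI_API_KEY are set, as in the notebook.
vectorstore = MongoDBAtlasVectorSearch.from_connection_string(
    os.environ["MONGODB_ATLAS_URI"],
    "langchain_db.langchain_col",  # "<db_name>.<collection_name>" namespace
    OpenAIEmbeddings(),
    index_name="langchain_demo",
)
```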
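The two filter hooks on `similarity_search_with_score` compose differently: `pre_filter` is placed inside the `knnBeta` clause, so it must use Atlas Search operator syntax (e.g. `range`, as in the integration tests above), while `post_filter_pipeline` stages are appended to the aggregation pipeline after `$search`. A sketch, where the `year` metadata field is hypothetical:

```python
docs_and_scores = vectorstore.similarity_search_with_score(
    "What did the president say about Ketanji Brown Jackson",
    k=4,
    # Runs inside knnBeta as its `filter` clause (Atlas Search syntax, not plain MQL).
    pre_filter={"range": {"gte": 2020, "path": "year"}},
    # Plain aggregation stages, run after the $search stage.
    post_filter_pipeline=[{"$limit": 2}],
)
for doc, score in docs_and_scores:
    print(score, doc.page_content)
```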
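`add_texts` embeds and inserts documents in chunks of `DEFAULT_INSERT_BATCH_SIZE` (100). Because the implementation reads `kwargs.get("batch_size", ...)`, the batch size can be overridden per call; the returned ids are the `ObjectId`s produced by `insert_many`:

```python
ids = vectorstore.add_texts(
    ["Dogs are tough.", "Cats have fluff."],
    metadatas=[{"a": 1}, {"b": 1}],
    batch_size=50,  # optional override of the default 100
)
print(ids)
```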