langchain/libs/partners/mongodb/tests/unit_tests/test_vectorstores.py
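"""Unit tests for MongoDBAtlasVectorSearch.

These tests run entirely against the in-memory MockCollection and
ConsistentFakeEmbeddings helpers from tests.utils, so no Atlas cluster or
network access is required.
"""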

from typing import Any, Optional

import pytest
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from pymongo.collection import Collection
from langchain_mongodb import MongoDBAtlasVectorSearch

from tests.utils import ConsistentFakeEmbeddings, MockCollection

INDEX_NAME = "langchain-test-index"
NAMESPACE = "langchain_test_db.langchain_test_collection"
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")


def get_collection() -> MockCollection:
    return MockCollection()


@pytest.fixture()
def collection() -> MockCollection:
    return get_collection()


@pytest.fixture(scope="module")
def embedding_openai() -> Embeddings:
    return ConsistentFakeEmbeddings()


def test_initialization(collection: Collection, embedding_openai: Embeddings) -> None:
    """Test initialization of vector store class"""
    assert MongoDBAtlasVectorSearch(collection, embedding_openai)


def test_init_from_texts(collection: Collection, embedding_openai: Embeddings) -> None:
    """Test from_texts operation on an empty list"""
    assert MongoDBAtlasVectorSearch.from_texts(
        [], embedding_openai, collection=collection
    )
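

# Grouped tests below construct the vector store via from_documents / from_texts and
# exercise similarity search, metadata handling, pre-filtering, and MMR on the mock.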
class TestMongoDBAtlasVectorSearch:
    @classmethod
    def setup_class(cls) -> None:
        # ensure the test collection is empty
        collection = get_collection()
        assert collection.count_documents({}) == 0  # type: ignore[index]  # noqa: E501

    @classmethod
    def teardown_class(cls) -> None:
        collection = get_collection()
        # delete all the documents in the collection
        collection.delete_many({})  # type: ignore[index]

    @pytest.fixture(autouse=True)
    def setup(self) -> None:
        collection = get_collection()
        # delete all the documents in the collection
        collection.delete_many({})  # type: ignore[index]
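
    # Helper: pre-load the mock collection's ``_aggregate_result`` with the documents
    # whose text contains ``search_term`` so similarity_search returns a predictable hit.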
    def _validate_search(
        self,
        vectorstore: MongoDBAtlasVectorSearch,
        collection: MockCollection,
        search_term: str = "sandwich",
        page_content: str = "What is a sandwich?",
        metadata: Optional[Any] = 1,
    ) -> None:
        collection._aggregate_result = list(
            filter(
                lambda x: search_term.lower() in x[vectorstore._text_key].lower(),
                collection._data,
            )
        )
        output = vectorstore.similarity_search("", k=1)
        assert output[0].page_content == page_content
        assert output[0].metadata.get("c") == metadata

    def test_from_documents(
        self, embedding_openai: Embeddings, collection: MockCollection
    ) -> None:
        """Test end to end construction and search."""
        documents = [
            Document(page_content="Dogs are tough.", metadata={"a": 1}),
            Document(page_content="Cats have fluff.", metadata={"b": 1}),
            Document(page_content="What is a sandwich?", metadata={"c": 1}),
            Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
        ]
        vectorstore = MongoDBAtlasVectorSearch.from_documents(
            documents,
            embedding_openai,
            collection=collection,
            index_name=INDEX_NAME,
        )
        self._validate_search(
            vectorstore, collection, metadata=documents[2].metadata["c"]
        )

    def test_from_texts(
        self, embedding_openai: Embeddings, collection: MockCollection
    ) -> None:
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "That fence is purple.",
        ]
        vectorstore = MongoDBAtlasVectorSearch.from_texts(
            texts,
            embedding_openai,
            collection=collection,
            index_name=INDEX_NAME,
        )
        self._validate_search(vectorstore, collection, metadata=None)

    def test_from_texts_with_metadatas(
        self, embedding_openai: Embeddings, collection: MockCollection
    ) -> None:
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "The fence is purple.",
        ]
        metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
        vectorstore = MongoDBAtlasVectorSearch.from_texts(
            texts,
            embedding_openai,
            metadatas=metadatas,
            collection=collection,
            index_name=INDEX_NAME,
        )
        self._validate_search(vectorstore, collection, metadata=metadatas[2]["c"])
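
    # A pre_filter on ``c <= 0`` matches nothing (the only ``c`` value is 1), so the
    # mock's aggregate result is empty and the search should return no documents.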
    def test_from_texts_with_metadatas_and_pre_filter(
        self, embedding_openai: Embeddings, collection: MockCollection
    ) -> None:
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
            "What is a sandwich?",
            "The fence is purple.",
        ]
        metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
        vectorstore = MongoDBAtlasVectorSearch.from_texts(
            texts,
            embedding_openai,
            metadatas=metadatas,
            collection=collection,
            index_name=INDEX_NAME,
        )
        collection._aggregate_result = list(
            filter(
                lambda x: "sandwich" in x[vectorstore._text_key].lower()
                and x.get("c") < 0,
                collection._data,
            )
        )
        output = vectorstore.similarity_search(
            "Sandwich", k=1, pre_filter={"range": {"lte": 0, "path": "c"}}
        )
        assert output == []
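
    # MMR should surface all four texts and pick a diverse second result, i.e. one of
    # the near-duplicates ("fou"/"foy") rather than "foo" again.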
    def test_mmr(
        self, embedding_openai: Embeddings, collection: MockCollection
    ) -> None:
        texts = ["foo", "foo", "fou", "foy"]
        vectorstore = MongoDBAtlasVectorSearch.from_texts(
            texts,
            embedding_openai,
            collection=collection,
            index_name=INDEX_NAME,
        )
        query = "foo"
        self._validate_search(
            vectorstore,
            collection,
            search_term=query[0:2],
            page_content=query,
            metadata=None,
        )
        output = vectorstore.max_marginal_relevance_search(query, k=10, lambda_mult=0.1)
        assert len(output) == len(texts)
        assert output[0].page_content == "foo"
        assert output[1].page_content != "foo"