langchain/libs/community/tests/unit_tests/retrievers/test_ensemble.py

import pytest
from langchain.retrievers.ensemble import EnsembleRetriever
from langchain_core.documents import Document
from langchain_core.embeddings import FakeEmbeddings


@pytest.mark.requires("rank_bm25")
def test_ensemble_retriever_get_relevant_docs() -> None:
    """Query an ensemble built from the same BM25 retriever listed twice.

    The retriever is capped at ``k = 1`` and the test expects the ensemble
    to hand back exactly one document for the query.
    """
    # Imported inside the test, presumably so the module can still be
    # collected when the optional ``rank_bm25`` dependency is missing.
    from langchain_community.retrievers import BM25Retriever

    corpus = ["I like apples", "I like oranges", "Apples and oranges are fruits"]
    bm25 = BM25Retriever.from_texts(corpus)
    bm25.k = 1

    # The very same retriever instance fills both ensemble slots.
    ensemble = EnsembleRetriever(retrievers=[bm25] * 2)  # type: ignore[call-arg]

    hits = ensemble.invoke("I like apples")
    assert len(hits) == 1


@pytest.mark.requires("rank_bm25")
def test_weighted_reciprocal_rank() -> None:
    """Check how ``weighted_reciprocal_rank`` orders two opposing rankings.

    Two hand-built rankings list the same two documents in opposite order.
    The test expects the document placed first by the heavier-weighted
    ranking to come out on top, and the order to flip once the weights are
    swapped.
    """
    from langchain_community.retrievers import BM25Retriever

    first_doc, second_doc = (Document(page_content=text) for text in ("1", "2"))

    # The retriever is never queried in this test; the fusion method is
    # called directly on the rankings built below.
    bm25 = BM25Retriever.from_texts(["1", "2"])
    ensemble = EnsembleRetriever(retrievers=[bm25, bm25], weights=[0.4, 0.5], c=0)

    # Second ranking carries the larger weight (0.5 vs 0.4).
    fused = ensemble.weighted_reciprocal_rank(
        [[first_doc, second_doc], [second_doc, first_doc]]
    )
    assert (fused[0].page_content, fused[1].page_content) == ("2", "1")

    # Swap the weights so the first ranking now dominates.
    ensemble.weights = [0.5, 0.4]
    fused = ensemble.weighted_reciprocal_rank(
        [[first_doc, second_doc], [second_doc, first_doc]]
    )
    assert (fused[0].page_content, fused[1].page_content) == ("1", "2")


@pytest.mark.requires("rank_bm25", "sklearn")
def test_ensemble_retriever_get_relevant_docs_with_multiple_retrievers() -> None:
    """Query an ensemble that mixes BM25, TF-IDF and KNN retrievers.

    Each retriever indexes its own three-sentence corpus and is limited to
    ``k = 1``; the test expects the ensemble to return three documents.
    """
    from langchain_community.retrievers import (
        BM25Retriever,
        KNNRetriever,
        TFIDFRetriever,
    )

    # One disjoint corpus per retriever.
    apple_texts = [
        "I like apples",
        "I like oranges",
        "Apples and oranges are fruits",
    ]
    melon_texts = [
        "I like melons",
        "I like pineapples",
        "Melons and pineapples are fruits",
    ]
    avocado_texts = [
        "I like avocados",
        "I like strawberries",
        "Avocados and strawberries are fruits",
    ]

    bm25 = BM25Retriever.from_texts(apple_texts)
    tfidf = TFIDFRetriever.from_texts(texts=melon_texts)
    fake_embeddings = FakeEmbeddings(size=100)
    knn = KNNRetriever.from_texts(texts=avocado_texts, embeddings=fake_embeddings)

    # Limit every retriever to a single hit.
    bm25.k = 1
    tfidf.k = 1
    knn.k = 1

    ensemble = EnsembleRetriever(
        retrievers=[bm25, tfidf, knn],
        weights=[0.6, 0.3, 0.1],
    )
    hits = ensemble.invoke("I like apples")
    assert len(hits) == 3
add Hybrid retriever that not require any external service (#8108) - Until now, hybrid search was limited to modules requiring external services, such as Weaviate/Pinecone Hybrid Search. However, I have developed a hybrid retriever that can merge a list of retrievers using the [Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) algorithm. This new approach, similar to Weaviate hybrid search, does not require the initialization of any external service. - Dependencies: No - Twitter handle: dayuanjian21687 --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-07-25 02:16:10 +00:00			`import pytest`
langchain[patch],community[patch]: Move unit tests that depend on community to community (#21685) 2024-05-16 21:24:27 +00:00			`from langchain.retrievers.ensemble import EnsembleRetriever`
REFACTOR: Refactor langchain_core (#13627) Changes: - remove langchain_core/schema since no clear distinction b/n schema and non-schema modules - make every module that doesn't end in -y plural - where easy have 1-2 classes per file - no more than one level of nesting in directories - only import from top level core modules in langchain 2023-11-21 16:35:29 +00:00			`from langchain_core.documents import Document`
multiple: langchain 0.2 in master (#21191) 0.2rc migrations - [x] Move memory - [x] Move remaining retrievers - [x] graph_qa chains - [x] some dependency from evaluation code potentially on math utils - [x] Move openapi chain from `langchain.chains.api.openapi` to `langchain_community.chains.openapi` - [x] Migrate `langchain.chains.ernie_functions` to `langchain_community.chains.ernie_functions` - [x] migrate `langchain/chains/llm_requests.py` to `langchain_community.chains.llm_requests` - [x] Moving `langchain_community.cross_enoders.base:BaseCrossEncoder` -> `langchain_community.retrievers.document_compressors.cross_encoder:BaseCrossEncoder` (namespace not ideal, but it needs to be moved to `langchain` to avoid circular deps) - [x] unit tests langchain -- add pytest.mark.community to some unit tests that will stay in langchain - [x] unit tests community -- move unit tests that depend on community to community - [x] mv integration tests that depend on community to community - [x] mypy checks Other todo - [x] Make deprecation warnings not noisy (need to use warn deprecated and check that things are implemented properly) - [x] Update deprecation messages with timeline for code removal (likely we actually won't be removing things until 0.4 release) -- will give people more time to transition their code. - [ ] Add information to deprecation warning to show users how to migrate their code base using langchain-cli - [ ] Remove any unnecessary requirements in langchain (e.g., is SQLALchemy required?) --------- Co-authored-by: Erick Friis <erick@langchain.dev> 2024-05-08 20:46:52 +00:00			`from langchain_core.embeddings import FakeEmbeddings`
add Hybrid retriever that not require any external service (#8108) - Until now, hybrid search was limited to modules requiring external services, such as Weaviate/Pinecone Hybrid Search. However, I have developed a hybrid retriever that can merge a list of retrievers using the [Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) algorithm. This new approach, similar to Weaviate hybrid search, does not require the initialization of any external service. - Dependencies: No - Twitter handle: dayuanjian21687 --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-07-25 02:16:10 +00:00

			`@pytest.mark.requires("rank_bm25")`
			`def test_ensemble_retriever_get_relevant_docs() -> None:`
			`doc_list = [`
			`"I like apples",`
			`"I like oranges",`
			`"Apples and oranges are fruits",`
			`]`

multiple: langchain 0.2 in master (#21191) 0.2rc migrations - [x] Move memory - [x] Move remaining retrievers - [x] graph_qa chains - [x] some dependency from evaluation code potentially on math utils - [x] Move openapi chain from `langchain.chains.api.openapi` to `langchain_community.chains.openapi` - [x] Migrate `langchain.chains.ernie_functions` to `langchain_community.chains.ernie_functions` - [x] migrate `langchain/chains/llm_requests.py` to `langchain_community.chains.llm_requests` - [x] Moving `langchain_community.cross_enoders.base:BaseCrossEncoder` -> `langchain_community.retrievers.document_compressors.cross_encoder:BaseCrossEncoder` (namespace not ideal, but it needs to be moved to `langchain` to avoid circular deps) - [x] unit tests langchain -- add pytest.mark.community to some unit tests that will stay in langchain - [x] unit tests community -- move unit tests that depend on community to community - [x] mv integration tests that depend on community to community - [x] mypy checks Other todo - [x] Make deprecation warnings not noisy (need to use warn deprecated and check that things are implemented properly) - [x] Update deprecation messages with timeline for code removal (likely we actually won't be removing things until 0.4 release) -- will give people more time to transition their code. - [ ] Add information to deprecation warning to show users how to migrate their code base using langchain-cli - [ ] Remove any unnecessary requirements in langchain (e.g., is SQLALchemy required?) --------- Co-authored-by: Erick Friis <erick@langchain.dev> 2024-05-08 20:46:52 +00:00			`from langchain_community.retrievers import BM25Retriever`

add Hybrid retriever that not require any external service (#8108) - Until now, hybrid search was limited to modules requiring external services, such as Weaviate/Pinecone Hybrid Search. However, I have developed a hybrid retriever that can merge a list of retrievers using the [Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) algorithm. This new approach, similar to Weaviate hybrid search, does not require the initialization of any external service. - Dependencies: No - Twitter handle: dayuanjian21687 --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-07-25 02:16:10 +00:00			`dummy_retriever = BM25Retriever.from_texts(doc_list)`
			`dummy_retriever.k = 1`

langchain: upgrade mypy (#19163) Update mypy in langchain 2024-03-15 20:37:09 +00:00			`ensemble_retriever = EnsembleRetriever( # type: ignore[call-arg]`
add Hybrid retriever that not require any external service (#8108) - Until now, hybrid search was limited to modules requiring external services, such as Weaviate/Pinecone Hybrid Search. However, I have developed a hybrid retriever that can merge a list of retrievers using the [Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) algorithm. This new approach, similar to Weaviate hybrid search, does not require the initialization of any external service. - Dependencies: No - Twitter handle: dayuanjian21687 --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-07-25 02:16:10 +00:00			`retrievers=[dummy_retriever, dummy_retriever]`
			`)`
patch: deprecate (a)get_relevant_documents (#20477) - `.get_relevant_documents(query)` -> `.invoke(query)` - `.get_relevant_documents(query=query)` -> `.invoke(query)` - `.get_relevant_documents(query, callbacks=callbacks)` -> `.invoke(query, config={"callbacks": callbacks})` - `.get_relevant_documents(query, kwargs)` -> `.invoke(query, kwargs)` --------- Co-authored-by: Erick Friis <erick@langchain.dev> 2024-04-22 15:14:53 +00:00			`docs = ensemble_retriever.invoke("I like apples")`
add Hybrid retriever that not require any external service (#8108) - Until now, hybrid search was limited to modules requiring external services, such as Weaviate/Pinecone Hybrid Search. However, I have developed a hybrid retriever that can merge a list of retrievers using the [Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) algorithm. This new approach, similar to Weaviate hybrid search, does not require the initialization of any external service. - Dependencies: No - Twitter handle: dayuanjian21687 --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-07-25 02:16:10 +00:00			`assert len(docs) == 1`


			`@pytest.mark.requires("rank_bm25")`
			`def test_weighted_reciprocal_rank() -> None:`
			`doc1 = Document(page_content="1")`
			`doc2 = Document(page_content="2")`

multiple: langchain 0.2 in master (#21191) 0.2rc migrations - [x] Move memory - [x] Move remaining retrievers - [x] graph_qa chains - [x] some dependency from evaluation code potentially on math utils - [x] Move openapi chain from `langchain.chains.api.openapi` to `langchain_community.chains.openapi` - [x] Migrate `langchain.chains.ernie_functions` to `langchain_community.chains.ernie_functions` - [x] migrate `langchain/chains/llm_requests.py` to `langchain_community.chains.llm_requests` - [x] Moving `langchain_community.cross_enoders.base:BaseCrossEncoder` -> `langchain_community.retrievers.document_compressors.cross_encoder:BaseCrossEncoder` (namespace not ideal, but it needs to be moved to `langchain` to avoid circular deps) - [x] unit tests langchain -- add pytest.mark.community to some unit tests that will stay in langchain - [x] unit tests community -- move unit tests that depend on community to community - [x] mv integration tests that depend on community to community - [x] mypy checks Other todo - [x] Make deprecation warnings not noisy (need to use warn deprecated and check that things are implemented properly) - [x] Update deprecation messages with timeline for code removal (likely we actually won't be removing things until 0.4 release) -- will give people more time to transition their code. - [ ] Add information to deprecation warning to show users how to migrate their code base using langchain-cli - [ ] Remove any unnecessary requirements in langchain (e.g., is SQLALchemy required?) --------- Co-authored-by: Erick Friis <erick@langchain.dev> 2024-05-08 20:46:52 +00:00			`from langchain_community.retrievers import BM25Retriever`

add Hybrid retriever that not require any external service (#8108) - Until now, hybrid search was limited to modules requiring external services, such as Weaviate/Pinecone Hybrid Search. However, I have developed a hybrid retriever that can merge a list of retrievers using the [Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) algorithm. This new approach, similar to Weaviate hybrid search, does not require the initialization of any external service. - Dependencies: No - Twitter handle: dayuanjian21687 --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-07-25 02:16:10 +00:00			`dummy_retriever = BM25Retriever.from_texts(["1", "2"])`
			`ensemble_retriever = EnsembleRetriever(`
			`retrievers=[dummy_retriever, dummy_retriever], weights=[0.4, 0.5], c=0`
			`)`
			`result = ensemble_retriever.weighted_reciprocal_rank([[doc1, doc2], [doc2, doc1]])`
			`assert result[0].page_content == "2"`
			`assert result[1].page_content == "1"`

			`ensemble_retriever.weights = [0.5, 0.4]`
			`result = ensemble_retriever.weighted_reciprocal_rank([[doc1, doc2], [doc2, doc1]])`
			`assert result[0].page_content == "1"`
			`assert result[1].page_content == "2"`
infra: add test for ensemble retriever to ensure multiple retrievers (#8401) Add tests to ensemble retriever to ensure it works with combination of multiple retrievers --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2024-02-14 05:22:03 +00:00

			`@pytest.mark.requires("rank_bm25", "sklearn")`
			`def test_ensemble_retriever_get_relevant_docs_with_multiple_retrievers() -> None:`
			`doc_list_a = [`
			`"I like apples",`
			`"I like oranges",`
			`"Apples and oranges are fruits",`
			`]`
			`doc_list_b = [`
			`"I like melons",`
			`"I like pineapples",`
			`"Melons and pineapples are fruits",`
			`]`
			`doc_list_c = [`
			`"I like avocados",`
			`"I like strawberries",`
			`"Avocados and strawberries are fruits",`
			`]`

multiple: langchain 0.2 in master (#21191) 0.2rc migrations - [x] Move memory - [x] Move remaining retrievers - [x] graph_qa chains - [x] some dependency from evaluation code potentially on math utils - [x] Move openapi chain from `langchain.chains.api.openapi` to `langchain_community.chains.openapi` - [x] Migrate `langchain.chains.ernie_functions` to `langchain_community.chains.ernie_functions` - [x] migrate `langchain/chains/llm_requests.py` to `langchain_community.chains.llm_requests` - [x] Moving `langchain_community.cross_enoders.base:BaseCrossEncoder` -> `langchain_community.retrievers.document_compressors.cross_encoder:BaseCrossEncoder` (namespace not ideal, but it needs to be moved to `langchain` to avoid circular deps) - [x] unit tests langchain -- add pytest.mark.community to some unit tests that will stay in langchain - [x] unit tests community -- move unit tests that depend on community to community - [x] mv integration tests that depend on community to community - [x] mypy checks Other todo - [x] Make deprecation warnings not noisy (need to use warn deprecated and check that things are implemented properly) - [x] Update deprecation messages with timeline for code removal (likely we actually won't be removing things until 0.4 release) -- will give people more time to transition their code. - [ ] Add information to deprecation warning to show users how to migrate their code base using langchain-cli - [ ] Remove any unnecessary requirements in langchain (e.g., is SQLALchemy required?) --------- Co-authored-by: Erick Friis <erick@langchain.dev> 2024-05-08 20:46:52 +00:00			`from langchain_community.retrievers import (`
			`BM25Retriever,`
			`KNNRetriever,`
			`TFIDFRetriever,`
			`)`

infra: add test for ensemble retriever to ensure multiple retrievers (#8401) Add tests to ensemble retriever to ensure it works with combination of multiple retrievers --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2024-02-14 05:22:03 +00:00			`dummy_retriever = BM25Retriever.from_texts(doc_list_a)`
			`dummy_retriever.k = 1`
			`tfidf_retriever = TFIDFRetriever.from_texts(texts=doc_list_b)`
			`tfidf_retriever.k = 1`
			`knn_retriever = KNNRetriever.from_texts(`
			`texts=doc_list_c, embeddings=FakeEmbeddings(size=100)`
			`)`
			`knn_retriever.k = 1`

			`ensemble_retriever = EnsembleRetriever(`
			`retrievers=[dummy_retriever, tfidf_retriever, knn_retriever],`
			`weights=[0.6, 0.3, 0.1],`
			`)`
patch: deprecate (a)get_relevant_documents (#20477) - `.get_relevant_documents(query)` -> `.invoke(query)` - `.get_relevant_documents(query=query)` -> `.invoke(query)` - `.get_relevant_documents(query, callbacks=callbacks)` -> `.invoke(query, config={"callbacks": callbacks})` - `.get_relevant_documents(query, kwargs)` -> `.invoke(query, kwargs)` --------- Co-authored-by: Erick Friis <erick@langchain.dev> 2024-04-22 15:14:53 +00:00			`docs = ensemble_retriever.invoke("I like apples")`
infra: add test for ensemble retriever to ensure multiple retrievers (#8401) Add tests to ensemble retriever to ensure it works with combination of multiple retrievers --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2024-02-14 05:22:03 +00:00			`assert len(docs) == 3`