langchain/libs/community/tests/unit_tests/retrievers/test_bm25.py

import pytest
from langchain_core.documents import Document

from langchain_community.retrievers.bm25 import BM25Retriever


@pytest.mark.requires("rank_bm25")
def test_from_texts() -> None:
    """Build a retriever from raw strings and check what it stored.

    Asserts that one document is kept per input text and that the
    vectorizer's ``doc_len`` equals ``[4, 5, 4]``, which matches the
    whitespace-separated word count of each input string.
    """
    corpus = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    retriever = BM25Retriever.from_texts(texts=corpus)

    stored_docs = retriever.docs
    assert len(stored_docs) == 3

    doc_lengths = retriever.vectorizer.doc_len
    assert doc_lengths == [4, 5, 4]


@pytest.mark.requires("rank_bm25")
def test_from_texts_with_bm25_params() -> None:
    """Check that ``bm25_params`` given to ``from_texts`` reach the vectorizer.

    A custom ``epsilon`` is supplied through ``bm25_params`` and the test
    asserts that the retriever's vectorizer exposes that same value.
    """
    input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    bm25_retriever = BM25Retriever.from_texts(
        texts=input_texts, bm25_params={"epsilon": 10}
    )
    # The custom epsilon must be visible on the vectorizer built by from_texts.
    assert bm25_retriever.vectorizer.epsilon == 10


@pytest.mark.requires("rank_bm25")
def test_from_documents() -> None:
    """Build a retriever from ``Document`` objects and check what it stored.

    Asserts that one document is kept per input ``Document`` and that the
    vectorizer's ``doc_len`` equals ``[4, 5, 4]``, which matches the
    whitespace-separated word count of each page content.
    """
    page_contents = ("I have a pen.", "Do you have a pen?", "I have a bag.")
    documents = [Document(page_content=text) for text in page_contents]

    retriever = BM25Retriever.from_documents(documents=documents)

    assert len(retriever.docs) == 3
    assert retriever.vectorizer.doc_len == [4, 5, 4]


@pytest.mark.requires("rank_bm25")
def test_repr() -> None:
    """Check that the retriever's ``repr`` does not contain document text.

    A retriever is built from three documents and the test asserts that the
    substring ``"I have a pen"`` is absent from ``repr(retriever)``.
    """
    page_contents = ("I have a pen.", "Do you have a pen?", "I have a bag.")
    documents = [Document(page_content=text) for text in page_contents]
    retriever = BM25Retriever.from_documents(documents=documents)

    rendered = repr(retriever)
    assert "I have a pen" not in rendered
add bm25 module (#7779) - Description: Add a BM25 Retriever that do not need Elastic search - Dependencies: rank_bm25(if it is not installed it will be install by using pip, just like TFIDFRetriever do) - Tag maintainer: @rlancemartin, @eyurtsev - Twitter handle: DayuanJian21687 --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-07-17 14:30:17 +00:00			`import pytest`
REFACTOR: Refactor langchain_core (#13627) Changes: - remove langchain_core/schema since no clear distinction b/n schema and non-schema modules - make every module that doesn't end in -y plural - where easy have 1-2 classes per file - no more than one level of nesting in directories - only import from top level core modules in langchain 2023-11-21 16:35:29 +00:00			`from langchain_core.documents import Document`
add bm25 module (#7779) - Description: Add a BM25 Retriever that do not need Elastic search - Dependencies: rank_bm25(if it is not installed it will be install by using pip, just like TFIDFRetriever do) - Tag maintainer: @rlancemartin, @eyurtsev - Twitter handle: DayuanJian21687 --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-07-17 14:30:17 +00:00
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`from langchain_community.retrievers.bm25 import BM25Retriever`
add bm25 module (#7779) - Description: Add a BM25 Retriever that do not need Elastic search - Dependencies: rank_bm25(if it is not installed it will be install by using pip, just like TFIDFRetriever do) - Tag maintainer: @rlancemartin, @eyurtsev - Twitter handle: DayuanJian21687 --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-07-17 14:30:17 +00:00

			`@pytest.mark.requires("rank_bm25")`
			`def test_from_texts() -> None:`
			`input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]`
			`bm25_retriever = BM25Retriever.from_texts(texts=input_texts)`
			`assert len(bm25_retriever.docs) == 3`
			`assert bm25_retriever.vectorizer.doc_len == [4, 5, 4]`


			`@pytest.mark.requires("rank_bm25")`
			`def test_from_texts_with_bm25_params() -> None:`
			`input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]`
			`bm25_retriever = BM25Retriever.from_texts(`
			`texts=input_texts, bm25_params={"epsilon": 10}`
			`)`
			`# should count only multiple words (have, pan)`
			`assert bm25_retriever.vectorizer.epsilon == 10`


			`@pytest.mark.requires("rank_bm25")`
			`def test_from_documents() -> None:`
			`input_docs = [`
			`Document(page_content="I have a pen."),`
			`Document(page_content="Do you have a pen?"),`
			`Document(page_content="I have a bag."),`
			`]`
			`bm25_retriever = BM25Retriever.from_documents(documents=input_docs)`
			`assert len(bm25_retriever.docs) == 3`
			`assert bm25_retriever.vectorizer.doc_len == [4, 5, 4]`
Community[Patch] Remove docs form bm25 repr (#16110) Resolves: https://github.com/langchain-ai/langsmith-sdk/issues/356 2024-01-17 08:00:55 +00:00

			`@pytest.mark.requires("rank_bm25")`
			`def test_repr() -> None:`
			`input_docs = [`
			`Document(page_content="I have a pen."),`
			`Document(page_content="Do you have a pen?"),`
			`Document(page_content="I have a bag."),`
			`]`
			`bm25_retriever = BM25Retriever.from_documents(documents=input_docs)`
			`assert "I have a pen" not in repr(bm25_retriever)`