# langchain/libs/community/tests/unit_tests/retrievers/test_bm25.py
#
# 46 lines
# 1.5 KiB
# Python

import pytest
from langchain_core.documents import Document
from langchain_community.retrievers.bm25 import BM25Retriever
@pytest.mark.requires("rank_bm25")
def test_from_texts() -> None:
    """Build a retriever from raw strings and check what it stored.

    Three short sentences are passed to ``BM25Retriever.from_texts``; the
    retriever is expected to hold three docs, and the vectorizer's
    ``doc_len`` is expected to be ``[4, 5, 4]`` (which matches the
    whitespace-separated word count of each sentence).
    """
    corpus = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    retriever = BM25Retriever.from_texts(texts=corpus)

    # One stored doc per input string.
    assert len(retriever.docs) == 3

    expected_lengths = [4, 5, 4]
    assert retriever.vectorizer.doc_len == expected_lengths
@pytest.mark.requires("rank_bm25")
def test_from_texts_with_bm25_params() -> None:
    """Check that ``bm25_params`` given to ``from_texts`` reach the vectorizer.

    An ``epsilon`` override of 10 is supplied through ``bm25_params`` and
    the resulting vectorizer is expected to expose that same value.
    """
    corpus = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    overrides = {"epsilon": 10}
    retriever = BM25Retriever.from_texts(texts=corpus, bm25_params=overrides)

    # The override must be visible on the vectorizer the retriever built.
    assert retriever.vectorizer.epsilon == 10
@pytest.mark.requires("rank_bm25")
def test_from_documents() -> None:
    """Build a retriever from ``Document`` objects and check what it stored.

    Mirrors the raw-text construction test, but wraps each sentence in a
    ``Document`` and goes through ``BM25Retriever.from_documents``.
    """
    sentences = ("I have a pen.", "Do you have a pen?", "I have a bag.")
    source_docs = [Document(page_content=sentence) for sentence in sentences]
    retriever = BM25Retriever.from_documents(documents=source_docs)

    # One stored doc per input document.
    assert len(retriever.docs) == 3

    expected_lengths = [4, 5, 4]
    assert retriever.vectorizer.doc_len == expected_lengths
@pytest.mark.requires("rank_bm25")
def test_repr() -> None:
    """Check that ``repr`` of the retriever does not include document text.

    The retriever is built from three documents, and the text of the first
    one ("I have a pen") must not appear in the retriever's ``repr``.
    """
    sentences = ("I have a pen.", "Do you have a pen?", "I have a bag.")
    source_docs = [Document(page_content=sentence) for sentence in sentences]
    retriever = BM25Retriever.from_documents(documents=source_docs)

    rendered = repr(retriever)
    assert "I have a pen" not in rendered