langchain/tests/unit_tests/retrievers/test_tfidf.py

import pytest

from langchain.retrievers.tfidf import TFIDFRetriever
from langchain.schema import Document


@pytest.mark.requires("sklearn")
def test_from_texts() -> None:
    input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    tfidf_retriever = TFIDFRetriever.from_texts(texts=input_texts)
    assert len(tfidf_retriever.docs) == 3
    assert tfidf_retriever.tfidf_array.toarray().shape == (3, 5)


@pytest.mark.requires("sklearn")
def test_from_texts_with_tfidf_params() -> None:
    input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]
    tfidf_retriever = TFIDFRetriever.from_texts(
        texts=input_texts, tfidf_params={"min_df": 2}
    )
    # should count only multiple words (have, pan)
    assert tfidf_retriever.tfidf_array.toarray().shape == (3, 2)


@pytest.mark.requires("sklearn")
def test_from_documents() -> None:
    input_docs = [
        Document(page_content="I have a pen."),
        Document(page_content="Do you have a pen?"),
        Document(page_content="I have a bag."),
    ]
    tfidf_retriever = TFIDFRetriever.from_documents(documents=input_docs)
    assert len(tfidf_retriever.docs) == 3
    assert tfidf_retriever.tfidf_array.toarray().shape == (3, 5)
tfidf retriever (#5114) Co-authored-by: vempaliakhil96 <vempaliakhil96@gmail.com> 2023-05-24 17:02:09 +00:00			`import pytest`

Harrison/tfidf parameters (#3481) Co-authored-by: pao <go5kuramubon@gmail.com> Co-authored-by: KyoHattori <kyo.hattori@abejainc.com> 2023-04-25 05:19:58 +00:00			`from langchain.retrievers.tfidf import TFIDFRetriever`
tfidf retriever (#5114) Co-authored-by: vempaliakhil96 <vempaliakhil96@gmail.com> 2023-05-24 17:02:09 +00:00			`from langchain.schema import Document`
Harrison/tfidf parameters (#3481) Co-authored-by: pao <go5kuramubon@gmail.com> Co-authored-by: KyoHattori <kyo.hattori@abejainc.com> 2023-04-25 05:19:58 +00:00

tfidf retriever (#5114) Co-authored-by: vempaliakhil96 <vempaliakhil96@gmail.com> 2023-05-24 17:02:09 +00:00			`@pytest.mark.requires("sklearn")`
Harrison/tfidf parameters (#3481) Co-authored-by: pao <go5kuramubon@gmail.com> Co-authored-by: KyoHattori <kyo.hattori@abejainc.com> 2023-04-25 05:19:58 +00:00			`def test_from_texts() -> None:`
			`input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]`
			`tfidf_retriever = TFIDFRetriever.from_texts(texts=input_texts)`
			`assert len(tfidf_retriever.docs) == 3`
			`assert tfidf_retriever.tfidf_array.toarray().shape == (3, 5)`


tfidf retriever (#5114) Co-authored-by: vempaliakhil96 <vempaliakhil96@gmail.com> 2023-05-24 17:02:09 +00:00			`@pytest.mark.requires("sklearn")`
Harrison/tfidf parameters (#3481) Co-authored-by: pao <go5kuramubon@gmail.com> Co-authored-by: KyoHattori <kyo.hattori@abejainc.com> 2023-04-25 05:19:58 +00:00			`def test_from_texts_with_tfidf_params() -> None:`
			`input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]`
			`tfidf_retriever = TFIDFRetriever.from_texts(`
			`texts=input_texts, tfidf_params={"min_df": 2}`
			`)`
			`# should count only multiple words (have, pan)`
			`assert tfidf_retriever.tfidf_array.toarray().shape == (3, 2)`
tfidf retriever (#5114) Co-authored-by: vempaliakhil96 <vempaliakhil96@gmail.com> 2023-05-24 17:02:09 +00:00

			`@pytest.mark.requires("sklearn")`
			`def test_from_documents() -> None:`
			`input_docs = [`
			`Document(page_content="I have a pen."),`
			`Document(page_content="Do you have a pen?"),`
			`Document(page_content="I have a bag."),`
			`]`
			`tfidf_retriever = TFIDFRetriever.from_documents(documents=input_docs)`
			`assert len(tfidf_retriever.docs) == 3`
			`assert tfidf_retriever.tfidf_array.toarray().shape == (3, 5)`