"""Integration test for Wikipedia Document Loader."""
from typing import List

from langchain_core.documents import Document

from langchain_community.document_loaders import WikipediaLoader


def assert_docs(docs: List[Document], all_meta: bool = False) -> None:
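    """Verify each document has page content and the expected metadata.

    With all_meta=True, documents must carry metadata keys beyond the core
    set ("title", "summary", "source"); otherwise exactly the core set.
    """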
    for doc in docs:
        assert doc.page_content
        assert doc.metadata
        main_meta = {"title", "summary", "source"}
        assert set(doc.metadata).issuperset(main_meta)
        if all_meta:
            assert len(set(doc.metadata)) > len(main_meta)
        else:
            assert len(set(doc.metadata)) == len(main_meta)


def test_load_success() -> None:
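    """A bare query returns 2-25 docs, each with core metadata only."""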
    loader = WikipediaLoader(query="HUNTER X HUNTER")
    docs = loader.load()
    assert len(docs) > 1
    assert len(docs) <= 25
    assert_docs(docs, all_meta=False)


def test_load_success_all_meta() -> None:
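    """load_all_available_meta=True yields extra metadata keys per doc."""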
    load_max_docs = 5
    load_all_available_meta = True
    loader = WikipediaLoader(
        query="HUNTER X HUNTER",
        load_max_docs=load_max_docs,
        load_all_available_meta=load_all_available_meta,
    )
    docs = loader.load()
    assert len(docs) == load_max_docs
    assert_docs(docs, all_meta=load_all_available_meta)


def test_load_success_more() -> None:
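    """load_max_docs caps the number of documents returned."""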
    load_max_docs = 10
    loader = WikipediaLoader(query="HUNTER X HUNTER", load_max_docs=load_max_docs)
    docs = loader.load()
    assert len(docs) == load_max_docs
    assert_docs(docs, all_meta=False)


def test_load_no_result() -> None:
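    """A query with no Wikipedia matches returns an empty result."""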
    loader = WikipediaLoader(
        "NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL_NORESULTCALL"
    )
    docs = loader.load()
    assert not docs