langchain/libs/community/tests/integration_tests/document_loaders/test_arxiv.py

import shutil
from http.client import HTTPMessage
from pathlib import Path
from typing import List, Union
from unittest.mock import patch
from urllib.error import HTTPError

import pytest
from langchain_core.documents import Document

from langchain_community.document_loaders.arxiv import ArxivLoader

# Example PDF fixture: "examples/hello.pdf" under the parent of this file's
# directory; test_skip_http_error copies it to stand in for a downloaded paper.
EXAMPLE_HELLO_PDF_PATH = Path(__file__).parents[1] / "examples" / "hello.pdf"


def assert_docs(docs: List[Document]) -> None:
    """Check the basic shape of every loaded document.

    Each document must have non-empty ``page_content``, non-empty ``metadata``
    and metadata keys that are exactly ``Published``, ``Title``, ``Authors``
    and ``Summary``. An empty ``docs`` list passes without any checks.

    Args:
        docs: Documents to validate.

    Raises:
        AssertionError: If any document violates one of the checks.
    """
    expected_keys = {"Published", "Title", "Authors", "Summary"}
    for document in docs:
        assert document.page_content
        metadata = document.metadata
        assert metadata
        assert set(metadata) == expected_keys


def test_load_success() -> None:
    """Test that the query "1605.08386" yields exactly one valid document."""
    documents = ArxivLoader(query="1605.08386", load_max_docs=2).load()
    assert len(documents) == 1

    # Dump the single result to ease debugging of failed runs
    only_document = documents[0]
    print(only_document.metadata)  # noqa: T201
    print(only_document.page_content)  # noqa: T201

    assert_docs(documents)


def test_load_returns_no_result() -> None:
    """Test that the query "1605.08386WWW" yields an empty document list."""
    documents = ArxivLoader(query="1605.08386WWW", load_max_docs=2).load()

    assert len(documents) == 0


def test_load_returns_limited_docs() -> None:
    """Test that a "ChatGPT" query returns exactly ``load_max_docs`` documents."""
    doc_limit = 2
    documents = ArxivLoader(query="ChatGPT", load_max_docs=doc_limit).load()

    assert len(documents) == doc_limit
    assert_docs(documents)


def test_load_returns_full_set_of_metadata() -> None:
    """Test that ``load_all_available_meta=True`` adds extra metadata keys.

    A single document is requested; its metadata must contain the four base
    keys plus at least one additional key.
    """
    base_keys = {"Published", "Title", "Authors", "Summary"}
    documents = ArxivLoader(
        query="ChatGPT",
        load_max_docs=1,
        load_all_available_meta=True,
    ).load()

    assert len(documents) == 1
    for document in documents:
        assert document.page_content
        assert document.metadata
        assert set(document.metadata).issuperset(base_keys)
        print(document.metadata)  # noqa: T201
        # More than the four base keys means extra metadata was loaded
        assert len(set(document.metadata)) > 4


def test_skip_http_error() -> None:
    """Test skipping unexpected Http 404 error of a single doc.

    ``arxiv.Result.download_pdf`` is patched so that its first call raises an
    HTTP 404 error and every later call returns the path of a fresh copy of
    the example PDF. With ``continue_on_failure=True`` only one of the two
    requested documents is expected to be loaded.
    """
    import tempfile

    download_calls = 0

    # Keep the throwaway PDF in a temporary directory so nothing is written
    # next to the test sources and cleanup happens even when the test fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_hello_pdf_path = Path(tmp_dir) / "hello.pdf"

        def first_download_fails() -> Union[HTTPError, str]:
            """Raise HTTP 404 on the first call, then return a PDF copy path."""
            nonlocal download_calls
            download_calls += 1
            if download_calls == 1:
                raise HTTPError(
                    url="",
                    code=404,
                    msg="Not Found",
                    hdrs=HTTPMessage(),
                    fp=None,
                )
            # Return temporary example pdf path
            shutil.copy(EXAMPLE_HELLO_PDF_PATH, tmp_hello_pdf_path)
            return str(tmp_hello_pdf_path.absolute())

        with patch("arxiv.Result.download_pdf") as mock_download_pdf:
            # Set up the mock to raise HTTP 404 error on the first download
            mock_download_pdf.side_effect = first_download_fails
            # Load documents
            loader = ArxivLoader(
                query="ChatGPT",
                load_max_docs=2,
                load_all_available_meta=True,
                continue_on_failure=True,
            )
            docs = loader.load()
            # Only 1 of 2 documents should be loaded
            assert len(docs) == 1


@pytest.mark.skip(reason="test could be flaky")
def test_load_issue_9046() -> None:
    """Test for the fixed issue 9046.

    Runs the same title query once with ":" and once with "-" and expects the
    first returned document's title to contain "MetaGPT" both times.
    """
    max_docs = 3
    queries = (
        # A ":" character in the query must not be a problem
        "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
        # A "-" character in the query must not be a problem either
        "MetaGPT - Meta Programming for Multi-Agent Collaborative Framework",
    )

    for query in queries:
        documents = ArxivLoader(query=query, load_max_docs=max_docs).load()

        assert_docs(documents)
        assert "MetaGPT" in documents[0].metadata["Title"]
community[patch]: Skip unexpected 404 HTTP Error in Arxiv download (#21042) ### Description: When attempting to download PDF files from arXiv, an unexpected 404 error frequently occurs. This error halts the operation, regardless of whether there are additional documents to process. As a solution, I suggest implementing a mechanism to ignore and communicate this error and continue processing the next document from the list. Proposed Solution: To address the issue of unexpected 404 errors during PDF downloads from arXiv, I propose implementing the following solution: - Error Handling: Implement error handling mechanisms to catch and handle 404 errors gracefully. - Communication: Inform the user or logging system about the occurrence of the 404 error. - Continued Processing: After encountering a 404 error, continue processing the remaining documents from the list without interruption. This solution ensures that the application can handle unexpected errors without terminating the entire operation. It promotes resilience and robustness in the face of intermittent issues encountered during PDF downloads from arXiv. ### Issue: #20909 ### Dependencies: none --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-04-30 18:29:22 +00:00			`import shutil`
			`from http.client import HTTPMessage`
			`from pathlib import Path`
			`from typing import List, Union`
			`from unittest.mock import patch`
			`from urllib.error import HTTPError`
`Arxiv` document loader (#3627) It makes sense to use `arxiv` as another source of the documents for downloading. - Added the `arxiv` document_loader, based on the `utilities/arxiv.py:ArxivAPIWrapper` - added tests - added an example notebook - sorted `__all__` in `__init__.py` (otherwise it is hard to find a class in the very long list) 2023-04-27 04:04:56 +00:00
ArxivLoader fix for issue 9046 (#9061) Fixed #9046 Added ut-s for this fix. @eyurtsev 2023-08-10 18:59:39 +00:00			`import pytest`
REFACTOR: Refactor langchain_core (#13627) Changes: - remove langchain_core/schema since no clear distinction b/n schema and non-schema modules - make every module that doesn't end in -y plural - where easy have 1-2 classes per file - no more than one level of nesting in directories - only import from top level core modules in langchain 2023-11-21 16:35:29 +00:00			`from langchain_core.documents import Document`
ArxivLoader fix for issue 9046 (#9061). Fixed #9046. Added unit tests for this fix. @eyurtsev 2023-08-10 18:59:39 +00:00
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`from langchain_community.document_loaders.arxiv import ArxivLoader`
`Arxiv` document loader (#3627) It makes sense to use `arxiv` as another source of the documents for downloading. - Added the `arxiv` document_loader, based on the `utilities/arxiv.py:ArxivAPIWrapper` - added tests - added an example notebook - sorted `__all__` in `__init__.py` (otherwise it is hard to find a class in the very long list) 2023-04-27 04:04:56 +00:00
community[patch]: Skip unexpected 404 HTTP Error in Arxiv download (#21042) ### Description: When attempting to download PDF files from arXiv, an unexpected 404 error frequently occurs. This error halts the operation, regardless of whether there are additional documents to process. As a solution, I suggest implementing a mechanism to ignore and communicate this error and continue processing the next document from the list. Proposed Solution: To address the issue of unexpected 404 errors during PDF downloads from arXiv, I propose implementing the following solution: - Error Handling: Implement error handling mechanisms to catch and handle 404 errors gracefully. - Communication: Inform the user or logging system about the occurrence of the 404 error. - Continued Processing: After encountering a 404 error, continue processing the remaining documents from the list without interruption. This solution ensures that the application can handle unexpected errors without terminating the entire operation. It promotes resilience and robustness in the face of intermittent issues encountered during PDF downloads from arXiv. ### Issue: #20909 ### Dependencies: none --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-04-30 18:29:22 +00:00			`EXAMPLE_HELLO_PDF_PATH = Path(__file__).parents[1] / "examples" / "hello.pdf"`

`Arxiv` document loader (#3627) It makes sense to use `arxiv` as another source of the documents for downloading. - Added the `arxiv` document_loader, based on the `utilities/arxiv.py:ArxivAPIWrapper` - added tests - added an example notebook - sorted `__all__` in `__init__.py` (otherwise it is hard to find a class in the very long list) 2023-04-27 04:04:56 +00:00
			`def assert_docs(docs: List[Document]) -> None:`
			`for doc in docs:`
			`assert doc.page_content`
			`assert doc.metadata`
			`assert set(doc.metadata) == {"Published", "Title", "Authors", "Summary"}`


			`def test_load_success() -> None:`
			`"""Test that returns one document"""`
			`loader = ArxivLoader(query="1605.08386", load_max_docs=2)`

			`docs = loader.load()`
			`assert len(docs) == 1`
infra: add print rule to ruff (#16221) Added noqa for existing prints. Can slowly remove / will prevent more being intro'd 2024-02-10 00:13:30 +00:00			`print(docs[0].metadata) # noqa: T201`
			`print(docs[0].page_content) # noqa: T201`
`Arxiv` document loader (#3627) It makes sense to use `arxiv` as another source of the documents for downloading. - Added the `arxiv` document_loader, based on the `utilities/arxiv.py:ArxivAPIWrapper` - added tests - added an example notebook - sorted `__all__` in `__init__.py` (otherwise it is hard to find a class in the very long list) 2023-04-27 04:04:56 +00:00			`assert_docs(docs)`


			`def test_load_returns_no_result() -> None:`
			`"""Test that returns no docs"""`
			`loader = ArxivLoader(query="1605.08386WWW", load_max_docs=2)`
			`docs = loader.load()`

			`assert len(docs) == 0`


			`def test_load_returns_limited_docs() -> None:`
			`"""Test that returns several docs"""`
			`expected_docs = 2`
			`loader = ArxivLoader(query="ChatGPT", load_max_docs=expected_docs)`
			`docs = loader.load()`

			`assert len(docs) == expected_docs`
			`assert_docs(docs)`


			`def test_load_returns_full_set_of_metadata() -> None:`
			`"""Test that returns several docs"""`
			`loader = ArxivLoader(query="ChatGPT", load_max_docs=1, load_all_available_meta=True)`
			`docs = loader.load()`
			`assert len(docs) == 1`
			`for doc in docs:`
			`assert doc.page_content`
			`assert doc.metadata`
			`assert set(doc.metadata).issuperset(`
			`{"Published", "Title", "Authors", "Summary"}`
			`)`
infra: add print rule to ruff (#16221) Added noqa for existing prints. Can slowly remove / will prevent more being intro'd 2024-02-10 00:13:30 +00:00			`print(doc.metadata) # noqa: T201`
`Arxiv` document loader (#3627) It makes sense to use `arxiv` as another source of the documents for downloading. - Added the `arxiv` document_loader, based on the `utilities/arxiv.py:ArxivAPIWrapper` - added tests - added an example notebook - sorted `__all__` in `__init__.py` (otherwise it is hard to find a class in the very long list) 2023-04-27 04:04:56 +00:00			`assert len(set(doc.metadata)) > 4`
ArxivLoader fix for issue 9046 (#9061). Fixed #9046. Added unit tests for this fix. @eyurtsev 2023-08-10 18:59:39 +00:00

community[patch]: Skip unexpected 404 HTTP Error in Arxiv download (#21042) ### Description: When attempting to download PDF files from arXiv, an unexpected 404 error frequently occurs. This error halts the operation, regardless of whether there are additional documents to process. As a solution, I suggest implementing a mechanism to ignore and communicate this error and continue processing the next document from the list. Proposed Solution: To address the issue of unexpected 404 errors during PDF downloads from arXiv, I propose implementing the following solution: - Error Handling: Implement error handling mechanisms to catch and handle 404 errors gracefully. - Communication: Inform the user or logging system about the occurrence of the 404 error. - Continued Processing: After encountering a 404 error, continue processing the remaining documents from the list without interruption. This solution ensures that the application can handle unexpected errors without terminating the entire operation. It promotes resilience and robustness in the face of intermittent issues encountered during PDF downloads from arXiv. ### Issue: #20909 ### Dependencies: none --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-04-30 18:29:22 +00:00			`def test_skip_http_error() -> None:`
			`"""Test skipping unexpected Http 404 error of a single doc"""`
			`tmp_hello_pdf_path = Path(__file__).parent / "hello.pdf"`

			`def first_download_fails() -> Union[HTTPError, str]:`
			`if not hasattr(first_download_fails, "firstCall"):`
			`first_download_fails.__setattr__("firstCall", False)`
			`raise HTTPError(`
			`url="", code=404, msg="Not Found", hdrs=HTTPMessage(), fp=None`
			`)`
			`else:`
			`# Return temporary example pdf path`
			`shutil.copy(EXAMPLE_HELLO_PDF_PATH, tmp_hello_pdf_path)`
			`return str(tmp_hello_pdf_path.absolute())`

			`with patch("arxiv.Result.download_pdf") as mock_download_pdf:`
			`# Set up the mock to raise HTTP 404 error`
			`mock_download_pdf.side_effect = first_download_fails`
			`# Load documents`
			`loader = ArxivLoader(`
			`query="ChatGPT",`
			`load_max_docs=2,`
			`load_all_available_meta=True,`
			`continue_on_failure=True,`
			`)`
			`docs = loader.load()`
			`# Only 1 of 2 documents should be loaded`
			`assert len(docs) == 1`


ArxivLoader fix for issue 9046 (#9061) Fixed #9046 Added ut-s for this fix. @eyurtsev 2023-08-10 18:59:39 +00:00			`@pytest.mark.skip(reason="test could be flaky")`
			`def test_load_issue_9046() -> None:`
			`"""Test for the fixed issue 9046"""`
			`expected_docs = 3`

			`# ":" character could not be an issue`
			`loader = ArxivLoader(`
			`query="MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",`
			`load_max_docs=expected_docs,`
			`)`
			`docs = loader.load()`

			`assert_docs(docs)`
			`assert "MetaGPT" in docs[0].metadata["Title"]`

			`# "-" character could not be an issue`
			`loader = ArxivLoader(`
			`query="MetaGPT - Meta Programming for Multi-Agent Collaborative Framework",`
			`load_max_docs=expected_docs,`
			`)`
			`docs = loader.load()`

			`assert_docs(docs)`
			`assert "MetaGPT" in docs[0].metadata["Title"]`