From 9281841cfe8ce9acc06df1aeffece5614d687ad0 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 26 Apr 2024 22:00:08 +0800 Subject: [PATCH] community[patch]: fix integrated test case test_recursive_url_loader.py assertions (issue-20919) (#20920) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Description:** Fix integrated test case test_recursive_url_loader.py Local testing successful ```shell (venv) lei@LeideMacBook-Pro community % poetry run pytest tests/integration_tests/document_loaders/test_recursive_url_loader.py ================================================================================ test session starts ================================================================================ platform darwin -- Python 3.11.4, pytest-7.4.4, pluggy-1.4.0 -- /Users/zhanglei/Work/github/langchain/venv/bin/python cachedir: .pytest_cache rootdir: /Users/zhanglei/Work/github/langchain/libs/community configfile: pyproject.toml plugins: syrupy-4.6.1, asyncio-0.20.3, cov-4.1.0, vcr-1.0.2, mock-3.12.0, anyio-3.7.1, dotenv-0.5.2, requests-mock-1.11.0, socket-0.6.0 asyncio: mode=Mode.AUTO collected 6 items tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader PASSED [ 16%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader_deterministic PASSED [ 33%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_recursive_url_loader FAILED [ 50%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_equivalent PASSED [ 66%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_loading_invalid_url PASSED [ 83%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_metadata_necessary_properties PASSED [100%] ===================================================================================== FAILURES ====================================================================================== __________________________________________________________________________ test_sync_recursive_url_loader ___________________________________________________________________________ def test_sync_recursive_url_loader() -> None: url = "https://docs.python.org/3.9/" loader = RecursiveUrlLoader( url, extractor=lambda _: "placeholder", use_async=False, max_depth=2 ) docs = loader.load() > assert len(docs) == 23 E AssertionError: assert 24 == 23 E + where 24 = len([Document(page_content='placeholder', metadata={'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.18 Documentation', 'language': None}), Document(page_content='placeholder', metadata={'source': 'https://docs.python.org/3.9/py-modindex.html', 'content_type': 'text/html', 'title': 'Python Module Index — Python 3.9.18 documentation', 'language': None}), Document(page_content='placeholder', metadata={'source': 'https://docs.python.org/3.9/download.html', 'content_type': 'text/html', 'title': 'Download — Python 3.9.18 documentation', 'language': None}), Document(page_content='placeholder', metadata={'source': 'https://docs.python.org/3.9/howto/index.html', 'content_type': 'text/html', 'title': 'Python HOWTOs — Python 3.9.18 documentation', 'language': None}), Document(page_content='placeholder', metadata={'source': 'https://docs.python.org/3.9/whatsnew/index.html', 'content_type': 'text/html', 'title': 'Whatâ\x80\x99s New in Python — Python 3.9.18 documentation', 'language': None}), Document(page_content='placeholder', metadata={'source': 'https://docs.python.org/3.9/c-api/index.html', 'content_type': 'text/html', 'title': 'Python/C API Reference Manual — Python 3.9.18 documentation', 'language': None}), ...]) tests/integration_tests/document_loaders/test_recursive_url_loader.py:38: AssertionError ================================================================================= warnings summary ================================================================================== tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader_deterministic tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_recursive_url_loader tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_equivalent tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_metadata_necessary_properties /Users/zhanglei/.pyenv/versions/3.11.4/lib/python3.11/html/parser.py:170: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor. k = self.parse_starttag(i) -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html ================================================================================ slowest 5 durations ================================================================================ 56.75s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader_deterministic 38.99s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader 31.20s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_metadata_necessary_properties 30.37s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_equivalent 15.44s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_recursive_url_loader ============================================================================== short test summary info ============================================================================== FAILED tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_recursive_url_loader - AssertionError: assert 24 == 23 ================================================================ 1 failed, 5 passed, 5 warnings in 172.97s (0:02:52) ================================================================ (venv) zhanglei@LeideMacBook-Pro community % poetry run pytest tests/integration_tests/document_loaders/test_recursive_url_loader.py ================================================================================ test session starts ================================================================================ platform darwin -- Python 3.11.4, pytest-7.4.4, pluggy-1.4.0 -- /Users/zhanglei/Work/github/langchain/venv/bin/python cachedir: .pytest_cache rootdir: /Users/zhanglei/Work/github/langchain/libs/community configfile: pyproject.toml plugins: syrupy-4.6.1, asyncio-0.20.3, cov-4.1.0, vcr-1.0.2, mock-3.12.0, anyio-3.7.1, dotenv-0.5.2, requests-mock-1.11.0, socket-0.6.0 asyncio: mode=Mode.AUTO collected 6 items tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader PASSED [ 16%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader_deterministic PASSED [ 33%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_recursive_url_loader PASSED [ 50%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_equivalent PASSED [ 66%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_loading_invalid_url PASSED [ 83%] tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_metadata_necessary_properties PASSED [100%] ================================================================================= warnings summary ================================================================================== tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader_deterministic tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_recursive_url_loader tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_equivalent tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_metadata_necessary_properties /Users/zhanglei/.pyenv/versions/3.11.4/lib/python3.11/html/parser.py:170: XMLParsedAsHTMLWarning: It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor. k = self.parse_starttag(i) -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html ================================================================================ slowest 5 durations ================================================================================ 46.99s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader_deterministic 32.43s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_async_recursive_url_loader 31.23s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_equivalent 30.75s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_async_metadata_necessary_properties 15.89s call tests/integration_tests/document_loaders/test_recursive_url_loader.py::test_sync_recursive_url_loader ===================================================================== 6 passed, 5 warnings in 157.42s (0:02:37) ===================================================================== (venv) lei@LeideMacBook-Pro community % ``` **Issue:** https://github.com/langchain-ai/langchain/issues/20919 **Twitter handle:** @coolbeevip --- .../document_loaders/test_recursive_url_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py b/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py index 1f35993222..92c274f33e 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py +++ b/libs/community/tests/integration_tests/document_loaders/test_recursive_url_loader.py @@ -12,7 +12,7 @@ def test_async_recursive_url_loader() -> None: check_response_status=True, ) docs = loader.load() - assert len(docs) == 513 + assert len(docs) == 512 assert docs[0].page_content == "placeholder"