Recursive url loader w/ test (#8813)

Description: Because of an issue with the test, this is a separate PR that
contains the test for #8502.

Tag maintainer: @rlancemartin

---------

Co-authored-by: Lance Martin <lance@langchain.dev>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Zend 2023-08-11 05:50:31 +08:00 committed by GitHub
parent cb5fb751e9
commit 6221eb5974
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -0,0 +1,30 @@
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
def test_async_recursive_url_loader() -> None:
    """Check the loader's output for the Python 3.9 docs root with use_async=True.

    Builds a RecursiveUrlLoader with max_depth=1 and an extractor that
    ignores its input and returns a fixed string, then verifies the number
    of documents returned and the content of the first one.
    """
    documents = RecursiveUrlLoader(
        url="https://docs.python.org/3.9/",
        extractor=lambda _: "placeholder",
        use_async=True,
        max_depth=1,
    ).load()

    # The extractor replaces every page body, so only the count and the
    # fixed marker string are checked.
    assert len(documents) == 24
    assert documents[0].page_content == "placeholder"
def test_sync_recursive_url_loader() -> None:
    """Check the loader's output for the Python 3.9 docs root with use_async=False.

    Builds a RecursiveUrlLoader with max_depth=1 and an extractor that
    ignores its input and returns a fixed string, then verifies the number
    of documents returned and the content of the first one.
    """
    documents = RecursiveUrlLoader(
        url="https://docs.python.org/3.9/",
        extractor=lambda _: "placeholder",
        use_async=False,
        max_depth=1,
    ).load()

    # The extractor replaces every page body, so only the count and the
    # fixed marker string are checked.
    assert len(documents) == 24
    assert documents[0].page_content == "placeholder"
def test_loading_invalid_url() -> None:
    """Check that loading from the host this.url.is.invalid yields no documents.

    Uses use_async=False with max_depth=1 and a constant-string extractor,
    and expects load() to return an empty result.
    """
    documents = RecursiveUrlLoader(
        url="https://this.url.is.invalid/this/is/a/test",
        max_depth=1,
        extractor=lambda _: "placeholder",
        use_async=False,
    ).load()

    assert len(documents) == 0