recursive loader add status check (#10891)

12 months ago · c1f9cc0bc5
parent 6e02c45ca4
commit c1f9cc0bc5
2 changed files with 15 additions and 3 deletions
--- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py
+++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@ -63,6 +63,7 @@ class RecursiveUrlLoader(BaseLoader):
        prevent_outside: Optional[bool] = True,
        link_regex: Union[str, re.Pattern, None] = None,
        headers: Optional[dict] = None,
        check_response_status: bool = False,
    ) -> None:
        """Initialize with URL to crawl and any subdirectories to exclude.
        Args:
@ -84,6 +85,8 @@ class RecursiveUrlLoader(BaseLoader):
            prevent_outside: If True, prevent loading from urls which are not children
                of the root url.
            link_regex: Regex for extracting sub-links from the raw html of a web page.
            check_response_status: If True, check HTTP response status and skip
                URLs with error responses (400-599).
        """
        self.url = url
@ -101,6 +104,7 @@ class RecursiveUrlLoader(BaseLoader):
        self.link_regex = link_regex
        self._lock = asyncio.Lock() if self.use_async else None
        self.headers = headers
        self.check_response_status = check_response_status
    def _get_child_links_recursive(
        self, url: str, visited: Set[str], *, depth: int = 0
@ -123,8 +127,13 @@ class RecursiveUrlLoader(BaseLoader):
        visited.add(url)
        try:
            response = requests.get(url, timeout=self.timeout, headers=self.headers)
-        except Exception:
+            if self.check_response_status and 400 <= response.status_code <= 599:
-            logger.warning(f"Unable to load from {url}")
+                raise ValueError(f"Received HTTP status {response.status_code}")
        except Exception as e:
            logger.warning(
                f"Unable to load from {url}. Received error {e} of type "
                f"{e.__class__.__name__}"
            )
            return
        content = self.extractor(response.text)
        if content:
@ -193,6 +202,8 @@ class RecursiveUrlLoader(BaseLoader):
        try:
            async with session.get(url) as response:
                text = await response.text()
                if self.check_response_status and 400 <= response.status <= 599:
                    raise ValueError(f"Received HTTP status {response.status}")
        except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
            logger.warning(
                f"Unable to load {url}. Received error {e} of type "
--- a/libs/langchain/tests/integration_tests/document_loaders/test_recursive_url_loader.py
+++ b/libs/langchain/tests/integration_tests/document_loaders/test_recursive_url_loader.py
@ -12,9 +12,10 @@ def test_async_recursive_url_loader() -> None:
        use_async=True,
        max_depth=3,
        timeout=None,
        check_response_status=True,
    )
    docs = loader.load()
-    assert len(docs) == 890
+    assert len(docs) == 513
    assert docs[0].page_content == "placeholder"