Add security note to recursive url loader (#11934)

Add security note to recursive loader
Eugene Yurtsev 2023-10-17 13:41:43 -04:00 committed by GitHub
parent 42dcc502c7
commit 9ecb7240a4


@@ -49,7 +49,36 @@ def _metadata_extractor(raw_html: str, url: str) -> dict:

 class RecursiveUrlLoader(BaseLoader):
-    """Load all child links from a URL page."""
+    """Load all child links from a URL page.
+
+    **Security Note**: This loader is a crawler that will start crawling
+        at a given URL and then expand to crawl child links recursively.
+
+        Web crawlers should generally NOT be deployed with network access
+        to any internal servers.
+
+        Control access to who can submit crawling requests and what network
+        access the crawler has.
+
+        While crawling, the crawler may encounter malicious URLs that would
+        lead to a server-side request forgery (SSRF) attack.
+
+        To mitigate risks, the crawler by default will only load URLs from
+        the same domain as the start URL (controlled via the prevent_outside
+        named argument).
+
+        This mitigates the risk of SSRF attacks, but does not eliminate it.
+        For example, if crawling a host that hosts several sites:
+
+        https://some_host/alice_site/
+        https://some_host/bob_site/
+
+        A malicious URL on Alice's site could cause the crawler to make a
+        malicious GET request to an endpoint on Bob's site. Both sites are
+        hosted on the same host, so such a request would not be prevented
+        by default.
+
+        See https://python.langchain.com/docs/security
+    """

     def __init__(
         self,
@@ -60,12 +89,13 @@ class RecursiveUrlLoader(BaseLoader):
         metadata_extractor: Optional[Callable[[str, str], str]] = None,
         exclude_dirs: Optional[Sequence[str]] = (),
         timeout: Optional[int] = 10,
-        prevent_outside: Optional[bool] = True,
+        prevent_outside: bool = True,
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.

         Args:
             url: The URL to crawl.
             max_depth: The max depth of the recursive loading.
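
For reference, a minimal usage sketch showing the same-domain restriction the
security note describes. This is illustrative only: the import path assumes
the langchain.document_loaders module as of this commit, and the start URL and
depth are placeholders, not part of the change.

    from langchain.document_loaders import RecursiveUrlLoader

    # prevent_outside=True (the default) restricts crawling to URLs on the
    # same domain as the start URL. As the note above explains, this
    # mitigates SSRF risk but does not eliminate it on shared hosts.
    loader = RecursiveUrlLoader(
        url="https://docs.python.org/3.9/",  # illustrative start URL
        max_depth=2,
        prevent_outside=True,
        timeout=10,
    )
    docs = loader.load()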