From 9ecb7240a480720ec9d739b3877a52f76098a2b8 Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev
Date: Tue, 17 Oct 2023 13:41:43 -0400
Subject: [PATCH] Add security note to recursive url loader (#11934)

Add security note to recursive loader
---
 .../document_loaders/recursive_url_loader.py | 34 +++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
index 60ee27013e..dc6a66df81 100644
--- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py
+++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -49,7 +49,36 @@ def _metadata_extractor(raw_html: str, url: str) -> dict:
 
 
 class RecursiveUrlLoader(BaseLoader):
-    """Load all child links from a URL page."""
+    """Load all child links from a URL page.
+
+    **Security Note**: This loader is a crawler that will start crawling
+    at a given URL and then expand to crawl child links recursively.
+
+    Web crawlers should generally NOT be deployed with network access
+    to any internal servers.
+
+    Control access to who can submit crawling requests and what network access
+    the crawler has.
+
+    While crawling, the crawler may encounter malicious URLs that would lead to a
+    server-side request forgery (SSRF) attack.
+
+    To mitigate risks, the crawler by default will only load URLs from the same
+    domain as the start URL (controlled via prevent_outside named argument).
+
+    This will mitigate the risk of SSRF attacks, but will not eliminate it.
+
+    For example, if crawling a host which hosts several sites:
+
+    https://some_host/alice_site/
+    https://some_host/bob_site/
+
+    A malicious URL on Alice's site could cause the crawler to make a malicious
+    GET request to an endpoint on Bob's site. Both sites are hosted on the
+    same host, so such a request would not be prevented by default.
+
+    See https://python.langchain.com/docs/security
+    """
 
     def __init__(
         self,
@@ -60,12 +89,13 @@ class RecursiveUrlLoader(BaseLoader):
         metadata_extractor: Optional[Callable[[str, str], str]] = None,
         exclude_dirs: Optional[Sequence[str]] = (),
         timeout: Optional[int] = 10,
-        prevent_outside: Optional[bool] = True,
+        prevent_outside: bool = True,
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
+
         Args:
             url: The URL to crawl.
             max_depth: The max depth of the recursive loading.
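
A minimal usage sketch of the behavior the new docstring describes, using only parameters that appear in the constructor signature above. It assumes `RecursiveUrlLoader` is imported from `langchain.document_loaders` as in this patch; the start URL and excluded path are hypothetical placeholders.

```python
from langchain.document_loaders import RecursiveUrlLoader

# Start crawling at one URL; prevent_outside=True (the default) keeps the
# crawler on the start URL's domain, which is the SSRF mitigation the
# security note describes.
loader = RecursiveUrlLoader(
    url="https://docs.example.com/",  # hypothetical start URL
    max_depth=2,  # follow child links at most two levels deep
    prevent_outside=True,  # do not crawl links that leave the start domain
    exclude_dirs=["https://docs.example.com/private/"],  # hypothetical URL prefix to skip
    timeout=10,  # per-request timeout in seconds
)

docs = loader.load()
print(f"Loaded {len(docs)} pages")
```

Note that, as the docstring warns, same-domain filtering does not protect sites that share a host under different path prefixes, so network access for the process running the crawler should still be restricted.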