From 9ecb7240a480720ec9d739b3877a52f76098a2b8 Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev
Date: Tue, 17 Oct 2023 13:41:43 -0400
Subject: [PATCH] Add security note to recursive url loader (#11934)

Add security note to recursive loader
---
 .../document_loaders/recursive_url_loader.py | 34 +++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
index 60ee27013e..dc6a66df81 100644
--- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py
+++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -49,7 +49,36 @@ def _metadata_extractor(raw_html: str, url: str) -> dict:
 
 
 class RecursiveUrlLoader(BaseLoader):
-    """Load all child links from a URL page."""
+    """Load all child links from a URL page.
+
+    **Security Note**: This loader is a crawler that will start crawling
+    at a given URL and then expand to crawl child links recursively.
+
+    Web crawlers should generally NOT be deployed with network access
+    to any internal servers.
+
+    Control access to who can submit crawling requests and what network access
+    the crawler has.
+
+    While crawling, the crawler may encounter malicious URLs that would lead to a
+    server-side request forgery (SSRF) attack.
+
+    To mitigate risks, the crawler by default will only load URLs from the same
+    domain as the start URL (controlled via prevent_outside named argument).
+
+    This will mitigate the risk of SSRF attacks, but will not eliminate it.
+
+    For example, if crawling a host which hosts several sites:
+
+    https://some_host/alice_site/
+    https://some_host/bob_site/
+
+    A malicious URL on Alice's site could cause the crawler to make a malicious
+    GET request to an endpoint on Bob's site. Both sites are hosted on the
+    same host, so such a request would not be prevented by default.
+
+    See https://python.langchain.com/docs/security
+    """
 
     def __init__(
         self,
@@ -60,12 +89,13 @@ class RecursiveUrlLoader(BaseLoader):
         metadata_extractor: Optional[Callable[[str, str], str]] = None,
         exclude_dirs: Optional[Sequence[str]] = (),
         timeout: Optional[int] = 10,
-        prevent_outside: Optional[bool] = True,
+        prevent_outside: bool = True,
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
+
         Args:
             url: The URL to crawl.
             max_depth: The max depth of the recursive loading.
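
A minimal usage sketch of the behavior the new docstring describes, using only parameters that appear in the constructor signature above. It assumes `RecursiveUrlLoader` is imported from `langchain.document_loaders` as in this patch; the start URL and excluded path are hypothetical placeholders.

```python
from langchain.document_loaders import RecursiveUrlLoader

# Start crawling at one URL; prevent_outside=True (the default) keeps the
# crawler on the start URL's domain, which is the SSRF mitigation the
# security note describes.
loader = RecursiveUrlLoader(
    url="https://docs.example.com/",  # hypothetical start URL
    max_depth=2,  # follow child links at most two levels deep
    prevent_outside=True,  # do not crawl links that leave the start domain
    exclude_dirs=["https://docs.example.com/private/"],  # hypothetical URL prefix to skip
    timeout=10,  # per-request timeout in seconds
)

docs = loader.load()
print(f"Loaded {len(docs)} pages")
```

Note that, as the docstring warns, same-domain filtering does not protect sites that share a host under different path prefixes, so network access for the process running the crawler should still be restricted.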