Add security note to recursive url loader (#11934)

Add security note to recursive loader
Eugene Yurtsev 2023-10-17 13:41:43 -04:00 committed by GitHub
parent 42dcc502c7
commit 9ecb7240a4


@@ -49,7 +49,36 @@ def _metadata_extractor(raw_html: str, url: str) -> dict:

 class RecursiveUrlLoader(BaseLoader):
-    """Load all child links from a URL page."""
+    """Load all child links from a URL page.
+
+    **Security Note**: This loader is a crawler that will start crawling
+        at a given URL and then expand to crawl child links recursively.
+
+        Web crawlers should generally NOT be deployed with network access
+        to any internal servers.
+
+        Control access to who can submit crawling requests and what network
+        access the crawler has.
+
+        While crawling, the crawler may encounter malicious URLs that would
+        lead to a server-side request forgery (SSRF) attack.
+
+        To mitigate risks, the crawler by default will only load URLs from
+        the same domain as the start URL (controlled via the prevent_outside
+        named argument).
+
+        This mitigates the risk of SSRF attacks, but does not eliminate it.
+        For example, if crawling a host that hosts several sites:
+
+        https://some_host/alice_site/
+        https://some_host/bob_site/
+
+        A malicious URL on Alice's site could cause the crawler to make a
+        malicious GET request to an endpoint on Bob's site. Both sites are
+        hosted on the same host, so such a request would not be prevented
+        by default.
+
+        See https://python.langchain.com/docs/security
+    """

     def __init__(
         self,
@@ -60,12 +89,13 @@ class RecursiveUrlLoader(BaseLoader):
         metadata_extractor: Optional[Callable[[str, str], str]] = None,
         exclude_dirs: Optional[Sequence[str]] = (),
         timeout: Optional[int] = 10,
-        prevent_outside: Optional[bool] = True,
+        prevent_outside: bool = True,
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.

         Args:
             url: The URL to crawl.
             max_depth: The max depth of the recursive loading.
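
For reference, a minimal usage sketch showing the same-domain restriction the
security note describes. This is illustrative only: the import path assumes
the langchain.document_loaders module as of this commit, and the start URL and
depth are placeholders, not part of the change.

    from langchain.document_loaders import RecursiveUrlLoader

    # prevent_outside=True (the default) restricts crawling to URLs on the
    # same domain as the start URL. As the note above explains, this
    # mitigates SSRF risk but does not eliminate it on shared hosts.
    loader = RecursiveUrlLoader(
        url="https://docs.python.org/3.9/",  # illustrative start URL
        max_depth=2,
        prevent_outside=True,
        timeout=10,
    )
    docs = loader.load()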