mirror of https://github.com/hwchase17/langchain (synced 2024-11-06 03:20:49 +00:00)
Add security note to recursive url loader (#11934)
Add security note to recursive loader
parent 42dcc502c7
commit 9ecb7240a4
@@ -49,7 +49,36 @@ def _metadata_extractor(raw_html: str, url: str) -> dict:
 
 
 class RecursiveUrlLoader(BaseLoader):
-    """Load all child links from a URL page."""
+    """Load all child links from a URL page.
+
+    **Security Note**: This loader is a crawler that will start crawling
+        at a given URL and then expand to crawl child links recursively.
+
+        Web crawlers should generally NOT be deployed with network access
+        to any internal servers.
+
+        Control access to who can submit crawling requests and what network access
+        the crawler has.
+
+        While crawling, the crawler may encounter malicious URLs that would lead to a
+        server-side request forgery (SSRF) attack.
+
+        To mitigate risks, the crawler by default will only load URLs from the same
+        domain as the start URL (controlled via prevent_outside named argument).
+
+        This will mitigate the risk of SSRF attacks, but will not eliminate it.
+
+        For example, if crawling a host which hosts several sites:
+
+        https://some_host/alice_site/
+        https://some_host/bob_site/
+
+        A malicious URL on Alice's site could cause the crawler to make a malicious
+        GET request to an endpoint on Bob's site. Both sites are hosted on the
+        same host, so such a request would not be prevented by default.
+
+        See https://python.langchain.com/docs/security
+    """
 
     def __init__(
         self,
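The same-domain default described in the note above can be illustrated with a short sketch. This is a hypothetical check written for this page, not the loader's actual implementation: it compares only the network location of each link against the start URL, which is why two sites served from one host are not isolated from each other.

from urllib.parse import urlparse

def same_domain(child_url: str, start_url: str) -> bool:
    # Hypothetical filter in the spirit of prevent_outside=True:
    # only the network location (host) is compared, so every path
    # on the same host passes the check.
    return urlparse(child_url).netloc == urlparse(start_url).netloc

start = "https://some_host/alice_site/"
print(same_domain("https://some_host/bob_site/private", start))  # True  - same host, not blocked
print(same_domain("https://internal.example/admin", start))      # False - different host, blocked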
@@ -60,12 +89,13 @@ class RecursiveUrlLoader(BaseLoader):
         metadata_extractor: Optional[Callable[[str, str], str]] = None,
         exclude_dirs: Optional[Sequence[str]] = (),
         timeout: Optional[int] = 10,
-        prevent_outside: Optional[bool] = True,
+        prevent_outside: bool = True,
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
 
         Args:
             url: The URL to crawl.
             max_depth: The max depth of the recursive loading.
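For reference, here is a minimal usage sketch built from the parameters visible in this diff; the start URL, depth, and excluded subtree are illustrative values and are not part of the commit.

from langchain.document_loaders import RecursiveUrlLoader

loader = RecursiveUrlLoader(
    url="https://docs.python.org/3.9/",  # start URL (illustrative)
    max_depth=2,                         # how deep to follow child links
    prevent_outside=True,                # the default: stay on the start URL's domain
    timeout=10,                          # per-request timeout, in seconds
    exclude_dirs=("https://docs.python.org/3.9/whatsnew/",),  # subtrees to skip (illustrative)
)
docs = loader.load()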