From 90e9ec6962dcb9bb6500007d95a11f6b2aee877c Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Tue, 17 Oct 2023 13:19:27 -0400 Subject: [PATCH] Sitemap specify default filter url (#11925) Specify default filter URL in sitemap loader and add a security note --------- Co-authored-by: Predrag Gruevski <2348618+obi1kenobi@users.noreply.github.com> --- .../langchain/document_loaders/sitemap.py | 79 +++++++++++++++++-- .../document_loaders/test_sitemap.py | 43 +++++++--- 2 files changed, 103 insertions(+), 19 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/sitemap.py b/libs/langchain/langchain/document_loaders/sitemap.py index a5441b2e0f..d0f0584d7b 100644 --- a/libs/langchain/langchain/document_loaders/sitemap.py +++ b/libs/langchain/langchain/document_loaders/sitemap.py @@ -1,6 +1,7 @@ import itertools import re -from typing import Any, Callable, Generator, Iterable, List, Optional +from typing import Any, Callable, Generator, Iterable, List, Optional, Tuple +from urllib.parse import urlparse from langchain.document_loaders.web_base import WebBaseLoader from langchain.schema import Document @@ -20,8 +21,47 @@ def _batch_block(iterable: Iterable, size: int) -> Generator[List[dict], None, N yield item +def _extract_scheme_and_domain(url: str) -> Tuple[str, str]: + """Extract the scheme + domain from a given URL. + + Args: + url (str): The input URL. + + Returns: + return a 2-tuple of scheme and domain + """ + parsed_uri = urlparse(url) + return parsed_uri.scheme, parsed_uri.netloc + + class SitemapLoader(WebBaseLoader): - """Load a sitemap and its URLs.""" + """Load a sitemap and its URLs. + + **Security Note**: This loader can be used to load all URLs specified in a sitemap. + If a malicious actor gets access to the sitemap, they could force + the server to load URLs from other domains by modifying the sitemap. + This could lead to server-side request forgery (SSRF) attacks; e.g., + with the attacker forcing the server to load URLs from internal + service endpoints that are not publicly accessible. While the attacker + may not immediately gain access to this data, this data could leak + into downstream systems (e.g., data loader is used to load data for indexing). + + This loader is a crawler and web crawlers should generally NOT be deployed + with network access to any internal servers. + + Control access to who can submit crawling requests and what network access + the crawler has. + + By default, the loader will only load URLs from the same domain as the sitemap + if the site map is not a local file. This can be disabled by setting + restrict_to_same_domain to False (not recommended). + + If the site map is a local file, no such risk mitigation is applied by default. + + Use the filter URLs argument to limit which URLs can be loaded. + + See https://python.langchain.com/docs/security + """ def __init__( self, @@ -33,14 +73,22 @@ class SitemapLoader(WebBaseLoader): meta_function: Optional[Callable] = None, is_local: bool = False, continue_on_failure: bool = False, + restrict_to_same_domain: bool = True, **kwargs: Any, ): """Initialize with webpage path and optional filter URLs. Args: web_path: url of the sitemap. can also be a local path - filter_urls: list of strings or regexes that will be applied to filter the - urls that are parsed and loaded + filter_urls: a list of regexes. If specified, only + URLS that match one of the filter URLs will be loaded. + *WARNING* The filter URLs are interpreted as regular expressions. + Remember to escape special characters if you do not want them to be + interpreted as regular expression syntax. For example, `.` appears + frequently in URLs and should be escaped if you want to match a literal + `.` rather than any character. + restrict_to_same_domain takes precedence over filter_urls when + restrict_to_same_domain is True and the sitemap is not a local file. parsing_function: Function to parse bs4.Soup output blocksize: number of sitemap locations per block blocknum: the number of the block that should be loaded - zero indexed. @@ -53,6 +101,9 @@ class SitemapLoader(WebBaseLoader): occurs loading a url, emitting a warning instead of raising an exception. Setting this to True makes the loader more robust, but also may result in missing data. Default: False + restrict_to_same_domain: whether to restrict loading to URLs to the same + domain as the sitemap. Attention: This is only applied if the sitemap + is not a local file! """ if blocksize is not None and blocksize < 1: @@ -65,12 +116,17 @@ class SitemapLoader(WebBaseLoader): import lxml # noqa:F401 except ImportError: raise ImportError( - "lxml package not found, please install it with " "`pip install lxml`" + "lxml package not found, please install it with `pip install lxml`" ) super().__init__(web_paths=[web_path], **kwargs) - self.filter_urls = filter_urls + # Define a list of URL patterns (interpreted as regular expressions) that + # will be allowed to be loaded. + # restrict_to_same_domain takes precedence over filter_urls when + # restrict_to_same_domain is True and the sitemap is not a local file. + self.allow_url_patterns = filter_urls + self.restrict_to_same_domain = restrict_to_same_domain self.parsing_function = parsing_function or _default_parsing_function self.meta_function = meta_function or _default_meta_function self.blocksize = blocksize @@ -96,8 +152,15 @@ class SitemapLoader(WebBaseLoader): # Strip leading and trailing whitespace and newlines loc_text = loc.text.strip() - if self.filter_urls and not any( - re.match(r, loc_text) for r in self.filter_urls + if self.restrict_to_same_domain and not self.is_local: + if _extract_scheme_and_domain(loc_text) != _extract_scheme_and_domain( + self.web_path + ): + continue + + if self.allow_url_patterns and not any( + re.match(regexp_pattern, loc_text) + for regexp_pattern in self.allow_url_patterns ): continue diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_sitemap.py b/libs/langchain/tests/integration_tests/document_loaders/test_sitemap.py index c88d53f74c..7ab448dfe7 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_sitemap.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_sitemap.py @@ -4,11 +4,12 @@ from typing import Any import pytest from langchain.document_loaders import SitemapLoader +from langchain.document_loaders.sitemap import _extract_scheme_and_domain def test_sitemap() -> None: """Test sitemap loader.""" - loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml") + loader = SitemapLoader("https://api.python.langchain.com/sitemap.xml") documents = loader.load() assert len(documents) > 1 assert "LangChain Python API" in documents[0].page_content @@ -17,7 +18,7 @@ def test_sitemap() -> None: def test_sitemap_block() -> None: """Test sitemap loader.""" loader = SitemapLoader( - "https://langchain.readthedocs.io/sitemap.xml", blocksize=1, blocknum=1 + "https://api.python.langchain.com/sitemap.xml", blocksize=1, blocknum=1 ) documents = loader.load() assert len(documents) == 1 @@ -27,7 +28,7 @@ def test_sitemap_block() -> None: def test_sitemap_block_only_one() -> None: """Test sitemap loader.""" loader = SitemapLoader( - "https://langchain.readthedocs.io/sitemap.xml", blocksize=1000000, blocknum=0 + "https://api.python.langchain.com/sitemap.xml", blocksize=1000000, blocknum=0 ) documents = loader.load() assert len(documents) > 1 @@ -37,7 +38,7 @@ def test_sitemap_block_only_one() -> None: def test_sitemap_block_blocknum_default() -> None: """Test sitemap loader.""" loader = SitemapLoader( - "https://langchain.readthedocs.io/sitemap.xml", blocksize=1000000 + "https://api.python.langchain.com/sitemap.xml", blocksize=1000000 ) documents = loader.load() assert len(documents) > 1 @@ -47,14 +48,14 @@ def test_sitemap_block_blocknum_default() -> None: def test_sitemap_block_size_to_small() -> None: """Test sitemap loader.""" with pytest.raises(ValueError, match="Sitemap blocksize should be at least 1"): - SitemapLoader("https://langchain.readthedocs.io/sitemap.xml", blocksize=0) + SitemapLoader("https://api.python.langchain.com/sitemap.xml", blocksize=0) def test_sitemap_block_num_to_small() -> None: """Test sitemap loader.""" with pytest.raises(ValueError, match="Sitemap blocknum can not be lower then 0"): SitemapLoader( - "https://langchain.readthedocs.io/sitemap.xml", + "https://api.python.langchain.com/sitemap.xml", blocksize=1000000, blocknum=-1, ) @@ -63,7 +64,7 @@ def test_sitemap_block_num_to_small() -> None: def test_sitemap_block_does_not_exists() -> None: """Test sitemap loader.""" loader = SitemapLoader( - "https://langchain.readthedocs.io/sitemap.xml", blocksize=1000000, blocknum=15 + "https://api.python.langchain.com/sitemap.xml", blocksize=1000000, blocknum=15 ) with pytest.raises( ValueError, @@ -75,7 +76,7 @@ def test_sitemap_block_does_not_exists() -> None: def test_filter_sitemap() -> None: """Test sitemap loader.""" loader = SitemapLoader( - "https://langchain.readthedocs.io/sitemap.xml", + "https://api.python.langchain.com/sitemap.xml", filter_urls=["https://api.python.langchain.com/en/stable/"], ) documents = loader.load() @@ -89,7 +90,7 @@ def test_sitemap_metadata() -> None: """Test sitemap loader.""" loader = SitemapLoader( - "https://langchain.readthedocs.io/sitemap.xml", + "https://api.python.langchain.com/sitemap.xml", meta_function=sitemap_metadata_one, ) documents = loader.load() @@ -107,7 +108,7 @@ def test_sitemap_metadata_extraction() -> None: """Test sitemap loader.""" loader = SitemapLoader( - "https://langchain.readthedocs.io/sitemap.xml", + "https://api.python.langchain.com/sitemap.xml", meta_function=sitemap_metadata_two, ) documents = loader.load() @@ -118,7 +119,7 @@ def test_sitemap_metadata_extraction() -> None: def test_sitemap_metadata_default() -> None: """Test sitemap loader.""" - loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml") + loader = SitemapLoader("https://api.python.langchain.com/sitemap.xml") documents = loader.load() assert len(documents) > 1 assert "source" in documents[0].metadata @@ -132,3 +133,23 @@ def test_local_sitemap() -> None: documents = loader.load() assert len(documents) > 1 assert "🦜️🔗" in documents[0].page_content + + +def test_extract_domain() -> None: + """Test domain extraction.""" + assert _extract_scheme_and_domain("https://js.langchain.com/sitemap.xml") == ( + "https", + "js.langchain.com", + ) + assert _extract_scheme_and_domain("http://example.com/path/to/page") == ( + "http", + "example.com", + ) + assert _extract_scheme_and_domain("ftp://files.example.com") == ( + "ftp", + "files.example.com", + ) + assert _extract_scheme_and_domain("https://deep.subdomain.example.com") == ( + "https", + "deep.subdomain.example.com", + )