core[patch]: Further restrict recursive URL loader (#15559)

Includes code from this PR: https://github.com/langchain-ai/langchain/compare/HEAD...m0kr4n3:security/fix_ssrf with additional fixes Unit tests cover new test cases
2024-11-04 06:00:26 +00:00 · 2024-01-04 16:33:57 -05:00 · 2024-01-04 16:33:57 -05:00 · bf0b3cc0b5
commit bf0b3cc0b5
parent 817b84de9e
2 changed files with 52 additions and 11 deletions
--- a/libs/core/langchain_core/utils/html.py
+++ b/libs/core/langchain_core/utils/html.py
@ -67,23 +67,37 @@ def extract_sub_links(
    Returns:
        List[str]: sub links
    """
-    base_url = base_url if base_url is not None else url
+    base_url_to_use = base_url if base_url is not None else url
+    parsed_base_url = urlparse(base_url_to_use)
    all_links = find_all_links(raw_html, pattern=pattern)
    absolute_paths = set()
    for link in all_links:
+        parsed_link = urlparse(link)
        # Some may be absolute links like https://to/path
-        if link.startswith("http"):
-            absolute_paths.add(link)
+        if parsed_link.scheme == "http" or parsed_link.scheme == "https":
+            absolute_path = link
        # Some may have omitted the protocol like //to/path
        elif link.startswith("//"):
-            absolute_paths.add(f"{urlparse(url).scheme}:{link}")
+            absolute_path = f"{urlparse(url).scheme}:{link}"
        else:
-            absolute_paths.add(urljoin(url, link))
-    res = []
+            absolute_path = urljoin(url, parsed_link.path)
+        absolute_paths.add(absolute_path)
+
+    results = []
    for path in absolute_paths:
-        if any(path.startswith(exclude) for exclude in exclude_prefixes):
+        if any(path.startswith(exclude_prefix) for exclude_prefix in exclude_prefixes):
            continue
-        if prevent_outside and not path.startswith(base_url):
-            continue
-        res.append(path)
-    return res
+
+        if prevent_outside:
+            parsed_path = urlparse(path)
+
+            if parsed_base_url.netloc != parsed_path.netloc:
+                continue
+
+            # Will take care of verifying rest of path after netloc
+            # if it's more specific
+            if not path.startswith(base_url_to_use):
+                continue
+
+        results.append(path)
+    return results
--- a/libs/core/tests/unit_tests/utils/test_html.py
+++ b/libs/core/tests/unit_tests/utils/test_html.py
@ -156,3 +156,30 @@ def test_extract_sub_links_exclude() -> None:
        )
    )
    assert actual == expected
+
+
+def test_prevent_outside() -> None:
+    """Test that prevent outside compares against full base URL."""
+    html = (
+        '<a href="https://foobar.comic.com">BAD</a>'
+        '<a href="https://foobar.comic:9999">BAD</a>'
+        '<a href="https://foobar.com:9999">BAD</a>'
+        '<a href="http://foobar.com:9999/">BAD</a>'
+        '<a href="https://foobar.com/OK">OK</a>'
+        '<a href="http://foobar.com/BAD">BAD</a>'  # Change in scheme is not OK here
+    )
+
+    expected = sorted(
+        [
+            "https://foobar.com/OK",
+        ]
+    )
+    actual = sorted(
+        extract_sub_links(
+            html,
+            "https://foobar.com/hello/bill.html",
+            base_url="https://foobar.com",
+            prevent_outside=True,
+        )
+    )
+    assert actual == expected