extract sublinks exclude by abs path (#11079)

2024-11-04 06:00:26 +00:00 · 2023-09-26 12:26:27 -07:00 · 2023-09-26 12:26:27 -07:00 · d85339b9f2
commit d85339b9f2
parent 7ee8b2d1bf
3 changed files with 38 additions and 9 deletions
--- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py
+++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@ -126,9 +126,6 @@ class RecursiveUrlLoader(BaseLoader):
        if depth >= self.max_depth:
            return
        # Exclude the links that start with any of the excluded directories
        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
            return
        # Get all links that can be accessed from the current URL
        visited.add(url)
--- a/libs/langchain/langchain/utils/html.py
+++ b/libs/langchain/langchain/utils/html.py
@ -62,16 +62,19 @@ def extract_sub_links(
    all_links = find_all_links(raw_html, pattern=pattern)
    absolute_paths = set()
    for link in all_links:
        if any(link.startswith(exclude) for exclude in exclude_prefixes):
            continue
        # Some may be absolute links like https://to/path
-        elif link.startswith("http"):
+        if link.startswith("http"):
            absolute_paths.add(link)
        # Some may have omitted the protocol like //to/path
        elif link.startswith("//"):
            absolute_paths.add(f"{urlparse(url).scheme}:{link}")
        else:
            absolute_paths.add(urljoin(url, link))
-    if prevent_outside:
+    res = []
-        return [p for p in absolute_paths if p.startswith(base_url)]
+    for path in absolute_paths:
-    return list(absolute_paths)
+        if any(path.startswith(exclude) for exclude in exclude_prefixes):
            continue
        if prevent_outside and not path.startswith(base_url):
            continue
        res.append(path)
    return res
--- a/libs/langchain/tests/unit_tests/utils/test_html.py
+++ b/libs/langchain/tests/unit_tests/utils/test_html.py
@ -127,3 +127,32 @@ def test_extract_sub_links_base() -> None:
        )
    )
    assert actual == expected
 def test_extract_sub_links_exclude() -> None:
    """Check that ``exclude_prefixes`` is matched against absolute URLs.

    The page links to ``/how/are/you/doing`` with a relative href; once resolved
    against the page URL it falls under the excluded prefix
    ``https://foobar.com/how`` and must not be returned. ``http://baz.net`` must
    still be returned: it does not start with the excluded ``http://baz.org``,
    and ``prevent_outside`` is disabled for this call.
    """
    anchors = [
        '<a href="https://foobar.com">one</a>',
        '<a href="http://baz.net">two</a>',
        '<a href="//foobar.com/hello">three</a>',
        '<a href="/how/are/you/doing">four</a>',
        # NOTE(review): this tag has no ">" after the href value; kept verbatim
        # because the markup is test input -- confirm whether it is intentional.
        '<a href="alexis.html"</a>',
    ]
    page_url = "https://foobar.com/hello/bill.html"
    excluded = ("https://foobar.com/how", "http://baz.org")

    # Every link on the page except the one under the excluded "/how" prefix.
    wanted = [
        "http://baz.net",
        "https://foobar.com",
        "https://foobar.com/hello",
        "https://foobar.com/hello/alexis.html",
    ]
    wanted.sort()

    found = extract_sub_links(
        "".join(anchors),
        page_url,
        base_url="https://foobar.com",
        prevent_outside=False,
        exclude_prefixes=excluded,
    )

    # Compare order-insensitively by sorting both sides.
    assert sorted(found) == wanted