diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
index a3fdbbcfcc..60ee27013e 100644
--- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py
+++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -126,9 +126,6 @@ class RecursiveUrlLoader(BaseLoader):
         if depth >= self.max_depth:
             return
 
-        # Exclude the links that start with any of the excluded directories
-        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
-            return
 
         # Get all links that can be accessed from the current URL
         visited.add(url)
diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py
index d981b1dc7a..09a76876d1 100644
--- a/libs/langchain/langchain/utils/html.py
+++ b/libs/langchain/langchain/utils/html.py
@@ -62,16 +62,19 @@ def extract_sub_links(
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
-        if any(link.startswith(exclude) for exclude in exclude_prefixes):
-            continue
         # Some may be absolute links like https://to/path
-        elif link.startswith("http"):
+        if link.startswith("http"):
             absolute_paths.add(link)
         # Some may have omitted the protocol like //to/path
         elif link.startswith("//"):
             absolute_paths.add(f"{urlparse(url).scheme}:{link}")
         else:
             absolute_paths.add(urljoin(url, link))
-    if prevent_outside:
-        return [p for p in absolute_paths if p.startswith(base_url)]
-    return list(absolute_paths)
+    res = []
+    for path in absolute_paths:
+        if any(path.startswith(exclude) for exclude in exclude_prefixes):
+            continue
+        if prevent_outside and not path.startswith(base_url):
+            continue
+        res.append(path)
+    return res
diff --git a/libs/langchain/tests/unit_tests/utils/test_html.py b/libs/langchain/tests/unit_tests/utils/test_html.py
index b961f966d9..692eae5865 100644
--- a/libs/langchain/tests/unit_tests/utils/test_html.py
+++ b/libs/langchain/tests/unit_tests/utils/test_html.py
@@ -127,3 +127,32 @@ def test_extract_sub_links_base() -> None:
         )
     )
     assert actual == expected
+
+
+def test_extract_sub_links_exclude() -> None:
+    html = (
+        '<a href="https://foobar.com">one</a>'
+        '<a href="http://baz.net">two</a>'
+        '<a href="/hello">three</a>'
+        '<a href="alexis.html">four</a>'
+        '<a href="/how/to/farm"></a><a href="http://baz.org"></a>'
+    )
+
+    expected = sorted(
+        [
+            "http://baz.net",
+            "https://foobar.com",
+            "https://foobar.com/hello",
+            "https://foobar.com/hello/alexis.html",
+        ]
+    )
+    actual = sorted(
+        extract_sub_links(
+            html,
+            "https://foobar.com/hello/bill.html",
+            base_url="https://foobar.com",
+            prevent_outside=False,
+            exclude_prefixes=("https://foobar.com/how", "http://baz.org"),
+        )
+    )
+    assert actual == expected