diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py index 8839b4a943..ebdd7b86ba 100644 --- a/libs/langchain/langchain/utils/html.py +++ b/libs/langchain/langchain/utils/html.py @@ -59,11 +59,12 @@ def extract_sub_links( for link in all_links: # Some may be absolute links like https://to/path if link.startswith("http"): - if not prevent_outside or link.startswith(base_url): - absolute_paths.add(link) + absolute_paths.add(link) # Some may have omitted the protocol like //to/path elif link.startswith("//"): absolute_paths.add(f"{urlparse(base_url).scheme}:{link}") else: absolute_paths.add(urljoin(base_url, link)) + if prevent_outside: + return [p for p in absolute_paths if p.startswith(base_url)] return list(absolute_paths) diff --git a/libs/langchain/tests/unit_tests/utils/test_html.py b/libs/langchain/tests/unit_tests/utils/test_html.py index a5c42b6a34..eaaa3544e8 100644 --- a/libs/langchain/tests/unit_tests/utils/test_html.py +++ b/libs/langchain/tests/unit_tests/utils/test_html.py @@ -86,13 +86,8 @@ def test_extract_sub_links() -> None: actual = sorted(extract_sub_links(html, "https://foobar.com")) assert actual == expected - actual = sorted(extract_sub_links(html, "https://foobar.com/hello")) - expected = sorted( - [ - "https://foobar.com/hello", - "https://foobar.com/how/are/you/doing", - ] - ) + actual = extract_sub_links(html, "https://foobar.com/hello") + expected = ["https://foobar.com/hello"] assert actual == expected actual = sorted(