From d37ce48e60468f64402bc8c0f8b468fd3677a79c Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Thu, 21 Sep 2023 08:47:41 -0700 Subject: [PATCH] sep base url and loaded url in sub link extraction (#10895) --- .../document_loaders/recursive_url_loader.py | 6 +++-- libs/langchain/langchain/utils/html.py | 15 ++++++----- .../tests/unit_tests/utils/test_html.py | 25 +++++++++++++++++++ 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py index 2b93996a73..4781609ac0 100644 --- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py +++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py @@ -145,7 +145,8 @@ class RecursiveUrlLoader(BaseLoader): # Store the visited links and recursively visit the children sub_links = extract_sub_links( response.text, - self.url, + url, + base_url=self.url, pattern=self.link_regex, prevent_outside=self.prevent_outside, ) @@ -224,7 +225,8 @@ class RecursiveUrlLoader(BaseLoader): if depth < self.max_depth - 1: sub_links = extract_sub_links( text, - self.url, + url, + base_url=self.url, pattern=self.link_regex, prevent_outside=self.prevent_outside, ) diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py index ebdd7b86ba..d1f76cdabd 100644 --- a/libs/langchain/langchain/utils/html.py +++ b/libs/langchain/langchain/utils/html.py @@ -1,5 +1,5 @@ import re -from typing import List, Union +from typing import List, Optional, Union from urllib.parse import urljoin, urlparse PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#") @@ -37,16 +37,18 @@ def find_all_links( def extract_sub_links( raw_html: str, - base_url: str, + url: str, *, + base_url: Optional[str] = None, pattern: Union[str, re.Pattern, None] = None, prevent_outside: bool = True, ) -> List[str]: """Extract all links from a raw html string and convert into absolute paths. Args: - raw_html: original html - base_url: the base url of the html + raw_html: original html. + url: the url of the html. + base_url: the base url to check for outside links against. pattern: Regex to use for extracting links from raw html. prevent_outside: If True, ignore external links which are not children of the base url. @@ -54,6 +56,7 @@ def extract_sub_links( Returns: List[str]: sub links """ + base_url = base_url if base_url is not None else url all_links = find_all_links(raw_html, pattern=pattern) absolute_paths = set() for link in all_links: @@ -62,9 +65,9 @@ def extract_sub_links( absolute_paths.add(link) # Some may have omitted the protocol like //to/path elif link.startswith("//"): - absolute_paths.add(f"{urlparse(base_url).scheme}:{link}") + absolute_paths.add(f"{urlparse(url).scheme}:{link}") else: - absolute_paths.add(urljoin(base_url, link)) + absolute_paths.add(urljoin(url, link)) if prevent_outside: return [p for p in absolute_paths if p.startswith(base_url)] return list(absolute_paths) diff --git a/libs/langchain/tests/unit_tests/utils/test_html.py b/libs/langchain/tests/unit_tests/utils/test_html.py index eaaa3544e8..b961f966d9 100644 --- a/libs/langchain/tests/unit_tests/utils/test_html.py +++ b/libs/langchain/tests/unit_tests/utils/test_html.py @@ -102,3 +102,28 @@ def test_extract_sub_links() -> None: ] ) assert actual == expected + + +def test_extract_sub_links_base() -> None: + html = ( + 'one' + 'two' + 'three' + 'four' + '' + ) + + expected = sorted( + [ + "https://foobar.com", + "https://foobar.com/hello", + "https://foobar.com/how/are/you/doing", + "https://foobar.com/hello/alexis.html", + ] + ) + actual = sorted( + extract_sub_links( + html, "https://foobar.com/hello/bill.html", base_url="https://foobar.com" + ) + ) + assert actual == expected