Separate base url and loaded url in sub link extraction (#10895)

Bagatur authored on 2023-09-21 08:47:41 -07:00, committed by GitHub
commit d37ce48e60 (parent 24cb5cd379)
3 changed files with 38 additions and 8 deletions

File 1 of 3: the RecursiveUrlLoader document loader

@@ -145,7 +145,8 @@ class RecursiveUrlLoader(BaseLoader):
         # Store the visited links and recursively visit the children
         sub_links = extract_sub_links(
             response.text,
-            self.url,
+            url,
+            base_url=self.url,
             pattern=self.link_regex,
             prevent_outside=self.prevent_outside,
         )
@@ -224,7 +225,8 @@ class RecursiveUrlLoader(BaseLoader):
         if depth < self.max_depth - 1:
             sub_links = extract_sub_links(
                 text,
-                self.url,
+                url,
+                base_url=self.url,
                 pattern=self.link_regex,
                 prevent_outside=self.prevent_outside,
             )
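
For context, a rough usage sketch of the loader after this change (the docs URL and depth below are illustrative, not from the commit): relative links discovered on a sub-page are now resolved against the page that was actually fetched, while self.url remains the boundary that prevent_outside checks against.

from langchain.document_loaders import RecursiveUrlLoader

# Illustrative crawl of a docs site, two levels deep.
# Before this change, a relative href such as "intro.html" found on
# https://example.com/docs/guide/index.html was joined against the root
# self.url ("https://example.com/docs/") rather than the page it appeared on,
# yielding https://example.com/docs/intro.html instead of
# https://example.com/docs/guide/intro.html.
loader = RecursiveUrlLoader(url="https://example.com/docs/", max_depth=2)
docs = loader.load()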

File 2 of 3: the HTML link-extraction utilities (extract_sub_links)

@ -1,5 +1,5 @@
import re import re
from typing import List, Union from typing import List, Optional, Union
from urllib.parse import urljoin, urlparse from urllib.parse import urljoin, urlparse
PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#") PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
@@ -37,16 +37,18 @@ def find_all_links(
 
 def extract_sub_links(
     raw_html: str,
-    base_url: str,
+    url: str,
     *,
+    base_url: Optional[str] = None,
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.
 
     Args:
-        raw_html: original html
-        base_url: the base url of the html
+        raw_html: original html.
+        url: the url of the html.
+        base_url: the base url to check for outside links against.
         pattern: Regex to use for extracting links from raw html.
         prevent_outside: If True, ignore external links which are not children
             of the base url.
@@ -54,6 +56,7 @@ def extract_sub_links(
     Returns:
         List[str]: sub links
     """
+    base_url = base_url if base_url is not None else url
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
@@ -62,9 +65,9 @@ def extract_sub_links(
             absolute_paths.add(link)
         # Some may have omitted the protocol like //to/path
         elif link.startswith("//"):
-            absolute_paths.add(f"{urlparse(base_url).scheme}:{link}")
+            absolute_paths.add(f"{urlparse(url).scheme}:{link}")
         else:
-            absolute_paths.add(urljoin(base_url, link))
+            absolute_paths.add(urljoin(url, link))
     if prevent_outside:
         return [p for p in absolute_paths if p.startswith(base_url)]
     return list(absolute_paths)
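
A minimal sketch of the new call signature (assuming the helper is importable as langchain.utils.html.extract_sub_links; the URLs and HTML are made up): relative hrefs are joined against the url of the page that was loaded, while base_url is only used for the prevent_outside filter and, when omitted, falls back to url.

from langchain.utils.html import extract_sub_links

# A page fetched from a sub-path of the site being crawled.
page_url = "https://example.com/docs/guide/index.html"
raw_html = '<a href="intro.html">intro</a><a href="/docs/other">other</a>'

links = extract_sub_links(raw_html, page_url, base_url="https://example.com/docs/")
# "intro.html" resolves against the page itself ->
#   https://example.com/docs/guide/intro.html
# "/docs/other" resolves to https://example.com/docs/other
# Both start with base_url, so prevent_outside (True by default) keeps them.
print(sorted(links))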

File 3 of 3: unit tests for extract_sub_links

@@ -102,3 +102,28 @@ def test_extract_sub_links() -> None:
         ]
     )
     assert actual == expected
+
+
+def test_extract_sub_links_base() -> None:
+    html = (
+        '<a href="https://foobar.com">one</a>'
+        '<a href="http://baz.net">two</a>'
+        '<a href="//foobar.com/hello">three</a>'
+        '<a href="/how/are/you/doing">four</a>'
+        '<a href="alexis.html"</a>'
+    )
+
+    expected = sorted(
+        [
+            "https://foobar.com",
+            "https://foobar.com/hello",
+            "https://foobar.com/how/are/you/doing",
+            "https://foobar.com/hello/alexis.html",
+        ]
+    )
+    actual = sorted(
+        extract_sub_links(
+            html, "https://foobar.com/hello/bill.html", base_url="https://foobar.com"
+        )
+    )
+    assert actual == expected