fix extract sublink bug (#10855)

pull/10856/head
Bagatur 11 months ago committed by GitHub
parent 7dec2d399b
commit de0a02f507
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -59,11 +59,12 @@ def extract_sub_links(
for link in all_links:
# Some may be absolute links like https://to/path
if link.startswith("http"):
if not prevent_outside or link.startswith(base_url):
absolute_paths.add(link)
absolute_paths.add(link)
# Some may have omitted the protocol like //to/path
elif link.startswith("//"):
absolute_paths.add(f"{urlparse(base_url).scheme}:{link}")
else:
absolute_paths.add(urljoin(base_url, link))
if prevent_outside:
return [p for p in absolute_paths if p.startswith(base_url)]
return list(absolute_paths)

@@ -86,13 +86,8 @@ def test_extract_sub_links() -> None:
actual = sorted(extract_sub_links(html, "https://foobar.com"))
assert actual == expected
actual = sorted(extract_sub_links(html, "https://foobar.com/hello"))
expected = sorted(
[
"https://foobar.com/hello",
"https://foobar.com/how/are/you/doing",
]
)
actual = extract_sub_links(html, "https://foobar.com/hello")
expected = ["https://foobar.com/hello"]
assert actual == expected
actual = sorted(

Loading…
Cancel
Save