extract sublinks exclude by abs path (#11079)

Bagatur 2023-09-26 12:26:27 -07:00 committed by GitHub
parent 7ee8b2d1bf
commit d85339b9f2
3 changed files with 38 additions and 9 deletions


@@ -126,9 +126,6 @@ class RecursiveUrlLoader(BaseLoader):
         if depth >= self.max_depth:
             return
-        # Exclude the links that start with any of the excluded directories
-        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
-            return
         # Get all links that can be accessed from the current URL
         visited.add(url)
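
With exclusion now handled inside extract_sub_links on resolved absolute paths (next hunk), the loader no longer needs its own prefix check: excluded links are filtered out before they ever reach visited or a recursive call. The call site is not part of this diff; below is a minimal sketch of how the loader would forward its exclude list, assuming extract_sub_links is importable from langchain.utils.html and that exclude_dirs maps onto the new exclude_prefixes parameter.

from langchain.utils.html import extract_sub_links  # assumed import path

# Hypothetical call site (the loader itself is not shown in this diff):
# exclude_dirs is forwarded as exclude_prefixes, so excluded URLs are dropped
# before they can be visited or recursed into.
raw_html = '<a href="/api/private">skip</a> <a href="/docs/intro">keep</a>'
sub_links = extract_sub_links(
    raw_html,
    "https://example.com/docs/index.html",          # URL the HTML came from
    base_url="https://example.com",
    exclude_prefixes=("https://example.com/api",),  # was exclude_dirs in the loader
)
# Expected: ["https://example.com/docs/intro"]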


@@ -62,16 +62,19 @@ def extract_sub_links(
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
-        if any(link.startswith(exclude) for exclude in exclude_prefixes):
-            continue
         # Some may be absolute links like https://to/path
-        elif link.startswith("http"):
+        if link.startswith("http"):
             absolute_paths.add(link)
         # Some may have omitted the protocol like //to/path
         elif link.startswith("//"):
             absolute_paths.add(f"{urlparse(url).scheme}:{link}")
         else:
             absolute_paths.add(urljoin(url, link))
-    if prevent_outside:
-        return [p for p in absolute_paths if p.startswith(base_url)]
-    return list(absolute_paths)
+    res = []
+    for path in absolute_paths:
+        if any(path.startswith(exclude) for exclude in exclude_prefixes):
+            continue
+        if prevent_outside and not path.startswith(base_url):
+            continue
+        res.append(path)
+    return res
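
The substantive change: links are first resolved to absolute paths (handling absolute, protocol-relative, and relative hrefs), and only then are exclude_prefixes and prevent_outside applied, in a single pass. Previously the exclude check ran against the raw href string, so a relative link that resolved into an excluded directory slipped through. A standalone sketch of the new ordering, with find_all_links stubbed by a simple href regex for illustration:

import re
from typing import List, Sequence
from urllib.parse import urljoin, urlparse

def extract_sub_links_sketch(
    raw_html: str,
    url: str,
    base_url: str,
    prevent_outside: bool = True,
    exclude_prefixes: Sequence[str] = (),
) -> List[str]:
    # Stand-in for find_all_links: naive href extraction, for illustration only.
    all_links = re.findall(r'href="([^"]+)"', raw_html)
    # Step 1: resolve every link to an absolute path.
    absolute_paths = set()
    for link in all_links:
        if link.startswith("http"):
            absolute_paths.add(link)
        elif link.startswith("//"):
            absolute_paths.add(f"{urlparse(url).scheme}:{link}")
        else:
            absolute_paths.add(urljoin(url, link))
    # Step 2: filter the resolved paths by exclude_prefixes and prevent_outside.
    res = []
    for path in absolute_paths:
        if any(path.startswith(exclude) for exclude in exclude_prefixes):
            continue
        if prevent_outside and not path.startswith(base_url):
            continue
        res.append(path)
    return res

# A relative href that resolves under an excluded prefix is now dropped,
# which the old check on the raw link string would have missed:
print(extract_sub_links_sketch(
    '<a href="/how/are/you">x</a> <a href="/docs">y</a>',
    "https://foobar.com/index.html",
    base_url="https://foobar.com",
    exclude_prefixes=("https://foobar.com/how",),
))  # -> ['https://foobar.com/docs']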


@@ -127,3 +127,32 @@ def test_extract_sub_links_base() -> None:
         )
     )
     assert actual == expected
+
+
+def test_extract_sub_links_exclude() -> None:
+    html = (
+        '<a href="https://foobar.com">one</a>'
+        '<a href="http://baz.net">two</a>'
+        '<a href="//foobar.com/hello">three</a>'
+        '<a href="/how/are/you/doing">four</a>'
+        '<a href="alexis.html"</a>'
+    )
+    expected = sorted(
+        [
+            "http://baz.net",
+            "https://foobar.com",
+            "https://foobar.com/hello",
+            "https://foobar.com/hello/alexis.html",
+        ]
+    )
+    actual = sorted(
+        extract_sub_links(
+            html,
+            "https://foobar.com/hello/bill.html",
+            base_url="https://foobar.com",
+            prevent_outside=False,
+            exclude_prefixes=("https://foobar.com/how", "http://baz.org"),
+        )
+    )
+    assert actual == expected
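
The expected list follows directly from the resolution rules: "/how/are/you/doing" resolves to https://foobar.com/how/are/you/doing and is dropped by the https://foobar.com/how prefix; http://baz.net survives because only http://baz.org is excluded and prevent_outside is False; "alexis.html" resolves against the page URL to https://foobar.com/hello/alexis.html. The resolution step can be checked with the standard library alone:

from urllib.parse import urljoin

page = "https://foobar.com/hello/bill.html"
print(urljoin(page, "/how/are/you/doing"))  # https://foobar.com/how/are/you/doing -> excluded by prefix
print(urljoin(page, "alexis.html"))         # https://foobar.com/hello/alexis.html -> kept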