mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
extract sublinks exclude by abs path (#11079)
This commit is contained in:
parent
7ee8b2d1bf
commit
d85339b9f2
@ -126,9 +126,6 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
|
|
||||||
if depth >= self.max_depth:
|
if depth >= self.max_depth:
|
||||||
return
|
return
|
||||||
# Exclude the links that start with any of the excluded directories
|
|
||||||
if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
|
|
||||||
return
|
|
||||||
|
|
||||||
# Get all links that can be accessed from the current URL
|
# Get all links that can be accessed from the current URL
|
||||||
visited.add(url)
|
visited.add(url)
|
||||||
|
@ -62,16 +62,19 @@ def extract_sub_links(
|
|||||||
all_links = find_all_links(raw_html, pattern=pattern)
|
all_links = find_all_links(raw_html, pattern=pattern)
|
||||||
absolute_paths = set()
|
absolute_paths = set()
|
||||||
for link in all_links:
|
for link in all_links:
|
||||||
if any(link.startswith(exclude) for exclude in exclude_prefixes):
|
|
||||||
continue
|
|
||||||
# Some may be absolute links like https://to/path
|
# Some may be absolute links like https://to/path
|
||||||
elif link.startswith("http"):
|
if link.startswith("http"):
|
||||||
absolute_paths.add(link)
|
absolute_paths.add(link)
|
||||||
# Some may have omitted the protocol like //to/path
|
# Some may have omitted the protocol like //to/path
|
||||||
elif link.startswith("//"):
|
elif link.startswith("//"):
|
||||||
absolute_paths.add(f"{urlparse(url).scheme}:{link}")
|
absolute_paths.add(f"{urlparse(url).scheme}:{link}")
|
||||||
else:
|
else:
|
||||||
absolute_paths.add(urljoin(url, link))
|
absolute_paths.add(urljoin(url, link))
|
||||||
if prevent_outside:
|
res = []
|
||||||
return [p for p in absolute_paths if p.startswith(base_url)]
|
for path in absolute_paths:
|
||||||
return list(absolute_paths)
|
if any(path.startswith(exclude) for exclude in exclude_prefixes):
|
||||||
|
continue
|
||||||
|
if prevent_outside and not path.startswith(base_url):
|
||||||
|
continue
|
||||||
|
res.append(path)
|
||||||
|
return res
|
||||||
|
@ -127,3 +127,32 @@ def test_extract_sub_links_base() -> None:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
assert actual == expected
|
assert actual == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_sub_links_exclude() -> None:
    """Check that links under an excluded prefix are left out of the result.

    The page links to an absolute URL, a URL on another host, a
    protocol-relative URL, a root-relative path and a sibling file.  The
    expected list omits only the root-relative ``/how/are/you/doing`` link,
    which falls under the ``https://foobar.com/how`` prefix.  The
    ``http://baz.org`` prefix does not match ``http://baz.net``, so that link
    is expected to remain.
    """
    page_url = "https://foobar.com/hello/bill.html"
    anchors = (
        '<a href="https://foobar.com">one</a>',
        '<a href="http://baz.net">two</a>',
        '<a href="//foobar.com/hello">three</a>',
        '<a href="/how/are/you/doing">four</a>',
        '<a href="alexis.html"</a>',
    )
    html = "".join(anchors)

    found = extract_sub_links(
        html,
        page_url,
        base_url="https://foobar.com",
        prevent_outside=False,
        exclude_prefixes=("https://foobar.com/how", "http://baz.org"),
    )
    actual = sorted(found)

    expected = sorted(
        [
            "http://baz.net",
            "https://foobar.com",
            "https://foobar.com/hello",
            "https://foobar.com/hello/alexis.html",
        ]
    )
    assert actual == expected
|
||||||
|
Loading…
Reference in New Issue
Block a user