mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
Separate base URL and loaded URL in sub-link extraction (#10895)
This commit is contained in:
parent
24cb5cd379
commit
d37ce48e60
@ -145,7 +145,8 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
# Store the visited links and recursively visit the children
|
# Store the visited links and recursively visit the children
|
||||||
sub_links = extract_sub_links(
|
sub_links = extract_sub_links(
|
||||||
response.text,
|
response.text,
|
||||||
self.url,
|
url,
|
||||||
|
base_url=self.url,
|
||||||
pattern=self.link_regex,
|
pattern=self.link_regex,
|
||||||
prevent_outside=self.prevent_outside,
|
prevent_outside=self.prevent_outside,
|
||||||
)
|
)
|
||||||
@ -224,7 +225,8 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
if depth < self.max_depth - 1:
|
if depth < self.max_depth - 1:
|
||||||
sub_links = extract_sub_links(
|
sub_links = extract_sub_links(
|
||||||
text,
|
text,
|
||||||
self.url,
|
url,
|
||||||
|
base_url=self.url,
|
||||||
pattern=self.link_regex,
|
pattern=self.link_regex,
|
||||||
prevent_outside=self.prevent_outside,
|
prevent_outside=self.prevent_outside,
|
||||||
)
|
)
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import re
|
import re
|
||||||
from typing import List, Union
|
from typing import List, Optional, Union
|
||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
|
PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
|
||||||
@ -37,16 +37,18 @@ def find_all_links(
|
|||||||
|
|
||||||
def extract_sub_links(
|
def extract_sub_links(
|
||||||
raw_html: str,
|
raw_html: str,
|
||||||
base_url: str,
|
url: str,
|
||||||
*,
|
*,
|
||||||
|
base_url: Optional[str] = None,
|
||||||
pattern: Union[str, re.Pattern, None] = None,
|
pattern: Union[str, re.Pattern, None] = None,
|
||||||
prevent_outside: bool = True,
|
prevent_outside: bool = True,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""Extract all links from a raw html string and convert into absolute paths.
|
"""Extract all links from a raw html string and convert into absolute paths.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
raw_html: original html
|
raw_html: original html.
|
||||||
base_url: the base url of the html
|
url: the url of the html.
|
||||||
|
base_url: the base url to check for outside links against.
|
||||||
pattern: Regex to use for extracting links from raw html.
|
pattern: Regex to use for extracting links from raw html.
|
||||||
prevent_outside: If True, ignore external links which are not children
|
prevent_outside: If True, ignore external links which are not children
|
||||||
of the base url.
|
of the base url.
|
||||||
@ -54,6 +56,7 @@ def extract_sub_links(
|
|||||||
Returns:
|
Returns:
|
||||||
List[str]: sub links
|
List[str]: sub links
|
||||||
"""
|
"""
|
||||||
|
base_url = base_url if base_url is not None else url
|
||||||
all_links = find_all_links(raw_html, pattern=pattern)
|
all_links = find_all_links(raw_html, pattern=pattern)
|
||||||
absolute_paths = set()
|
absolute_paths = set()
|
||||||
for link in all_links:
|
for link in all_links:
|
||||||
@ -62,9 +65,9 @@ def extract_sub_links(
|
|||||||
absolute_paths.add(link)
|
absolute_paths.add(link)
|
||||||
# Some may have omitted the protocol like //to/path
|
# Some may have omitted the protocol like //to/path
|
||||||
elif link.startswith("//"):
|
elif link.startswith("//"):
|
||||||
absolute_paths.add(f"{urlparse(base_url).scheme}:{link}")
|
absolute_paths.add(f"{urlparse(url).scheme}:{link}")
|
||||||
else:
|
else:
|
||||||
absolute_paths.add(urljoin(base_url, link))
|
absolute_paths.add(urljoin(url, link))
|
||||||
if prevent_outside:
|
if prevent_outside:
|
||||||
return [p for p in absolute_paths if p.startswith(base_url)]
|
return [p for p in absolute_paths if p.startswith(base_url)]
|
||||||
return list(absolute_paths)
|
return list(absolute_paths)
|
||||||
|
@ -102,3 +102,28 @@ def test_extract_sub_links() -> None:
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
assert actual == expected
|
assert actual == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_sub_links_base() -> None:
|
||||||
|
html = (
|
||||||
|
'<a href="https://foobar.com">one</a>'
|
||||||
|
'<a href="http://baz.net">two</a>'
|
||||||
|
'<a href="//foobar.com/hello">three</a>'
|
||||||
|
'<a href="/how/are/you/doing">four</a>'
|
||||||
|
'<a href="alexis.html"</a>'
|
||||||
|
)
|
||||||
|
|
||||||
|
expected = sorted(
|
||||||
|
[
|
||||||
|
"https://foobar.com",
|
||||||
|
"https://foobar.com/hello",
|
||||||
|
"https://foobar.com/how/are/you/doing",
|
||||||
|
"https://foobar.com/hello/alexis.html",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
actual = sorted(
|
||||||
|
extract_sub_links(
|
||||||
|
html, "https://foobar.com/hello/bill.html", base_url="https://foobar.com"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assert actual == expected
|
||||||
|
Loading…
Reference in New Issue
Block a user