mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
skip excluded sublinks before recursion (#11036)
This commit is contained in:
parent
9c5eca92e4
commit
a2f7246f0e
@ -99,6 +99,13 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
else _metadata_extractor
|
else _metadata_extractor
|
||||||
)
|
)
|
||||||
self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
|
self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
|
||||||
|
|
||||||
|
if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
|
||||||
|
raise ValueError(
|
||||||
|
f"Base url is included in exclude_dirs. Received base_url: {url} and "
|
||||||
|
f"exclude_dirs: {self.exclude_dirs}"
|
||||||
|
)
|
||||||
|
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
self.prevent_outside = prevent_outside if prevent_outside is not None else True
|
self.prevent_outside = prevent_outside if prevent_outside is not None else True
|
||||||
self.link_regex = link_regex
|
self.link_regex = link_regex
|
||||||
@ -149,6 +156,7 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
base_url=self.url,
|
base_url=self.url,
|
||||||
pattern=self.link_regex,
|
pattern=self.link_regex,
|
||||||
prevent_outside=self.prevent_outside,
|
prevent_outside=self.prevent_outside,
|
||||||
|
exclude_prefixes=self.exclude_dirs,
|
||||||
)
|
)
|
||||||
for link in sub_links:
|
for link in sub_links:
|
||||||
# Check all unvisited links
|
# Check all unvisited links
|
||||||
@ -182,10 +190,6 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
if depth >= self.max_depth:
|
if depth >= self.max_depth:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Exclude the root and parent from a list
|
|
||||||
# Exclude the links that start with any of the excluded directories
|
|
||||||
if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
|
|
||||||
return []
|
|
||||||
# Disable SSL verification because websites may have invalid SSL certificates,
|
# Disable SSL verification because websites may have invalid SSL certificates,
|
||||||
# but won't cause any security issues for us.
|
# but won't cause any security issues for us.
|
||||||
close_session = session is None
|
close_session = session is None
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import re
|
import re
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Sequence, Union
|
||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
|
PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
|
||||||
@ -42,6 +42,7 @@ def extract_sub_links(
|
|||||||
base_url: Optional[str] = None,
|
base_url: Optional[str] = None,
|
||||||
pattern: Union[str, re.Pattern, None] = None,
|
pattern: Union[str, re.Pattern, None] = None,
|
||||||
prevent_outside: bool = True,
|
prevent_outside: bool = True,
|
||||||
|
exclude_prefixes: Sequence[str] = (),
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""Extract all links from a raw html string and convert into absolute paths.
|
"""Extract all links from a raw html string and convert into absolute paths.
|
||||||
|
|
||||||
@ -52,6 +53,7 @@ def extract_sub_links(
|
|||||||
pattern: Regex to use for extracting links from raw html.
|
pattern: Regex to use for extracting links from raw html.
|
||||||
prevent_outside: If True, ignore external links which are not children
|
prevent_outside: If True, ignore external links which are not children
|
||||||
of the base url.
|
of the base url.
|
||||||
|
exclude_prefixes: Exclude any URLs that start with one of these prefixes.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[str]: sub links
|
List[str]: sub links
|
||||||
@ -60,8 +62,10 @@ def extract_sub_links(
|
|||||||
all_links = find_all_links(raw_html, pattern=pattern)
|
all_links = find_all_links(raw_html, pattern=pattern)
|
||||||
absolute_paths = set()
|
absolute_paths = set()
|
||||||
for link in all_links:
|
for link in all_links:
|
||||||
|
if any(link.startswith(exclude) for exclude in exclude_prefixes):
|
||||||
|
continue
|
||||||
# Some may be absolute links like https://to/path
|
# Some may be absolute links like https://to/path
|
||||||
if link.startswith("http"):
|
elif link.startswith("http"):
|
||||||
absolute_paths.add(link)
|
absolute_paths.add(link)
|
||||||
# Some may have omitted the protocol like //to/path
|
# Some may have omitted the protocol like //to/path
|
||||||
elif link.startswith("//"):
|
elif link.startswith("//"):
|
||||||
|
Loading…
Reference in New Issue
Block a user