skip excluded sublinks before recursion (#11036)

pull/11066/head
Bagatur committed 11 months ago (via GitHub)
parent 9c5eca92e4
commit a2f7246f0e

@@ -99,6 +99,13 @@ class RecursiveUrlLoader(BaseLoader):
            else _metadata_extractor
        )
        self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
            raise ValueError(
                f"Base url is included in exclude_dirs. Received base_url: {url} and "
                f"exclude_dirs: {self.exclude_dirs}"
            )
        self.timeout = timeout
        self.prevent_outside = prevent_outside if prevent_outside is not None else True
        self.link_regex = link_regex
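
A minimal sketch of how the new constructor guard surfaces to callers; the import path and argument names are taken from this loader's interface and may differ between langchain versions:

# Sketch only: demonstrates the ValueError added above.
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

try:
    RecursiveUrlLoader(
        url="https://docs.example.com/api/",
        exclude_dirs=["https://docs.example.com/api/"],  # exclude_dirs covers the base url itself
    )
except ValueError as exc:
    print(exc)
    # -> Base url is included in exclude_dirs. Received base_url: https://docs.example.com/api/ and
    #    exclude_dirs: ['https://docs.example.com/api/']
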
@@ -149,6 +156,7 @@ class RecursiveUrlLoader(BaseLoader):
            base_url=self.url,
            pattern=self.link_regex,
            prevent_outside=self.prevent_outside,
            exclude_prefixes=self.exclude_dirs,
        )
        for link in sub_links:
            # Check all unvisited links
@@ -182,10 +190,6 @@ class RecursiveUrlLoader(BaseLoader):
        if depth >= self.max_depth:
            return []
        # Exclude the root and parent from a list
        # Exclude the links that start with any of the excluded directories
        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
            return []
        # Disable SSL verification because websites may have invalid SSL certificates,
        # but won't cause any security issues for us.
        close_session = session is None
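
Taken together, the loader hunks move the exclusion ahead of the recursion: the excluded prefixes are handed to extract_sub_links, so filtered links never enter sub_links, and the per-URL check at the top of the recursive call (removed above) becomes redundant. A rough, self-contained sketch of that control flow, where crawl(), fetch_html(), and extract_links() are hypothetical stand-ins rather than the loader's real code:

import re

MAX_DEPTH = 2
EXCLUDE_PREFIXES = ("https://example.com/api",)

def fetch_html(url: str) -> str:
    # Stand-in for the HTTP request the loader performs.
    return '<a href="https://example.com/docs/intro"></a><a href="https://example.com/api/ref"></a>'

def extract_links(html: str, exclude_prefixes=()) -> list:
    # Stand-in for extract_sub_links: excluded prefixes are dropped here,
    # before the caller ever sees the links.
    links = re.findall(r'href="([^"]+)"', html)
    return [link for link in links if not any(link.startswith(p) for p in exclude_prefixes)]

def crawl(url: str, depth: int = 0, visited=None) -> set:
    visited = set() if visited is None else visited
    if depth >= MAX_DEPTH:
        return visited
    for link in extract_links(fetch_html(url), exclude_prefixes=EXCLUDE_PREFIXES):
        # No exclude check is needed here anymore: excluded links were filtered
        # before recursion, so they are never visited.
        if link not in visited:
            visited.add(link)
            crawl(link, depth + 1, visited)
    return visited

print(crawl("https://example.com/docs/"))  # -> {'https://example.com/docs/intro'}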

@@ -1,5 +1,5 @@
import re
from typing import List, Optional, Union
from typing import List, Optional, Sequence, Union
from urllib.parse import urljoin, urlparse

PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
@@ -42,6 +42,7 @@ def extract_sub_links(
    base_url: Optional[str] = None,
    pattern: Union[str, re.Pattern, None] = None,
    prevent_outside: bool = True,
    exclude_prefixes: Sequence[str] = (),
) -> List[str]:
    """Extract all links from a raw html string and convert into absolute paths.
@@ -52,6 +53,7 @@
        pattern: Regex to use for extracting links from raw html.
        prevent_outside: If True, ignore external links which are not children
            of the base url.
        exclude_prefixes: Exclude any URLs that start with one of these prefixes.

    Returns:
        List[str]: sub links
@@ -60,8 +62,10 @@
    all_links = find_all_links(raw_html, pattern=pattern)
    absolute_paths = set()
    for link in all_links:
        if any(link.startswith(exclude) for exclude in exclude_prefixes):
            continue
        # Some may be absolute links like https://to/path
        if link.startswith("http"):
        elif link.startswith("http"):
            absolute_paths.add(link)
        # Some may have omitted the protocol like //to/path
        elif link.startswith("//"):
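
The ordering in the loop above is the crux of the fix: the exclude test runs first and short-circuits with continue, so an excluded link is dropped before the absolute/protocol-relative/relative handling. A standalone sketch of that filtering order, where resolve_links() and the sample data are illustrative rather than langchain's actual code:

from urllib.parse import urljoin, urlparse

def resolve_links(links, url, exclude_prefixes=()):
    absolute_paths = set()
    for link in links:
        # 1. Skip excluded prefixes before any other handling.
        if any(link.startswith(exclude) for exclude in exclude_prefixes):
            continue
        # 2. Keep absolute links as-is.
        elif link.startswith("http"):
            absolute_paths.add(link)
        # 3. Protocol-relative links inherit the page's scheme.
        elif link.startswith("//"):
            absolute_paths.add(f"{urlparse(url).scheme}:{link}")
        # 4. Resolve everything else against the page url.
        else:
            absolute_paths.add(urljoin(url, link))
    return absolute_paths

print(resolve_links(
    ["https://example.com/api/ref", "//example.com/static/a", "guide"],
    "https://example.com/docs/",
    exclude_prefixes=("https://example.com/api",),
))
# -> {'https://example.com/static/a', 'https://example.com/docs/guide'} (set order may vary)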
