skip excluded sublinks before recursion (#11036)

Bagatur authored 2023-09-26 02:24:54 -07:00; committed by GitHub
parent 9c5eca92e4
commit a2f7246f0e
2 changed files with 14 additions and 6 deletions


@@ -99,6 +99,13 @@ class RecursiveUrlLoader(BaseLoader):
             else _metadata_extractor
         )
         self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
+        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
+            raise ValueError(
+                f"Base url is included in exclude_dirs. Received base_url: {url} and "
+                f"exclude_dirs: {self.exclude_dirs}"
+            )
         self.timeout = timeout
         self.prevent_outside = prevent_outside if prevent_outside is not None else True
         self.link_regex = link_regex
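The new guard above fails fast when the base url itself falls under an excluded prefix, since that configuration would filter out every page. A minimal sketch of triggering it; the import path reflects langchain at the time of this commit, and the URLs are invented:

# Hedged sketch: the exclude_dirs entry below is a prefix of the base url,
# so construction should raise before any page is fetched.
from langchain.document_loaders import RecursiveUrlLoader

try:
    RecursiveUrlLoader(
        url="https://docs.example.com/api/",
        exclude_dirs=["https://docs.example.com/"],
    )
except ValueError as err:
    print(err)
    # Base url is included in exclude_dirs. Received base_url:
    # https://docs.example.com/api/ and exclude_dirs: ['https://docs.example.com/']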
@@ -149,6 +156,7 @@ class RecursiveUrlLoader(BaseLoader):
            base_url=self.url,
            pattern=self.link_regex,
            prevent_outside=self.prevent_outside,
+           exclude_prefixes=self.exclude_dirs,
        )
        for link in sub_links:
            # Check all unvisited links
@@ -182,10 +190,6 @@ class RecursiveUrlLoader(BaseLoader):
         if depth >= self.max_depth:
             return []
-        # Exclude the root and parent from a list
-        # Exclude the links that start with any of the excluded directories
-        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
-            return []
         # Disable SSL verification because websites may have invalid SSL certificates,
         # but won't cause any security issues for us.
         close_session = session is None
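With the early-exit check above removed, exclusion now happens once per page inside extract_sub_links (next file), so excluded sublinks are never scheduled for recursion at all. A standalone sketch of the filtering idea, with invented names:

# Standalone sketch (names are illustrative, not from the library): filter
# links by excluded prefixes once, up front, instead of bailing out inside
# each recursive call.
from typing import Iterable, List, Sequence

def drop_excluded(links: Iterable[str], exclude_prefixes: Sequence[str]) -> List[str]:
    """Keep only links that start with none of the excluded prefixes."""
    return [
        link
        for link in links
        if not any(link.startswith(prefix) for prefix in exclude_prefixes)
    ]

print(drop_excluded(
    ["https://a.example/keep", "https://a.example/private/x"],
    ["https://a.example/private"],
))
# -> ['https://a.example/keep']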


@@ -1,5 +1,5 @@
 import re
-from typing import List, Optional, Union
+from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse

 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
@@ -42,6 +42,7 @@ def extract_sub_links(
     base_url: Optional[str] = None,
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
+    exclude_prefixes: Sequence[str] = (),
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.
@@ -52,6 +53,7 @@ def extract_sub_links(
         pattern: Regex to use for extracting links from raw html.
         prevent_outside: If True, ignore external links which are not children
             of the base url.
+        exclude_prefixes: Exclude any URLs that start with one of these prefixes.

     Returns:
         List[str]: sub links
@@ -60,8 +62,10 @@ def extract_sub_links(
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
+        if any(link.startswith(exclude) for exclude in exclude_prefixes):
+            continue
         # Some may be absolute links like https://to/path
-        if link.startswith("http"):
+        elif link.startswith("http"):
             absolute_paths.add(link)
         # Some may have omitted the protocol like //to/path
         elif link.startswith("//"):
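A hedged usage sketch of the extended helper. As the diff shows, exclude_prefixes is matched against each raw href value before any other handling (including urljoin resolution), so the example uses absolute links. The module path is assumed from langchain at the time of this commit; the HTML snippet and URLs are invented:

# Hedged sketch: the /private link is dropped before recursion would see it.
from langchain.utils.html import extract_sub_links

html = (
    '<a href="https://example.com/docs/a">keep</a>'
    '<a href="https://example.com/private/b">skip</a>'
)
links = extract_sub_links(
    html,
    "https://example.com/docs/",
    base_url="https://example.com/",
    exclude_prefixes=("https://example.com/private",),
)
print(links)
# -> ['https://example.com/docs/a']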