skip excluded sublinks before recursion (#11036)

2024-11-04 06:00:26 +00:00 · 2023-09-26 02:24:54 -07:00 · 2023-09-26 02:24:54 -07:00 · a2f7246f0e
commit a2f7246f0e
parent 9c5eca92e4
2 changed files with 14 additions and 6 deletions
--- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py
+++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -99,6 +99,13 @@ class RecursiveUrlLoader(BaseLoader):
            else _metadata_extractor
        )
        self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
            raise ValueError(
                f"Base url is included in exclude_dirs. Received base_url: {url} and "
                f"exclude_dirs: {self.exclude_dirs}"
            )
        self.timeout = timeout
        self.prevent_outside = prevent_outside if prevent_outside is not None else True
        self.link_regex = link_regex
@@ -149,6 +156,7 @@ class RecursiveUrlLoader(BaseLoader):
            base_url=self.url,
            pattern=self.link_regex,
            prevent_outside=self.prevent_outside,
            exclude_prefixes=self.exclude_dirs,
        )
        for link in sub_links:
            # Check all unvisited links
@@ -182,10 +190,6 @@ class RecursiveUrlLoader(BaseLoader):
        if depth >= self.max_depth:
            return []
        # Exclude the root and parent from a list
        # Exclude the links that start with any of the excluded directories
        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
            return []
        # Disable SSL verification because websites may have invalid SSL certificates,
        # but won't cause any security issues for us.
        close_session = session is None
--- a/libs/langchain/langchain/utils/html.py
+++ b/libs/langchain/langchain/utils/html.py
@@ -1,5 +1,5 @@
 import re
-from typing import List, Optional, Union
+from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse
 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
@@ -42,6 +42,7 @@ def extract_sub_links(
    base_url: Optional[str] = None,
    pattern: Union[str, re.Pattern, None] = None,
    prevent_outside: bool = True,
    exclude_prefixes: Sequence[str] = (),
 ) -> List[str]:
    """Extract all links from a raw html string and convert into absolute paths.
@@ -52,6 +53,7 @@ def extract_sub_links(
        pattern: Regex to use for extracting links from raw html.
        prevent_outside: If True, ignore external links which are not children
            of the base url.
        exclude_prefixes: Exclude any URLs that start with one of these prefixes.
    Returns:
        List[str]: sub links
@@ -60,8 +62,10 @@ def extract_sub_links(
    all_links = find_all_links(raw_html, pattern=pattern)
    absolute_paths = set()
    for link in all_links:
        if any(link.startswith(exclude) for exclude in exclude_prefixes):
            continue
        # Some may be absolute links like https://to/path
-        if link.startswith("http"):
+        elif link.startswith("http"):
            absolute_paths.add(link)
        # Some may have omitted the protocol like //to/path
        elif link.startswith("//"):