From a2f7246f0eeafff29b63c90d45ec9e9eb6c21812 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Tue, 26 Sep 2023 02:24:54 -0700 Subject: [PATCH] skip excluded sublinks before recursion (#11036) --- .../document_loaders/recursive_url_loader.py | 12 ++++++++---- libs/langchain/langchain/utils/html.py | 8 ++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py index 4781609ac0..5bb2350c5c 100644 --- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py +++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py @@ -99,6 +99,13 @@ class RecursiveUrlLoader(BaseLoader): else _metadata_extractor ) self.exclude_dirs = exclude_dirs if exclude_dirs is not None else () + + if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs): + raise ValueError( + f"Base url is included in exclude_dirs. Received base_url: {url} and " + f"exclude_dirs: {self.exclude_dirs}" + ) + self.timeout = timeout self.prevent_outside = prevent_outside if prevent_outside is not None else True self.link_regex = link_regex @@ -149,6 +156,7 @@ class RecursiveUrlLoader(BaseLoader): base_url=self.url, pattern=self.link_regex, prevent_outside=self.prevent_outside, + exclude_prefixes=self.exclude_dirs, ) for link in sub_links: # Check all unvisited links @@ -182,10 +190,6 @@ class RecursiveUrlLoader(BaseLoader): if depth >= self.max_depth: return [] - # Exclude the root and parent from a list - # Exclude the links that start with any of the excluded directories - if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs): - return [] # Disable SSL verification because websites may have invalid SSL certificates, # but won't cause any security issues for us. close_session = session is None diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py index d1f76cdabd..d981b1dc7a 100644 --- a/libs/langchain/langchain/utils/html.py +++ b/libs/langchain/langchain/utils/html.py @@ -1,5 +1,5 @@ import re -from typing import List, Optional, Union +from typing import List, Optional, Sequence, Union from urllib.parse import urljoin, urlparse PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#") @@ -42,6 +42,7 @@ def extract_sub_links( base_url: Optional[str] = None, pattern: Union[str, re.Pattern, None] = None, prevent_outside: bool = True, + exclude_prefixes: Sequence[str] = (), ) -> List[str]: """Extract all links from a raw html string and convert into absolute paths. @@ -52,6 +53,7 @@ def extract_sub_links( pattern: Regex to use for extracting links from raw html. prevent_outside: If True, ignore external links which are not children of the base url. + exclude_prefixes: Exclude any URLs that start with one of these prefixes. Returns: List[str]: sub links @@ -60,8 +62,10 @@ def extract_sub_links( all_links = find_all_links(raw_html, pattern=pattern) absolute_paths = set() for link in all_links: + if any(link.startswith(exclude) for exclude in exclude_prefixes): + continue # Some may be absolute links like https://to/path - if link.startswith("http"): + elif link.startswith("http"): absolute_paths.add(link) # Some may have omitted the protocol like //to/path elif link.startswith("//"):