From a2f7246f0eeafff29b63c90d45ec9e9eb6c21812 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Tue, 26 Sep 2023 02:24:54 -0700
Subject: [PATCH] skip excluded sublinks before recursion (#11036)

---
 .../document_loaders/recursive_url_loader.py         | 12 ++++++++----
 libs/langchain/langchain/utils/html.py               |  8 ++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
index 4781609ac0..5bb2350c5c 100644
--- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py
+++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -99,6 +99,13 @@ class RecursiveUrlLoader(BaseLoader):
             else _metadata_extractor
         )
         self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
+
+        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
+            raise ValueError(
+                f"Base url is included in exclude_dirs. Received base_url: {url} and "
+                f"exclude_dirs: {self.exclude_dirs}"
+            )
+
         self.timeout = timeout
         self.prevent_outside = prevent_outside if prevent_outside is not None else True
         self.link_regex = link_regex
@@ -149,6 +156,7 @@ class RecursiveUrlLoader(BaseLoader):
             base_url=self.url,
             pattern=self.link_regex,
             prevent_outside=self.prevent_outside,
+            exclude_prefixes=self.exclude_dirs,
         )
         for link in sub_links:
             # Check all unvisited links
@@ -182,10 +190,6 @@ class RecursiveUrlLoader(BaseLoader):
         if depth >= self.max_depth:
             return []
 
-        # Exclude the root and parent from a list
-        # Exclude the links that start with any of the excluded directories
-        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
-            return []
         # Disable SSL verification because websites may have invalid SSL certificates,
         # but won't cause any security issues for us.
         close_session = session is None
diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py
index d1f76cdabd..d981b1dc7a 100644
--- a/libs/langchain/langchain/utils/html.py
+++ b/libs/langchain/langchain/utils/html.py
@@ -1,5 +1,5 @@
 import re
-from typing import List, Optional, Union
+from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse
 
 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
@@ -42,6 +42,7 @@ def extract_sub_links(
     base_url: Optional[str] = None,
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
+    exclude_prefixes: Sequence[str] = (),
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.
 
@@ -52,6 +53,7 @@ def extract_sub_links(
         pattern: Regex to use for extracting links from raw html.
         prevent_outside: If True, ignore external links which are not children
             of the base url.
+        exclude_prefixes: Exclude any URLs that start with one of these prefixes.
 
     Returns:
         List[str]: sub links
@@ -60,8 +62,10 @@ def extract_sub_links(
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
+        if any(link.startswith(exclude) for exclude in exclude_prefixes):
+            continue
         # Some may be absolute links like https://to/path
-        if link.startswith("http"):
+        elif link.startswith("http"):
             absolute_paths.add(link)
         # Some may have omitted the protocol like //to/path
         elif link.startswith("//"):