diff --git a/libs/langchain/langchain/document_loaders/recursive_url_loader.py b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
index a3fdbbcfcc..60ee27013e 100644
--- a/libs/langchain/langchain/document_loaders/recursive_url_loader.py
+++ b/libs/langchain/langchain/document_loaders/recursive_url_loader.py
@@ -126,9 +126,6 @@ class RecursiveUrlLoader(BaseLoader):
         if depth >= self.max_depth:
             return
 
-        # Exclude the links that start with any of the excluded directories
-        if any(url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs):
-            return
 
         # Get all links that can be accessed from the current URL
         visited.add(url)
diff --git a/libs/langchain/langchain/utils/html.py b/libs/langchain/langchain/utils/html.py
index d981b1dc7a..09a76876d1 100644
--- a/libs/langchain/langchain/utils/html.py
+++ b/libs/langchain/langchain/utils/html.py
@@ -62,16 +62,19 @@ def extract_sub_links(
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
-        if any(link.startswith(exclude) for exclude in exclude_prefixes):
-            continue
         # Some may be absolute links like https://to/path
-        elif link.startswith("http"):
+        if link.startswith("http"):
             absolute_paths.add(link)
         # Some may have omitted the protocol like //to/path
         elif link.startswith("//"):
             absolute_paths.add(f"{urlparse(url).scheme}:{link}")
         else:
             absolute_paths.add(urljoin(url, link))
-    if prevent_outside:
-        return [p for p in absolute_paths if p.startswith(base_url)]
-    return list(absolute_paths)
+    res = []
+    for path in absolute_paths:
+        if any(path.startswith(exclude) for exclude in exclude_prefixes):
+            continue
+        if prevent_outside and not path.startswith(base_url):
+            continue
+        res.append(path)
+    return res
diff --git a/libs/langchain/tests/unit_tests/utils/test_html.py b/libs/langchain/tests/unit_tests/utils/test_html.py
index b961f966d9..692eae5865 100644
--- a/libs/langchain/tests/unit_tests/utils/test_html.py
+++ b/libs/langchain/tests/unit_tests/utils/test_html.py
@@ -127,3 +127,32 @@ def test_extract_sub_links_base() -> None:
         )
     )
     assert actual == expected
+
+
+def test_extract_sub_links_exclude() -> None:
+    html = (
+        '<a href="https://foobar.com">one</a>'
+        '<a href="http://baz.net">two</a>'
+        '<a href="/hello">three</a>'
+        '<a href="alexis.html">four</a>'
+        '<a href="/how/to/farm"></a><a href="http://baz.org"></a>'
+    )
+
+    expected = sorted(
+        [
+            "http://baz.net",
+            "https://foobar.com",
+            "https://foobar.com/hello",
+            "https://foobar.com/hello/alexis.html",
+        ]
+    )
+    actual = sorted(
+        extract_sub_links(
+            html,
+            "https://foobar.com/hello/bill.html",
+            base_url="https://foobar.com",
+            prevent_outside=False,
+            exclude_prefixes=("https://foobar.com/how", "http://baz.org"),
+        )
+    )
+    assert actual == expected