Fixed handling of absolute URLs in RecursiveUrlLoader (#7677)

<!-- Thank you for contributing to LangChain!

Replace this comment with:
  - Description:
  - Issue: the issue # it fixes (if applicable),
  - Dependencies: any dependencies required for this change,
- Tag maintainer: for a quicker response, tag the relevant maintainer
(see below),
- Twitter handle: we announce bigger features on Twitter. If your PR
gets announced and you'd like a mention, we'll gladly shout you out!

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
  2. an example notebook showing its use.

Maintainer responsibilities:
  - General / Misc / if you don't know who to tag: @baskaryan
  - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
  - Models / Prompts: @hwchase17, @baskaryan
  - Memory: @hwchase17
  - Agents / Tools / Toolkits: @hinthornw
  - Tracing / Callbacks: @agola11
  - Async: @agola11

If no one reviews your PR within a few days, feel free to @-mention the
same people again.

See contribution guidelines for more information on how to write/run
tests, lint, etc:
https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md
 -->

## Description
This PR addresses a bug in the RecursiveUrlLoader class where absolute
URLs were being treated as relative URLs, causing malformed URLs to be
produced. The fix involves using the urljoin function from the
urllib.parse module to correctly handle both absolute and relative URLs.

@rlancemartin @eyurtsev

---------

Co-authored-by: Lance Martin <lance@langchain.dev>
This commit is contained in:
Kenton Parton 2023-07-13 23:34:00 +01:00 committed by GitHub
parent c087ce74f7
commit 9124221d31
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 73 additions and 5 deletions

View File

@ -1,5 +1,5 @@
from typing import Iterator, List, Optional, Set from typing import Iterator, List, Optional, Set
from urllib.parse import urlparse from urllib.parse import urljoin, urlparse
import requests import requests
@ -73,10 +73,7 @@ class RecursiveUrlLoader(BaseLoader):
) )
# Get absolute path for all root relative links listed # Get absolute path for all root relative links listed
absolute_paths = [ absolute_paths = [urljoin(base_url, link) for link in child_links]
f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}{link}"
for link in child_links
]
# Store the visited links and recursively visit the children # Store the visited links and recursively visit the children
for link in absolute_paths: for link in absolute_paths:

View File

@ -0,0 +1,71 @@
from typing import Any, Callable
from unittest.mock import MagicMock, Mock
import pytest
from pytest import MonkeyPatch
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
@pytest.fixture
def url_loader() -> RecursiveUrlLoader:
url = "http://test.com"
exclude_dir = "/exclude" # Note: Changed from list to single string
return RecursiveUrlLoader(url, exclude_dir)
@pytest.fixture
def mock_requests_get(monkeypatch: MonkeyPatch) -> None:
"""Mock requests.get"""
# Mocking HTML content with 2 links, one absolute, one relative.
html_content = """
<html>
<body>
<a href="/relative">relative link</a>
<a href="http://test.com/absolute">absolute link</a>
</body>
</html>
"""
# Mock Response object for main URL
mock_response_main = MagicMock()
mock_response_main.text = html_content
# Mock Response object for relative URL
mock_response_relative = MagicMock()
mock_response_relative.text = "Relative page"
# Mock Response object for absolute URL
mock_response_absolute = MagicMock()
mock_response_absolute.text = "Absolute page"
# Mock Response object for default
mock_response_default = MagicMock()
mock_response_default.text = "Default page"
def mock_get(url: str, *args: Any, **kwargs: Any) -> Mock:
if url.startswith("http://test.com"):
if "/absolute" in url:
return mock_response_absolute
elif "/relative" in url:
return mock_response_relative
else:
return mock_response_main
return mock_response_default
monkeypatch.setattr(
"langchain.document_loaders.recursive_url_loader.requests.get", mock_get
)
def test_get_child_links_recursive(
url_loader: RecursiveUrlLoader, mock_requests_get: Callable[[], None]
) -> None:
# Testing for both relative and absolute URL
child_links = url_loader.get_child_links_recursive("http://test.com")
assert child_links == {
"http://test.com/relative",
"http://test.com/absolute",
}