diff --git a/langchain/document_loaders/recursive_url_loader.py b/langchain/document_loaders/recursive_url_loader.py index 7462d85888..f7438c5d10 100644 --- a/langchain/document_loaders/recursive_url_loader.py +++ b/langchain/document_loaders/recursive_url_loader.py @@ -1,5 +1,5 @@ from typing import Iterator, List, Optional, Set -from urllib.parse import urlparse +from urllib.parse import urljoin, urlparse import requests @@ -73,10 +73,7 @@ class RecursiveUrlLoader(BaseLoader): ) # Get absolute path for all root relative links listed - absolute_paths = [ - f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}{link}" - for link in child_links - ] + absolute_paths = [urljoin(base_url, link) for link in child_links] # Store the visited links and recursively visit the children for link in absolute_paths: diff --git a/tests/unit_tests/document_loaders/test_recursive_url_loader.py b/tests/unit_tests/document_loaders/test_recursive_url_loader.py new file mode 100644 index 0000000000..315cc88b51 --- /dev/null +++ b/tests/unit_tests/document_loaders/test_recursive_url_loader.py @@ -0,0 +1,71 @@ +from typing import Any, Callable +from unittest.mock import MagicMock, Mock + +import pytest +from pytest import MonkeyPatch + +from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader + + +@pytest.fixture +def url_loader() -> RecursiveUrlLoader: + url = "http://test.com" + exclude_dir = "/exclude" # Note: Changed from list to single string + return RecursiveUrlLoader(url, exclude_dir) + + +@pytest.fixture +def mock_requests_get(monkeypatch: MonkeyPatch) -> None: + """Mock requests.get""" + + # Mocking HTML content with 2 links, one absolute, one relative. + html_content = """ + + + relative link + absolute link + + + """ + + # Mock Response object for main URL + mock_response_main = MagicMock() + mock_response_main.text = html_content + + # Mock Response object for relative URL + mock_response_relative = MagicMock() + mock_response_relative.text = "Relative page" + + # Mock Response object for absolute URL + mock_response_absolute = MagicMock() + mock_response_absolute.text = "Absolute page" + + # Mock Response object for default + mock_response_default = MagicMock() + mock_response_default.text = "Default page" + + def mock_get(url: str, *args: Any, **kwargs: Any) -> Mock: + if url.startswith("http://test.com"): + if "/absolute" in url: + return mock_response_absolute + elif "/relative" in url: + return mock_response_relative + else: + return mock_response_main + return mock_response_default + + monkeypatch.setattr( + "langchain.document_loaders.recursive_url_loader.requests.get", mock_get + ) + + +def test_get_child_links_recursive( + url_loader: RecursiveUrlLoader, mock_requests_get: Callable[[], None] +) -> None: + # Testing for both relative and absolute URL + child_links = url_loader.get_child_links_recursive("http://test.com") + + assert child_links == { + "http://test.com/relative", + "http://test.com/absolute", + }