mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
Fixed handling of absolute URLs in RecursiveUrlLoader
(#7677)
<!-- Thank you for contributing to LangChain! Replace this comment with: - Description: - Issue: the issue # it fixes (if applicable), - Dependencies: any dependencies required for this change, - Tag maintainer: for a quicker response, tag the relevant maintainer (see below), - Twitter handle: we announce bigger features on Twitter. If your PR gets announced and you'd like a mention, we'll gladly shout you out! If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. Maintainer responsibilities: - General / Misc / if you don't know who to tag: @baskaryan - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev - Models / Prompts: @hwchase17, @baskaryan - Memory: @hwchase17 - Agents / Tools / Toolkits: @hinthornw - Tracing / Callbacks: @agola11 - Async: @agola11 If no one reviews your PR within a few days, feel free to @-mention the same people again. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md --> ## Description This PR addresses a bug in the RecursiveUrlLoader class where absolute URLs were being treated as relative URLs, causing malformed URLs to be produced. The fix involves using the urljoin function from the urllib.parse module to correctly handle both absolute and relative URLs. @rlancemartin @eyurtsev --------- Co-authored-by: Lance Martin <lance@langchain.dev>
This commit is contained in:
parent
c087ce74f7
commit
9124221d31
@ -1,5 +1,5 @@
|
|||||||
from typing import Iterator, List, Optional, Set
|
from typing import Iterator, List, Optional, Set
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
@ -73,10 +73,7 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Get absolute path for all root relative links listed
|
# Get absolute path for all root relative links listed
|
||||||
absolute_paths = [
|
absolute_paths = [urljoin(base_url, link) for link in child_links]
|
||||||
f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}{link}"
|
|
||||||
for link in child_links
|
|
||||||
]
|
|
||||||
|
|
||||||
# Store the visited links and recursively visit the children
|
# Store the visited links and recursively visit the children
|
||||||
for link in absolute_paths:
|
for link in absolute_paths:
|
||||||
|
@ -0,0 +1,71 @@
|
|||||||
|
from typing import Any, Callable
|
||||||
|
from unittest.mock import MagicMock, Mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pytest import MonkeyPatch
|
||||||
|
|
||||||
|
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def url_loader() -> RecursiveUrlLoader:
|
||||||
|
url = "http://test.com"
|
||||||
|
exclude_dir = "/exclude" # Note: Changed from list to single string
|
||||||
|
return RecursiveUrlLoader(url, exclude_dir)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_requests_get(monkeypatch: MonkeyPatch) -> None:
|
||||||
|
"""Mock requests.get"""
|
||||||
|
|
||||||
|
# Mocking HTML content with 2 links, one absolute, one relative.
|
||||||
|
html_content = """
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<a href="/relative">relative link</a>
|
||||||
|
<a href="http://test.com/absolute">absolute link</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Mock Response object for main URL
|
||||||
|
mock_response_main = MagicMock()
|
||||||
|
mock_response_main.text = html_content
|
||||||
|
|
||||||
|
# Mock Response object for relative URL
|
||||||
|
mock_response_relative = MagicMock()
|
||||||
|
mock_response_relative.text = "Relative page"
|
||||||
|
|
||||||
|
# Mock Response object for absolute URL
|
||||||
|
mock_response_absolute = MagicMock()
|
||||||
|
mock_response_absolute.text = "Absolute page"
|
||||||
|
|
||||||
|
# Mock Response object for default
|
||||||
|
mock_response_default = MagicMock()
|
||||||
|
mock_response_default.text = "Default page"
|
||||||
|
|
||||||
|
def mock_get(url: str, *args: Any, **kwargs: Any) -> Mock:
|
||||||
|
if url.startswith("http://test.com"):
|
||||||
|
if "/absolute" in url:
|
||||||
|
return mock_response_absolute
|
||||||
|
elif "/relative" in url:
|
||||||
|
return mock_response_relative
|
||||||
|
else:
|
||||||
|
return mock_response_main
|
||||||
|
return mock_response_default
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"langchain.document_loaders.recursive_url_loader.requests.get", mock_get
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_child_links_recursive(
|
||||||
|
url_loader: RecursiveUrlLoader, mock_requests_get: Callable[[], None]
|
||||||
|
) -> None:
|
||||||
|
# Testing for both relative and absolute URL
|
||||||
|
child_links = url_loader.get_child_links_recursive("http://test.com")
|
||||||
|
|
||||||
|
assert child_links == {
|
||||||
|
"http://test.com/relative",
|
||||||
|
"http://test.com/absolute",
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user