mirror of https://github.com/hwchase17/langchain
Make recursive loader yield while crawling (#7568)
Support actual lazy_load since it can take a while to crawl larger directories.pull/7694/head
parent
82f3e32d8d
commit
6325a3517c
@ -1,71 +0,0 @@
|
|||||||
from typing import Any, Callable
|
|
||||||
from unittest.mock import MagicMock, Mock
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from pytest import MonkeyPatch
|
|
||||||
|
|
||||||
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def url_loader() -> RecursiveUrlLoader:
|
|
||||||
url = "http://test.com"
|
|
||||||
exclude_dir = "/exclude" # Note: Changed from list to single string
|
|
||||||
return RecursiveUrlLoader(url, exclude_dir)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def mock_requests_get(monkeypatch: MonkeyPatch) -> None:
|
|
||||||
"""Mock requests.get"""
|
|
||||||
|
|
||||||
# Mocking HTML content with 2 links, one absolute, one relative.
|
|
||||||
html_content = """
|
|
||||||
<html>
|
|
||||||
<body>
|
|
||||||
<a href="/relative">relative link</a>
|
|
||||||
<a href="http://test.com/absolute">absolute link</a>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Mock Response object for main URL
|
|
||||||
mock_response_main = MagicMock()
|
|
||||||
mock_response_main.text = html_content
|
|
||||||
|
|
||||||
# Mock Response object for relative URL
|
|
||||||
mock_response_relative = MagicMock()
|
|
||||||
mock_response_relative.text = "Relative page"
|
|
||||||
|
|
||||||
# Mock Response object for absolute URL
|
|
||||||
mock_response_absolute = MagicMock()
|
|
||||||
mock_response_absolute.text = "Absolute page"
|
|
||||||
|
|
||||||
# Mock Response object for default
|
|
||||||
mock_response_default = MagicMock()
|
|
||||||
mock_response_default.text = "Default page"
|
|
||||||
|
|
||||||
def mock_get(url: str, *args: Any, **kwargs: Any) -> Mock:
|
|
||||||
if url.startswith("http://test.com"):
|
|
||||||
if "/absolute" in url:
|
|
||||||
return mock_response_absolute
|
|
||||||
elif "/relative" in url:
|
|
||||||
return mock_response_relative
|
|
||||||
else:
|
|
||||||
return mock_response_main
|
|
||||||
return mock_response_default
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"langchain.document_loaders.recursive_url_loader.requests.get", mock_get
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_child_links_recursive(
|
|
||||||
url_loader: RecursiveUrlLoader, mock_requests_get: Callable[[], None]
|
|
||||||
) -> None:
|
|
||||||
# Testing for both relative and absolute URL
|
|
||||||
child_links = url_loader.get_child_links_recursive("http://test.com")
|
|
||||||
|
|
||||||
assert child_links == {
|
|
||||||
"http://test.com/relative",
|
|
||||||
"http://test.com/absolute",
|
|
||||||
}
|
|
Loading…
Reference in New Issue