diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/recursive_url_loader.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/recursive_url_loader.ipynb index 06903220f6..a2e6719cfe 100644 --- a/docs/extras/modules/data_connection/document_loaders/integrations/recursive_url_loader.ipynb +++ b/docs/extras/modules/data_connection/document_loaders/integrations/recursive_url_loader.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "5a7cc773", "metadata": {}, @@ -25,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "2e3532b2", "metadata": {}, "outputs": [], @@ -34,7 +33,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "6384c057", "metadata": {}, @@ -44,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "d69e5620", "metadata": {}, "outputs": [], @@ -56,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "084fb2ce", "metadata": {}, "outputs": [ @@ -66,7 +64,7 @@ "12" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -77,17 +75,17 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "89355b7c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'\\n\\n\\n\\n\\nDynamoDB-Backed Chat Memory | \\uf8ffü¶úÔ∏è\\uf8ffüîó Lan'" + "'\\n\\n\\n\\n\\nBuffer Window Memory | 🦜️🔗 Langchain\\n\\n\\n\\n\\n\\nSki'" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -98,20 +96,20 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "13bd7e16", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'source': 'https://js.langchain.com/docs/modules/memory/examples/dynamodb',\n", - " 'title': 'DynamoDB-Backed Chat Memory | \\uf8ffü¶úÔ∏è\\uf8ffüîó Langchain',\n", - " 'description': 'For longer-term persistence across chat sessions, you can swap out the default in-memory chatHistory that backs chat memory classes like BufferMemory for a DynamoDB instance.',\n", + "{'source': 'https://js.langchain.com/docs/modules/memory/examples/buffer_window_memory',\n", + " 'title': 'Buffer Window Memory | 🦜️🔗 Langchain',\n", + " 'description': 'BufferWindowMemory keeps track of the back-and-forths in conversation, and then uses a window of size k to surface the last k back-and-forths to use as memory.',\n", " 'language': 'en'}" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -121,26 +119,39 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "40fc13ef", "metadata": {}, "source": [ "Now, let's try a more extensive example, the `docs` root dir.\n", "\n", - "We will skip everything under `api`." + "We will skip everything under `api`.\n", + "\n", + "For this, we can `lazy_load` each page as we crawl the tree, using `WebBaseLoader` to load each as we go." ] }, { "cell_type": "code", - "execution_count": 7, - "id": "30ff61d3", + "execution_count": null, + "id": "5c938b9f", "metadata": {}, "outputs": [], "source": [ "url = \"https://js.langchain.com/docs/\"\n", "exclude_dirs = [\"https://js.langchain.com/docs/api/\"]\n", "loader = RecursiveUrlLoader(url=url, exclude_dirs=exclude_dirs)\n", + "# Lazy load each\n", + "docs = [print(doc) or doc for doc in loader.lazy_load()]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "30ff61d3", + "metadata": {}, + "outputs": [], + "source": [ + "# Load all pages\n", "docs = loader.load()" ] }, @@ -148,12 +159,14 @@ "cell_type": "code", "execution_count": 8, "id": "457e30f3", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { "text/plain": [ - "176" + "188" ] }, "execution_count": 8, @@ -174,7 +187,7 @@ { "data": { "text/plain": [ - "'\\n\\n\\n\\n\\nHacker News | \\uf8ffü¶úÔ∏è\\uf8ffüîó Langchain\\n\\n\\n\\n\\n\\nSkip'" + "'\\n\\n\\n\\n\\nAgent Simulations | 🦜️🔗 Langchain\\n\\n\\n\\n\\n\\nSkip t'" ] }, "execution_count": 9, @@ -195,9 +208,9 @@ { "data": { "text/plain": [ - "{'source': 'https://js.langchain.com/docs/modules/indexes/document_loaders/examples/web_loaders/hn',\n", - " 'title': 'Hacker News | \\uf8ffü¶úÔ∏è\\uf8ffüîó Langchain',\n", - " 'description': 'This example goes over how to load data from the hacker news website, using Cheerio. One document will be created for each page.',\n", + "{'source': 'https://js.langchain.com/docs/use_cases/agent_simulations/',\n", + " 'title': 'Agent Simulations | 🦜️🔗 Langchain',\n", + " 'description': 'Agent simulations involve taking multiple agents and having them interact with each other.',\n", " 'language': 'en'}" ] }, diff --git a/langchain/document_loaders/recursive_url_loader.py b/langchain/document_loaders/recursive_url_loader.py index f7438c5d10..be20a45447 100644 --- a/langchain/document_loaders/recursive_url_loader.py +++ b/langchain/document_loaders/recursive_url_loader.py @@ -10,19 +10,24 @@ from langchain.document_loaders.base import BaseLoader class RecursiveUrlLoader(BaseLoader): """Loads all child links from a given url.""" - def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None: + def __init__( + self, + url: str, + exclude_dirs: Optional[str] = None, + ) -> None: """Initialize with URL to crawl and any subdirectories to exclude. Args: url: The URL to crawl. exclude_dirs: A list of subdirectories to exclude. """ + self.url = url self.exclude_dirs = exclude_dirs def get_child_links_recursive( self, url: str, visited: Optional[Set[str]] = None - ) -> Set[str]: + ) -> Iterator[Document]: """Recursively get all child links starting with the path of the input URL. Args: @@ -30,6 +35,8 @@ class RecursiveUrlLoader(BaseLoader): visited: A set of visited URLs. """ + from langchain.document_loaders import WebBaseLoader + try: from bs4 import BeautifulSoup except ImportError: @@ -80,19 +87,20 @@ class RecursiveUrlLoader(BaseLoader): # Check all unvisited links if link not in visited: visited.add(link) + loaded_link = WebBaseLoader(link).load() + if isinstance(loaded_link, list): + yield from loaded_link + else: + yield loaded_link # If the link is a directory (w/ children) then visit it if link.endswith("/"): - visited.update(self.get_child_links_recursive(link, visited)) + yield from self.get_child_links_recursive(link, visited) return visited def lazy_load(self) -> Iterator[Document]: - from langchain.document_loaders import WebBaseLoader - """Lazy load web pages.""" - child_links = self.get_child_links_recursive(self.url) - loader = WebBaseLoader(list(child_links)) - return loader.lazy_load() + return self.get_child_links_recursive(self.url) def load(self) -> List[Document]: """Load web pages.""" diff --git a/tests/unit_tests/document_loaders/test_recursive_url_loader.py b/tests/unit_tests/document_loaders/test_recursive_url_loader.py deleted file mode 100644 index 315cc88b51..0000000000 --- a/tests/unit_tests/document_loaders/test_recursive_url_loader.py +++ /dev/null @@ -1,71 +0,0 @@ -from typing import Any, Callable -from unittest.mock import MagicMock, Mock - -import pytest -from pytest import MonkeyPatch - -from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader - - -@pytest.fixture -def url_loader() -> RecursiveUrlLoader: - url = "http://test.com" - exclude_dir = "/exclude" # Note: Changed from list to single string - return RecursiveUrlLoader(url, exclude_dir) - - -@pytest.fixture -def mock_requests_get(monkeypatch: MonkeyPatch) -> None: - """Mock requests.get""" - - # Mocking HTML content with 2 links, one absolute, one relative. - html_content = """ - - - relative link - absolute link - - - """ - - # Mock Response object for main URL - mock_response_main = MagicMock() - mock_response_main.text = html_content - - # Mock Response object for relative URL - mock_response_relative = MagicMock() - mock_response_relative.text = "Relative page" - - # Mock Response object for absolute URL - mock_response_absolute = MagicMock() - mock_response_absolute.text = "Absolute page" - - # Mock Response object for default - mock_response_default = MagicMock() - mock_response_default.text = "Default page" - - def mock_get(url: str, *args: Any, **kwargs: Any) -> Mock: - if url.startswith("http://test.com"): - if "/absolute" in url: - return mock_response_absolute - elif "/relative" in url: - return mock_response_relative - else: - return mock_response_main - return mock_response_default - - monkeypatch.setattr( - "langchain.document_loaders.recursive_url_loader.requests.get", mock_get - ) - - -def test_get_child_links_recursive( - url_loader: RecursiveUrlLoader, mock_requests_get: Callable[[], None] -) -> None: - # Testing for both relative and absolute URL - child_links = url_loader.get_child_links_recursive("http://test.com") - - assert child_links == { - "http://test.com/relative", - "http://test.com/absolute", - }