From b6e3ac17c419acb246e08bf9cd46f3ee0cf2909d Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 14 May 2023 22:04:38 -0700 Subject: [PATCH] Harrison/sitemap local (#4704) Co-authored-by: Lukas Bauer --- .../examples/example_data/sitemap.xml | 35 +++++++++++++++++++ .../document_loaders/examples/sitemap.ipynb | 34 ++++++++++++++++-- langchain/document_loaders/sitemap.py | 17 +++++++-- .../document_loaders/test_sitemap.py | 10 ++++++ tests/integration_tests/examples/sitemap.xml | 35 +++++++++++++++++++ 5 files changed, 127 insertions(+), 4 deletions(-) create mode 100644 docs/modules/indexes/document_loaders/examples/example_data/sitemap.xml create mode 100644 tests/integration_tests/examples/sitemap.xml diff --git a/docs/modules/indexes/document_loaders/examples/example_data/sitemap.xml b/docs/modules/indexes/document_loaders/examples/example_data/sitemap.xml new file mode 100644 index 0000000000..6ca2636e43 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/example_data/sitemap.xml @@ -0,0 +1,35 @@ + + + + + https://python.langchain.com/en/stable/ + + + 2023-05-04T16:15:31.377584+00:00 + + weekly + 1 + + + + https://python.langchain.com/en/latest/ + + + 2023-05-05T07:52:19.633878+00:00 + + daily + 0.9 + + + + https://python.langchain.com/en/harrison-docs-refactor-3-24/ + + + 2023-03-27T02:32:55.132916+00:00 + + monthly + 0.8 + + + diff --git a/docs/modules/indexes/document_loaders/examples/sitemap.ipynb b/docs/modules/indexes/document_loaders/examples/sitemap.ipynb index 46a4d0bd09..97a3b7afb9 100644 --- a/docs/modules/indexes/document_loaders/examples/sitemap.ipynb +++ b/docs/modules/indexes/document_loaders/examples/sitemap.ipynb @@ -108,7 +108,9 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -125,6 +127,34 @@ "documents[0]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Local Sitemap\n", + "\n", + "The sitemap loader can also be used to load local files." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching pages: 100%|####################################################################################################################################| 3/3 [00:00<00:00, 3.91it/s]\n" + ] + } + ], + "source": [ + "sitemap_loader = SitemapLoader(web_path=\"example_data/sitemap.xml\", is_local=True)\n", + "\n", + "docs = sitemap_loader.load()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -149,7 +179,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/langchain/document_loaders/sitemap.py b/langchain/document_loaders/sitemap.py index 7e3d3e416a..826692a190 100644 --- a/langchain/document_loaders/sitemap.py +++ b/langchain/document_loaders/sitemap.py @@ -32,11 +32,12 @@ class SitemapLoader(WebBaseLoader): blocksize: Optional[int] = None, blocknum: int = 0, meta_function: Optional[Callable] = None, + is_local: bool = False, ): """Initialize with webpage path and optional filter URLs. Args: - web_path: url of the sitemap + web_path: url of the sitemap. can also be a local path filter_urls: list of strings or regexes that will be applied to filter the urls that are parsed and loaded parsing_function: Function to parse bs4.Soup output @@ -45,6 +46,7 @@ class SitemapLoader(WebBaseLoader): meta_function: Function to parse bs4.Soup output for metadata remember when setting this method to also copy metadata["loc"] to metadata["source"] if you are using this field + is_local: whether the sitemap is a local file """ if blocksize is not None and blocksize < 1: @@ -67,6 +69,7 @@ class SitemapLoader(WebBaseLoader): self.meta_function = meta_function or _default_meta_function self.blocksize = blocksize self.blocknum = blocknum + self.is_local = is_local def parse_sitemap(self, soup: Any) -> List[dict]: """Parse sitemap xml and load into a list of dicts.""" @@ -100,7 +103,17 @@ class SitemapLoader(WebBaseLoader): def load(self) -> List[Document]: """Load sitemap.""" - soup = self.scrape("xml") + if self.is_local: + try: + import bs4 + except ImportError: + raise ValueError( + "bs4 package not found, please install it with " "`pip install bs4`" + ) + fp = open(self.web_path) + soup = bs4.BeautifulSoup(fp, "xml") + else: + soup = self.scrape("xml") els = self.parse_sitemap(soup) diff --git a/tests/integration_tests/document_loaders/test_sitemap.py b/tests/integration_tests/document_loaders/test_sitemap.py index b5cb98f3a5..4581c8456e 100644 --- a/tests/integration_tests/document_loaders/test_sitemap.py +++ b/tests/integration_tests/document_loaders/test_sitemap.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Any import pytest @@ -122,3 +123,12 @@ def test_sitemap_metadata_default() -> None: assert len(documents) > 1 assert "source" in documents[0].metadata assert "loc" in documents[0].metadata + + +def test_local_sitemap() -> None: + """Test sitemap loader.""" + file_path = Path(__file__).parent.parent / "examples/sitemap.xml" + loader = SitemapLoader(str(file_path)) + documents = loader.load() + assert len(documents) > 1 + assert "🦜🔗" in documents[0].page_content diff --git a/tests/integration_tests/examples/sitemap.xml b/tests/integration_tests/examples/sitemap.xml new file mode 100644 index 0000000000..6ca2636e43 --- /dev/null +++ b/tests/integration_tests/examples/sitemap.xml @@ -0,0 +1,35 @@ + + + + + https://python.langchain.com/en/stable/ + + + 2023-05-04T16:15:31.377584+00:00 + + weekly + 1 + + + + https://python.langchain.com/en/latest/ + + + 2023-05-05T07:52:19.633878+00:00 + + daily + 0.9 + + + + https://python.langchain.com/en/harrison-docs-refactor-3-24/ + + + 2023-03-27T02:32:55.132916+00:00 + + monthly + 0.8 + + +