diff --git a/docs/modules/indexes/document_loaders/examples/example_data/sitemap.xml b/docs/modules/indexes/document_loaders/examples/example_data/sitemap.xml
new file mode 100644
index 00000000..6ca2636e
--- /dev/null
+++ b/docs/modules/indexes/document_loaders/examples/example_data/sitemap.xml
@@ -0,0 +1,35 @@
+
+
+
+
+ https://python.langchain.com/en/stable/
+
+
+ 2023-05-04T16:15:31.377584+00:00
+
+ weekly
+ 1
+
+
+
+ https://python.langchain.com/en/latest/
+
+
+ 2023-05-05T07:52:19.633878+00:00
+
+ daily
+ 0.9
+
+
+
+ https://python.langchain.com/en/harrison-docs-refactor-3-24/
+
+
+ 2023-03-27T02:32:55.132916+00:00
+
+ monthly
+ 0.8
+
+
+
diff --git a/docs/modules/indexes/document_loaders/examples/sitemap.ipynb b/docs/modules/indexes/document_loaders/examples/sitemap.ipynb
index 46a4d0bd..97a3b7af 100644
--- a/docs/modules/indexes/document_loaders/examples/sitemap.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/sitemap.ipynb
@@ -108,7 +108,9 @@
{
"cell_type": "code",
"execution_count": 14,
- "metadata": {},
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [
{
"data": {
@@ -125,6 +127,34 @@
"documents[0]"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Local Sitemap\n",
+ "\n",
+ "The sitemap loader can also be used to load local files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Fetching pages: 100%|####################################################################################################################################| 3/3 [00:00<00:00, 3.91it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "sitemap_loader = SitemapLoader(web_path=\"example_data/sitemap.xml\", is_local=True)\n",
+ "\n",
+ "docs = sitemap_loader.load()"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
@@ -149,7 +179,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.9.1"
}
},
"nbformat": 4,
diff --git a/langchain/document_loaders/sitemap.py b/langchain/document_loaders/sitemap.py
index 7e3d3e41..826692a1 100644
--- a/langchain/document_loaders/sitemap.py
+++ b/langchain/document_loaders/sitemap.py
@@ -32,11 +32,12 @@ class SitemapLoader(WebBaseLoader):
blocksize: Optional[int] = None,
blocknum: int = 0,
meta_function: Optional[Callable] = None,
+ is_local: bool = False,
):
"""Initialize with webpage path and optional filter URLs.
Args:
- web_path: url of the sitemap
+ web_path: url of the sitemap. can also be a local path
filter_urls: list of strings or regexes that will be applied to filter the
urls that are parsed and loaded
parsing_function: Function to parse bs4.Soup output
@@ -45,6 +46,7 @@ class SitemapLoader(WebBaseLoader):
meta_function: Function to parse bs4.Soup output for metadata
remember when setting this method to also copy metadata["loc"]
to metadata["source"] if you are using this field
+ is_local: whether the sitemap is a local file
"""
if blocksize is not None and blocksize < 1:
@@ -67,6 +69,7 @@ class SitemapLoader(WebBaseLoader):
self.meta_function = meta_function or _default_meta_function
self.blocksize = blocksize
self.blocknum = blocknum
+ self.is_local = is_local
def parse_sitemap(self, soup: Any) -> List[dict]:
"""Parse sitemap xml and load into a list of dicts."""
@@ -100,7 +103,17 @@ class SitemapLoader(WebBaseLoader):
def load(self) -> List[Document]:
"""Load sitemap."""
- soup = self.scrape("xml")
+        if self.is_local:
+            try:
+                import bs4
+            except ImportError as e:
+                raise ValueError(
+                    "bs4 package not found, please install it with `pip install bs4`"
+                ) from e
+            with open(self.web_path) as fp:  # parsed eagerly; close handle after
+                soup = bs4.BeautifulSoup(fp, "xml")
+        else:
+            soup = self.scrape("xml")
els = self.parse_sitemap(soup)
diff --git a/tests/integration_tests/document_loaders/test_sitemap.py b/tests/integration_tests/document_loaders/test_sitemap.py
index b5cb98f3..4581c845 100644
--- a/tests/integration_tests/document_loaders/test_sitemap.py
+++ b/tests/integration_tests/document_loaders/test_sitemap.py
@@ -1,3 +1,4 @@
+from pathlib import Path
from typing import Any
import pytest
@@ -122,3 +123,12 @@ def test_sitemap_metadata_default() -> None:
assert len(documents) > 1
assert "source" in documents[0].metadata
assert "loc" in documents[0].metadata
+
+
+def test_local_sitemap() -> None:
+    """Test that the sitemap loader can read a sitemap from a local file."""
+    file_path = Path(__file__).parent.parent / "examples/sitemap.xml"
+    loader = SitemapLoader(str(file_path), is_local=True)
+    documents = loader.load()
+    assert len(documents) > 1
+    assert "🦜🔗" in documents[0].page_content
diff --git a/tests/integration_tests/examples/sitemap.xml b/tests/integration_tests/examples/sitemap.xml
new file mode 100644
index 00000000..6ca2636e
--- /dev/null
+++ b/tests/integration_tests/examples/sitemap.xml
@@ -0,0 +1,35 @@
+
+
+
+
+ https://python.langchain.com/en/stable/
+
+
+ 2023-05-04T16:15:31.377584+00:00
+
+ weekly
+ 1
+
+
+
+ https://python.langchain.com/en/latest/
+
+
+ 2023-05-05T07:52:19.633878+00:00
+
+ daily
+ 0.9
+
+
+
+ https://python.langchain.com/en/harrison-docs-refactor-3-24/
+
+
+ 2023-03-27T02:32:55.132916+00:00
+
+ monthly
+ 0.8
+
+
+