Fix: Sitemap Document Loader Tests and Documentation (#11866)

**Description:**
While working on the Docusaurus site loader #9138, I noticed some
outdated docs and tests for the Sitemap Loader.

**Issue:** 
This is tangentially related to #6691 in reference to doc links. I plan
on digging into a few of these issues when I next find time.
pull/11789/head
Lee 12 months ago committed by GitHub
parent 8bb8c56f74
commit e669f9d731
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because one or more lines are too long

@ -11,7 +11,7 @@ def test_sitemap() -> None:
loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml") loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml")
documents = loader.load() documents = loader.load()
assert len(documents) > 1 assert len(documents) > 1
assert "🦜🔗" in documents[0].page_content assert "LangChain Python API" in documents[0].page_content
def test_sitemap_block() -> None: def test_sitemap_block() -> None:
@ -21,7 +21,7 @@ def test_sitemap_block() -> None:
) )
documents = loader.load() documents = loader.load()
assert len(documents) == 1 assert len(documents) == 1
assert "🦜🔗" in documents[0].page_content assert "LangChain Python API" in documents[0].page_content
def test_sitemap_block_only_one() -> None: def test_sitemap_block_only_one() -> None:
@ -31,7 +31,7 @@ def test_sitemap_block_only_one() -> None:
) )
documents = loader.load() documents = loader.load()
assert len(documents) > 1 assert len(documents) > 1
assert "🦜🔗" in documents[0].page_content assert "LangChain Python API" in documents[0].page_content
def test_sitemap_block_blocknum_default() -> None: def test_sitemap_block_blocknum_default() -> None:
@ -41,7 +41,7 @@ def test_sitemap_block_blocknum_default() -> None:
) )
documents = loader.load() documents = loader.load()
assert len(documents) > 1 assert len(documents) > 1
assert "🦜🔗" in documents[0].page_content assert "LangChain Python API" in documents[0].page_content
def test_sitemap_block_size_to_small() -> None: def test_sitemap_block_size_to_small() -> None:
@ -76,11 +76,11 @@ def test_filter_sitemap() -> None:
"""Test sitemap loader.""" """Test sitemap loader."""
loader = SitemapLoader( loader = SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml", "https://langchain.readthedocs.io/sitemap.xml",
filter_urls=["https://python.langchain.com/en/stable/"], filter_urls=["https://api.python.langchain.com/en/stable/"],
) )
documents = loader.load() documents = loader.load()
assert len(documents) == 1 assert len(documents) == 1
assert "🦜🔗" in documents[0].page_content assert "LangChain Python API" in documents[0].page_content
def test_sitemap_metadata() -> None: def test_sitemap_metadata() -> None:
@ -128,7 +128,7 @@ def test_sitemap_metadata_default() -> None:
def test_local_sitemap() -> None: def test_local_sitemap() -> None:
"""Test sitemap loader.""" """Test sitemap loader."""
file_path = Path(__file__).parent.parent / "examples/sitemap.xml" file_path = Path(__file__).parent.parent / "examples/sitemap.xml"
loader = SitemapLoader(str(file_path)) loader = SitemapLoader(str(file_path), is_local=True)
documents = loader.load() documents = loader.load()
assert len(documents) > 1 assert len(documents) > 1
assert "🦜🔗" in documents[0].page_content assert "🦜🔗" in documents[0].page_content

@ -1,35 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xhtml="http://www.w3.org/1999/xhtml"> xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url> <url>
<loc>https://python.langchain.com/en/stable/</loc> <loc>https://python.langchain.com/en/stable/</loc>
<lastmod>2023-05-04T16:15:31.377584+00:00</lastmod> <lastmod>2023-05-04T16:15:31.377584+00:00</lastmod>
<changefreq>weekly</changefreq> <changefreq>weekly</changefreq>
<priority>1</priority> <priority>1</priority>
</url> </url>
<url> <url>
<loc>https://python.langchain.com/en/latest/</loc> <loc>https://python.langchain.com/en/latest/</loc>
<lastmod>2023-05-05T07:52:19.633878+00:00</lastmod> <lastmod>2023-05-05T07:52:19.633878+00:00</lastmod>
<changefreq>daily</changefreq> <changefreq>daily</changefreq>
<priority>0.9</priority> <priority>0.9</priority>
</url> </url>
<url> <url>
<loc>https://python.langchain.com/en/harrison-docs-refactor-3-24/</loc> <loc>https://python.langchain.com/en/harrison-docs-refactor-3-24/</loc>
<lastmod>2023-03-27T02:32:55.132916+00:00</lastmod> <lastmod>2023-03-27T02:32:55.132916+00:00</lastmod>
<changefreq>monthly</changefreq> <changefreq>monthly</changefreq>
<priority>0.8</priority> <priority>0.8</priority>
</url> </url>
</urlset> </urlset>
Loading…
Cancel
Save