Fix: Sitemap Document Loader Tests and Documentation (#11866)

**Description:**
While working on the Docusaurus site loader #9138, I noticed some
outdated docs and tests for the Sitemap Loader.

**Issue:** 
This is tangentially related to #6691 in reference to doc links. I plan
on digging into a few of these issues when I find time next.
pull/11789/head
Lee 9 months ago committed by GitHub
parent 8bb8c56f74
commit e669f9d731
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because one or more lines are too long

@ -11,7 +11,7 @@ def test_sitemap() -> None:
loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml")
documents = loader.load()
assert len(documents) > 1
assert "🦜🔗" in documents[0].page_content
assert "LangChain Python API" in documents[0].page_content
def test_sitemap_block() -> None:
@ -21,7 +21,7 @@ def test_sitemap_block() -> None:
)
documents = loader.load()
assert len(documents) == 1
assert "🦜🔗" in documents[0].page_content
assert "LangChain Python API" in documents[0].page_content
def test_sitemap_block_only_one() -> None:
@ -31,7 +31,7 @@ def test_sitemap_block_only_one() -> None:
)
documents = loader.load()
assert len(documents) > 1
assert "🦜🔗" in documents[0].page_content
assert "LangChain Python API" in documents[0].page_content
def test_sitemap_block_blocknum_default() -> None:
@ -41,7 +41,7 @@ def test_sitemap_block_blocknum_default() -> None:
)
documents = loader.load()
assert len(documents) > 1
assert "🦜🔗" in documents[0].page_content
assert "LangChain Python API" in documents[0].page_content
def test_sitemap_block_size_to_small() -> None:
@ -76,11 +76,11 @@ def test_filter_sitemap() -> None:
"""Test sitemap loader."""
loader = SitemapLoader(
"https://langchain.readthedocs.io/sitemap.xml",
filter_urls=["https://python.langchain.com/en/stable/"],
filter_urls=["https://api.python.langchain.com/en/stable/"],
)
documents = loader.load()
assert len(documents) == 1
assert "🦜🔗" in documents[0].page_content
assert "LangChain Python API" in documents[0].page_content
def test_sitemap_metadata() -> None:
@ -128,7 +128,7 @@ def test_sitemap_metadata_default() -> None:
def test_local_sitemap() -> None:
"""Test sitemap loader."""
file_path = Path(__file__).parent.parent / "examples/sitemap.xml"
loader = SitemapLoader(str(file_path))
loader = SitemapLoader(str(file_path), is_local=True)
documents = loader.load()
assert len(documents) > 1
assert "🦜🔗" in documents[0].page_content
assert "🦜🔗" in documents[0].page_content

@ -1,35 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xhtml="http://www.w3.org/1999/xhtml">
xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>https://python.langchain.com/en/stable/</loc>
<lastmod>2023-05-04T16:15:31.377584+00:00</lastmod>
<changefreq>weekly</changefreq>
<priority>1</priority>
</url>
<url>
<loc>https://python.langchain.com/en/latest/</loc>
<lastmod>2023-05-05T07:52:19.633878+00:00</lastmod>
<changefreq>daily</changefreq>
<priority>0.9</priority>
</url>
<url>
<loc>https://python.langchain.com/en/harrison-docs-refactor-3-24/</loc>
<lastmod>2023-03-27T02:32:55.132916+00:00</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
</urlset>
</urlset>
Loading…
Cancel
Save