2023-11-10 22:21:55 +00:00
|
|
|
from pathlib import Path
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
from langchain_community.document_loaders import DocusaurusLoader
|
2023-11-10 22:21:55 +00:00
|
|
|
|
|
|
|
DOCS_URL = str(Path(__file__).parent.parent / "examples/docusaurus-sitemap.xml")
|
|
|
|
|
|
|
|
|
|
|
|
def test_docusarus() -> None:
|
|
|
|
"""Test sitemap loader."""
|
|
|
|
loader = DocusaurusLoader(DOCS_URL, is_local=True)
|
|
|
|
documents = loader.load()
|
|
|
|
assert len(documents) > 1
|
|
|
|
assert "🦜️🔗 Langchain" in documents[0].page_content
|
|
|
|
|
|
|
|
|
|
|
|
def test_filter_docusaurus_sitemap() -> None:
|
|
|
|
"""Test sitemap loader."""
|
|
|
|
loader = DocusaurusLoader(
|
|
|
|
DOCS_URL,
|
|
|
|
is_local=True,
|
|
|
|
filter_urls=[
|
|
|
|
"https://python.langchain.com/docs/integrations/document_loaders/sitemap"
|
|
|
|
],
|
|
|
|
)
|
|
|
|
documents = loader.load()
|
|
|
|
assert len(documents) == 1
|
|
|
|
assert "SitemapLoader" in documents[0].page_content
|
|
|
|
|
|
|
|
|
|
|
|
def test_docusarus_metadata() -> None:
|
|
|
|
def sitemap_metadata_one(meta: dict, _content: None) -> dict:
|
|
|
|
return {**meta, "mykey": "Super Important Metadata"}
|
|
|
|
|
|
|
|
"""Test sitemap loader."""
|
|
|
|
loader = DocusaurusLoader(
|
|
|
|
DOCS_URL,
|
|
|
|
is_local=True,
|
|
|
|
meta_function=sitemap_metadata_one,
|
|
|
|
)
|
|
|
|
documents = loader.load()
|
|
|
|
assert len(documents) > 1
|
|
|
|
assert "mykey" in documents[0].metadata
|
|
|
|
assert "Super Important Metadata" in documents[0].metadata["mykey"]
|