feat: Docusaurus Loader (#9138)

Added a Docusaurus Loader

Issue: #6353

I had to implement this while working with the Ionic documentation, and I
wanted to open this up as a draft to get some guidance on building it out
further. I wasn't sure whether a light extension of the SitemapLoader is in
the spirit of a proper feature for the library, but I'm grateful for the
opportunities LangChain has given me and I'd love to build this out properly
for the sake of the community.

Any feedback welcome!
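
For context, here's roughly how I've been using it; the URL below is just a placeholder for any Docusaurus site that publishes a generated sitemap.xml:

```python
from langchain.document_loaders import DocusaurusLoader

# Point the loader at the site root; it appends /sitemap.xml, crawls the pages
# listed there, and keeps the main <article> content of each page by default.
# The URL is a placeholder -- substitute your own Docusaurus site.
loader = DocusaurusLoader("https://my-docusaurus-site.example")
docs = loader.load()
print(len(docs), docs[0].metadata)
```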
Lee committed on 2023-11-10 17:21:55 -05:00 via GitHub (commit 72ad448daa, parent 8fa960641a)
5 changed files with 379 additions and 0 deletions

File diff suppressed because one or more lines are too long


@@ -67,6 +67,7 @@ from langchain.document_loaders.diffbot import DiffbotLoader
from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.discord import DiscordChatLoader
from langchain.document_loaders.docugami import DocugamiLoader
from langchain.document_loaders.docusaurus import DocusaurusLoader
from langchain.document_loaders.dropbox import DropboxLoader
from langchain.document_loaders.duckdb_loader import DuckDBLoader
from langchain.document_loaders.email import (
@@ -250,6 +251,7 @@ __all__ = [
    "DirectoryLoader",
    "DiscordChatLoader",
    "DocugamiLoader",
    "DocusaurusLoader",
    "Docx2txtLoader",
    "DropboxLoader",
    "DuckDBLoader",


@@ -0,0 +1,49 @@
"""Load Documents from Docusaurus Documentation."""
from typing import Any, List, Optional

from langchain.document_loaders.sitemap import SitemapLoader


class DocusaurusLoader(SitemapLoader):
    """
    Loader that leverages the SitemapLoader to loop through the generated pages of a
    Docusaurus documentation website and extract the content by looking for specific
    HTML tags. By default, the parser searches for the main content of the Docusaurus
    page, which is normally the <article> element. You also have the option to define
    your own custom HTML tags by providing them as a list, for example:
    ["div", ".main", "a"].
    """
    def __init__(
        self,
        url: str,
        custom_html_tags: Optional[List[str]] = None,
        **kwargs: Any,
    ):
        """
        Initialize DocusaurusLoader.

        Args:
            url: The base URL of the Docusaurus website.
            custom_html_tags: Optional custom HTML tags to extract content from pages.
            kwargs: Additional args to extend the underlying SitemapLoader, for
                example: filter_urls, blocksize, meta_function, is_local,
                continue_on_failure.
        """
        if not kwargs.get("is_local"):
            url = f"{url}/sitemap.xml"
        self.custom_html_tags = custom_html_tags or ["main article"]
        # Pop any caller-supplied parsing_function so it is not passed to
        # SitemapLoader twice (explicitly here and again via **kwargs).
        parsing_function = (
            kwargs.pop("parsing_function", None) or self._parsing_function
        )
        super().__init__(
            url,
            parsing_function=parsing_function,
            **kwargs,
        )

    def _parsing_function(self, content: Any) -> str:
        """Parse the relevant elements out of a Docusaurus page."""
        relevant_elements = content.select(",".join(self.custom_html_tags))
        # Keep only the text of the selected elements (by default the page's
        # main <article>) rather than the text of the entire page.
        return " ".join(element.get_text() for element in relevant_elements)
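
To illustrate the custom_html_tags option described in the class docstring, here's a sketch that combines the docstring's example selectors with a couple of the underlying SitemapLoader kwargs; the URL and filter pattern are placeholders only:

```python
from langchain.document_loaders import DocusaurusLoader

# Extract content from custom selectors instead of the default "main article",
# and only crawl sitemap URLs matching the given pattern. The site URL and
# filter pattern below are placeholders.
loader = DocusaurusLoader(
    "https://my-docusaurus-site.example",
    custom_html_tags=["div", ".main", "a"],  # example selectors from the docstring
    filter_urls=["https://my-docusaurus-site.example/docs/"],
)
documents = loader.load()
```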


@@ -0,0 +1,43 @@
from pathlib import Path

from langchain.document_loaders import DocusaurusLoader

DOCS_URL = str(Path(__file__).parent.parent / "examples/docusaurus-sitemap.xml")


def test_docusaurus() -> None:
    """Test the DocusaurusLoader against the local example sitemap."""
    loader = DocusaurusLoader(DOCS_URL, is_local=True)
    documents = loader.load()
    assert len(documents) > 1
    assert "🦜️🔗 Langchain" in documents[0].page_content


def test_filter_docusaurus_sitemap() -> None:
    """Test the DocusaurusLoader with URL filtering."""
    loader = DocusaurusLoader(
        DOCS_URL,
        is_local=True,
        filter_urls=[
            "https://python.langchain.com/docs/integrations/document_loaders/sitemap"
        ],
    )
    documents = loader.load()
    assert len(documents) == 1
    assert "SitemapLoader" in documents[0].page_content


def test_docusaurus_metadata() -> None:
    """Test the DocusaurusLoader with a custom meta_function."""

    def sitemap_metadata_one(meta: dict, _content: None) -> dict:
        return {**meta, "mykey": "Super Important Metadata"}

    loader = DocusaurusLoader(
        DOCS_URL,
        is_local=True,
        meta_function=sitemap_metadata_one,
    )
    documents = loader.load()
    assert len(documents) > 1
    assert "mykey" in documents[0].metadata
    assert "Super Important Metadata" in documents[0].metadata["mykey"]


@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
xmlns:xhtml="http://www.w3.org/1999/xhtml"
xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"
xmlns:video="http://www.google.com/schemas/sitemap-video/1.1">
<url>
<loc>https://python.langchain.com/docs/integrations/document_loaders/sitemap</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
<url>
<loc>https://python.langchain.com/cookbook</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
<url>
<loc>https://python.langchain.com/docs/additional_resources</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
<url>
<loc>https://python.langchain.com/docs/modules/chains/how_to/</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
<url>
<loc>https://python.langchain.com/docs/use_cases/question_answering/local_retrieval_qa</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
<url>
<loc>https://python.langchain.com/docs/use_cases/summarization</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
<url>
<loc>https://python.langchain.com/</loc>
<changefreq>weekly</changefreq>
<priority>0.5</priority>
</url>
</urlset>