mirror of
https://github.com/hwchase17/langchain
synced 2024-11-16 06:13:16 +00:00
feat: Docusaurus Loader (#9138)
Added a Docusaurus Loader. Issue: #6353. I had to implement this for working with the Ionic documentation, and wanted to open it up as a draft to get some guidance on building it out further. I wasn't sure whether a light extension of the SitemapLoader was in the spirit of a proper feature for the library — but I'm grateful for the opportunities LangChain has given me, and I'd love to build this out properly for the sake of the community. Any feedback is welcome!
This commit is contained in:
parent
8fa960641a
commit
72ad448daa
243
docs/docs/integrations/document_loaders/docusaurus.ipynb
Normal file
243
docs/docs/integrations/document_loaders/docusaurus.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -67,6 +67,7 @@ from langchain.document_loaders.diffbot import DiffbotLoader
|
||||
from langchain.document_loaders.directory import DirectoryLoader
|
||||
from langchain.document_loaders.discord import DiscordChatLoader
|
||||
from langchain.document_loaders.docugami import DocugamiLoader
|
||||
from langchain.document_loaders.docusaurus import DocusaurusLoader
|
||||
from langchain.document_loaders.dropbox import DropboxLoader
|
||||
from langchain.document_loaders.duckdb_loader import DuckDBLoader
|
||||
from langchain.document_loaders.email import (
|
||||
@ -250,6 +251,7 @@ __all__ = [
|
||||
"DirectoryLoader",
|
||||
"DiscordChatLoader",
|
||||
"DocugamiLoader",
|
||||
"DocusaurusLoader",
|
||||
"Docx2txtLoader",
|
||||
"DropboxLoader",
|
||||
"DuckDBLoader",
|
||||
|
49
libs/langchain/langchain/document_loaders/docusaurus.py
Normal file
49
libs/langchain/langchain/document_loaders/docusaurus.py
Normal file
@ -0,0 +1,49 @@
|
||||
"""Load Documents from Docusarus Documentation"""
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from langchain.document_loaders.sitemap import SitemapLoader
|
||||
|
||||
|
||||
class DocusaurusLoader(SitemapLoader):
    """Load documents from a Docusaurus documentation website.

    Extends ``SitemapLoader``: it walks the pages listed in the site's
    ``sitemap.xml`` and extracts content from specific HTML tags. By default
    the parser keeps the main content of a Docusaurus page, which is normally
    the ``<article>`` inside ``<main>``. You can also define your own custom
    HTML tags by providing them as a list, for example:
    ``["div", ".main", "a"]``.
    """

    def __init__(
        self,
        url: str,
        custom_html_tags: Optional[List[str]] = None,
        **kwargs: Any,
    ):
        """Initialize DocusaurusLoader.

        Args:
            url: The base URL of the Docusaurus website.
            custom_html_tags: Optional custom html tags to extract content from pages.
            kwargs: Additional args to extend the underlying SitemapLoader, for
                example: filter_urls, blocksize, meta_function, is_local,
                continue_on_failure, or parsing_function to override the
                default Docusaurus parsing.
        """
        # A remote Docusaurus site publishes its page index at /sitemap.xml;
        # a local load is expected to point directly at the sitemap file.
        if not kwargs.get("is_local"):
            url = f"{url}/sitemap.xml"

        self.custom_html_tags = custom_html_tags or ["main article"]

        # Pop parsing_function out of kwargs: passing it both explicitly and
        # via **kwargs would raise "got multiple values for keyword argument".
        parsing_function = kwargs.pop("parsing_function", None) or self._parsing_function

        super().__init__(
            url,
            parsing_function=parsing_function,
            **kwargs,
        )

    def _parsing_function(self, content: Any) -> str:
        """Extract text for the configured elements of a Docusaurus page.

        Args:
            content: The parsed page (a BeautifulSoup document) as handed in
                by the SitemapLoader parsing hook.

        Returns:
            The concatenated text of every element matching
            ``self.custom_html_tags``, or the whole page's text when no
            selector matches (best-effort fallback rather than empty output).
        """
        relevant_elements = content.select(",".join(self.custom_html_tags))

        # The previous implementation compared each selected element against
        # the very result set it was iterating ("element not in
        # relevant_elements" inside "for element in relevant_elements"), so
        # nothing was ever decomposed and custom_html_tags had no effect.
        # Instead, keep only the text of the selected elements.
        if relevant_elements:
            return "".join(str(element.get_text()) for element in relevant_elements)

        # No selector matched: fall back to the full page text.
        return str(content.get_text())
|
@ -0,0 +1,43 @@
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import DocusaurusLoader
|
||||
|
||||
# Path to the local sitemap fixture consumed by every test in this module
# (loaded with is_local=True, so no network access is needed).
DOCS_URL = str(Path(__file__).parent.parent / "examples/docusaurus-sitemap.xml")
||||
def test_docusarus() -> None:
    """Test that the Docusaurus loader reads every page of a local sitemap."""
    loader = DocusaurusLoader(DOCS_URL, is_local=True)
    documents = loader.load()
    assert len(documents) > 1
    assert "🦜️🔗 Langchain" in documents[0].page_content
|
||||
|
||||
|
||||
def test_filter_docusaurus_sitemap() -> None:
    """Test that filter_urls restricts the Docusaurus loader to matching pages."""
    loader = DocusaurusLoader(
        DOCS_URL,
        is_local=True,
        filter_urls=[
            "https://python.langchain.com/docs/integrations/document_loaders/sitemap"
        ],
    )
    documents = loader.load()
    assert len(documents) == 1
    assert "SitemapLoader" in documents[0].page_content
|
||||
|
||||
|
||||
def test_docusarus_metadata() -> None:
    """Test that a custom meta_function is applied to loaded documents.

    The original version placed the triple-quoted string after the nested
    helper, where it is a discarded expression rather than a docstring (and
    its text described the wrong test); it now sits at the top of the
    function as a real, accurate docstring.
    """

    def sitemap_metadata_one(meta: dict, _content: None) -> dict:
        # Augment the default sitemap metadata with an extra key.
        return {**meta, "mykey": "Super Important Metadata"}

    loader = DocusaurusLoader(
        DOCS_URL,
        is_local=True,
        meta_function=sitemap_metadata_one,
    )
    documents = loader.load()
    assert len(documents) > 1
    assert "mykey" in documents[0].metadata
    assert "Super Important Metadata" in documents[0].metadata["mykey"]
|
@ -0,0 +1,42 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Test fixture: a minimal Docusaurus-style sitemap listing seven
     python.langchain.com URLs, consumed by the DocusaurusLoader tests
     (loaded with is_local=True). -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
      xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
      xmlns:xhtml="http://www.w3.org/1999/xhtml"
      xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"
      xmlns:video="http://www.google.com/schemas/sitemap-video/1.1">
  <url>
    <loc>https://python.langchain.com/docs/integrations/document_loaders/sitemap</loc>
    <changefreq>weekly</changefreq>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://python.langchain.com/cookbook</loc>
    <changefreq>weekly</changefreq>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://python.langchain.com/docs/additional_resources</loc>
    <changefreq>weekly</changefreq>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://python.langchain.com/docs/modules/chains/how_to/</loc>
    <changefreq>weekly</changefreq>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://python.langchain.com/docs/use_cases/question_answering/local_retrieval_qa</loc>
    <changefreq>weekly</changefreq>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://python.langchain.com/docs/use_cases/summarization</loc>
    <changefreq>weekly</changefreq>
    <priority>0.5</priority>
  </url>
  <url>
    <loc>https://python.langchain.com/</loc>
    <changefreq>weekly</changefreq>
    <priority>0.5</priority>
  </url>
</urlset>
|
Loading…
Reference in New Issue
Block a user