diff --git a/langchain/document_loaders/sitemap.py b/langchain/document_loaders/sitemap.py
index b376d0f1..1bc583cd 100644
--- a/langchain/document_loaders/sitemap.py
+++ b/langchain/document_loaders/sitemap.py
@@ -1,21 +1,31 @@
 """Loader that fetches a sitemap and loads those URLs."""
 import re
-from typing import Any, List, Optional
+from typing import Any, Callable, List, Optional
 
 from langchain.document_loaders.web_base import WebBaseLoader
 from langchain.schema import Document
 
 
+def _default_parsing_function(content: Any) -> str:
+    return str(content.get_text())
+
+
 class SitemapLoader(WebBaseLoader):
     """Loader that fetches a sitemap and loads those URLs."""
 
-    def __init__(self, web_path: str, filter_urls: Optional[List[str]] = None):
+    def __init__(
+        self,
+        web_path: str,
+        filter_urls: Optional[List[str]] = None,
+        parsing_function: Optional[Callable] = None,
+    ):
         """Initialize with webpage path and optional filter URLs.
 
         Args:
             web_path: url of the sitemap
             filter_urls: list of strings or regexes that will be applied to filter the
-            urls that are parsed and loaded
+                urls that are parsed and loaded
+            parsing_function: Function to parse bs4.Soup output
         """
 
         try:
@@ -28,6 +38,7 @@ class SitemapLoader(WebBaseLoader):
         super().__init__(web_path)
 
         self.filter_urls = filter_urls
+        self.parsing_function = parsing_function or _default_parsing_function
 
     def parse_sitemap(self, soup: Any) -> List[dict]:
         """Parse sitemap xml and load into a list of dicts."""
@@ -62,7 +73,7 @@ class SitemapLoader(WebBaseLoader):
 
         return [
             Document(
-                page_content=str(results[i].get_text()),
+                page_content=self.parsing_function(results[i]),
                 metadata={**{"source": els[i]["loc"]}, **els[i]},
             )
             for i in range(len(results))