From 7e79f8c136452cbde767ea9a80e4a4caed23fd16 Mon Sep 17 00:00:00 2001 From: Johann-Peter Hartmann Date: Sat, 22 Apr 2023 17:48:04 +0200 Subject: [PATCH] Support recursive sitemaps in SitemapLoader (#3146) A (very) simple addition to support multiple sitemap URLs: the <loc> of every <sitemap> entry is fetched and parsed recursively. --------- Co-authored-by: Johann-Peter Hartmann --- langchain/document_loaders/sitemap.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/langchain/document_loaders/sitemap.py b/langchain/document_loaders/sitemap.py index 1bc583cd..3a417dd0 100644 --- a/langchain/document_loaders/sitemap.py +++ b/langchain/document_loaders/sitemap.py @@ -61,6 +61,13 @@ class SitemapLoader(WebBaseLoader): } ) + for sitemap in soup.find_all("sitemap"): + loc = sitemap.find("loc") + if not loc: + continue + soup_child = self.scrape_all([loc.text], "xml")[0] + + els.extend(self.parse_sitemap(soup_child)) return els def load(self) -> List[Document]: