mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Support recursive sitemaps in SitemapLoader (#3146)
A (very) simple addition to support multiple sitemap URLs.

---------

Co-authored-by: Johann-Peter Hartmann <johann-peter.hartmann@mayflower.de>
This commit is contained in:
parent
215dcc2d26
commit
7e79f8c136
@@ -61,6 +61,13 @@ class SitemapLoader(WebBaseLoader):
|
||||
}
|
||||
)
|
||||
|
||||
for sitemap in soup.find_all("sitemap"):
|
||||
loc = sitemap.find("loc")
|
||||
if not loc:
|
||||
continue
|
||||
soup_child = self.scrape_all([loc.text], "xml")[0]
|
||||
|
||||
els.extend(self.parse_sitemap(soup_child))
|
||||
return els
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
|
Loading…
Reference in New Issue
Block a user