mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Support recursive sitemaps in SitemapLoader (#3146)
A (very) simple addition to support multiple sitemap URLs.

Co-authored-by: Johann-Peter Hartmann <johann-peter.hartmann@mayflower.de>
This commit is contained in:
parent
215dcc2d26
commit
7e79f8c136
@ -61,6 +61,13 @@ class SitemapLoader(WebBaseLoader):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
for sitemap in soup.find_all("sitemap"):
|
||||||
|
loc = sitemap.find("loc")
|
||||||
|
if not loc:
|
||||||
|
continue
|
||||||
|
soup_child = self.scrape_all([loc.text], "xml")[0]
|
||||||
|
|
||||||
|
els.extend(self.parse_sitemap(soup_child))
|
||||||
return els
|
return els
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
|
Loading…
Reference in New Issue
Block a user