mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Strip sitemap entries (#2132)
Loading this sitemap didn't work for me: https://www.alzallies.com/sitemap.xml. Changing this fixed it, and it seems like a good idea to do it in general. Integration tests pass.
This commit is contained in:
parent
27f80784d0
commit
4ab66c4f52
@@ -58,7 +58,7 @@ class SitemapLoader(WebBaseLoader):
|
||||
|
||||
els = self.parse_sitemap(soup)
|
||||
|
||||
results = self.scrape_all([el["loc"] for el in els if "loc" in el])
|
||||
results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el])
|
||||
|
||||
return [
|
||||
Document(
|
||||
|
Loading…
Reference in New Issue
Block a user