From 4ab66c4f52cefdd8734f48c72804730541171c3d Mon Sep 17 00:00:00 2001 From: Sebastien Kerbrat Date: Tue, 28 Mar 2023 22:56:07 -0700 Subject: [PATCH] Strip sitemap entries (#2132) Loading this sitemap (https://www.alzallies.com/sitemap.xml) didn't work for me. Stripping surrounding whitespace from each `loc` entry before scraping fixed it, and it seems like a good idea to do in general. Integration tests pass. --- langchain/document_loaders/sitemap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain/document_loaders/sitemap.py b/langchain/document_loaders/sitemap.py index 4a2c1d2809..b376d0f1f3 100644 --- a/langchain/document_loaders/sitemap.py +++ b/langchain/document_loaders/sitemap.py @@ -58,7 +58,7 @@ class SitemapLoader(WebBaseLoader): els = self.parse_sitemap(soup) - results = self.scrape_all([el["loc"] for el in els if "loc" in el]) + results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el]) return [ Document(