Strip leading/trailing whitespace (including newlines) from `loc` text before filtering sitemap URLs (#5728)

Fixes #5699 



#### Who can review?

Tag maintainers/contributors who might be interested:

@woodworker @LeSphax @johannhartmann

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
Shelby Jenkins 2023-06-05 18:33:55 -05:00 committed by GitHub
parent 98dd6d068a
commit 2dcda8a8ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -79,8 +79,11 @@ class SitemapLoader(WebBaseLoader):
 if not loc:
     continue
+# Strip leading and trailing whitespace and newlines
+loc_text = loc.text.strip()
 if self.filter_urls and not any(
-    re.match(r, loc.text) for r in self.filter_urls
+    re.match(r, loc_text) for r in self.filter_urls
 ):
     continue