mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Strips whitespace and \n from loc before filtering urls from sitemap (#5728)
Fixes #5699

#### Who can review?

Tag maintainers/contributors who might be interested: @woodworker @LeSphax @johannhartmann

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
98dd6d068a
commit
2dcda8a8ac
@@ -79,8 +79,11 @@ class SitemapLoader(WebBaseLoader):
|
||||
if not loc:
|
||||
continue
|
||||
|
||||
# Strip leading and trailing whitespace and newlines
|
||||
loc_text = loc.text.strip()
|
||||
|
||||
if self.filter_urls and not any(
|
||||
re.match(r, loc.text) for r in self.filter_urls
|
||||
re.match(r, loc_text) for r in self.filter_urls
|
||||
):
|
||||
continue
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user