forked from Archives/langchain
Strips whitespace and \n from loc before filtering urls from sitemap (#5728)
Fixes #5699

#### Who can review?

Tag maintainers/contributors who might be interested: @woodworker @LeSphax @johannhartmann

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
98dd6d068a
commit
2dcda8a8ac
@@ -79,8 +79,11 @@ class SitemapLoader(WebBaseLoader):
 if not loc:
     continue
 
+# Strip leading and trailing whitespace and newlines
+loc_text = loc.text.strip()
+
 if self.filter_urls and not any(
-    re.match(r, loc.text) for r in self.filter_urls
+    re.match(r, loc_text) for r in self.filter_urls
 ):
     continue
 
Loading…
Reference in New Issue
Block a user