@ -11,6 +11,10 @@ def _default_parsing_function(content: Any) -> str:
return str ( content . get_text ( ) )
def _default_meta_function ( meta : dict , _content : Any ) - > dict :
return { " source " : meta [ " loc " ] , * * meta }
def _batch_block ( iterable : Iterable , size : int ) - > Generator [ List [ dict ] , None , None ] :
it = iter ( iterable )
while item := list ( itertools . islice ( it , size ) ) :
@ -27,6 +31,7 @@ class SitemapLoader(WebBaseLoader):
parsing_function : Optional [ Callable ] = None ,
blocksize : Optional [ int ] = None ,
blocknum : int = 0 ,
meta_function : Optional [ Callable ] = None ,
) :
""" Initialize with webpage path and optional filter URLs.
@ -37,6 +42,9 @@ class SitemapLoader(WebBaseLoader):
parsing_function : Function to parse bs4 . Soup output
blocksize : number of sitemap locations per block
blocknum : the number of the block that should be loaded - zero indexed
meta_function: Function that builds a document's metadata from the sitemap entry and the bs4.Soup output.
    When you supply your own function, remember to also copy metadata["loc"]
    to metadata["source"] if you rely on the "source" field.
"""
if blocksize is not None and blocksize < 1 :
@ -56,6 +64,7 @@ class SitemapLoader(WebBaseLoader):
self . filter_urls = filter_urls
self . parsing_function = parsing_function or _default_parsing_function
self . meta_function = meta_function or _default_meta_function
self . blocksize = blocksize
self . blocknum = blocknum
@ -110,7 +119,7 @@ class SitemapLoader(WebBaseLoader):
return [
Document (
page_content = self . parsing_function ( results [ i ] ) ,
metadata = { * * { " source " : els [ i ] [ " loc " ] } , * * els [ i ] } ,
metadata = self . meta_function ( els [ i ] , results [ i ] ) ,
)
for i in range ( len ( results ) )
]