@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import List, Optional , Sequence , Union
from typing import Iterator, List, Optional , Sequence , Union
from langchain_core . documents import Document
@ -60,37 +60,55 @@ class MWDumpLoader(BaseLoader):
self . skip_redirects = skip_redirects
self . stop_on_error = stop_on_error
def load ( self ) - > List [ Document ] :
""" Load from a file path. """
def _load_dump_file ( self ) :
try :
import mwparserfromhell
import mwxml
except ImportError as e :
raise ImportError (
" Unable to import ' mwparserfromhell ' or ' mwxml ' . Please install with "
" `pip install mwparserfromhell mwxml`. "
" Unable to import ' mwxml ' . Please install with " " `pip install mwxml`. "
) from e
dump = mwxml . Dump . from_file ( open ( self . file_path , encoding = self . encoding ) )
return mwxml . Dump . from_file ( open ( self . file_path , encoding = self . encoding ) )
def _load_single_page_from_dump ( self , page ) - > Document :
""" Parse a single page. """
try :
import mwparserfromhell
except ImportError as e :
raise ImportError (
" Unable to import ' mwparserfromhell ' . Please install with "
" `pip install mwparserfromhell`. "
) from e
for revision in page :
code = mwparserfromhell . parse ( revision . text )
text = code . strip_code (
normalize = True , collapse = True , keep_template_params = False
)
metadata = { " source " : page . title }
return Document ( page_content = text , metadata = metadata )
def load ( self ) - > List [ Document ] :
""" Load from a file path. """
return [ doc for doc in self . lazy_load ( ) ]
def lazy_load (
self ,
) - > Iterator [ Document ] :
""" Lazy load from a file path. """
dump = self . _load_dump_file ( )
docs = [ ]
for page in dump . pages :
if self . skip_redirects and page . redirect :
continue
if self . namespaces and page . namespace not in self . namespaces :
continue
try :
for revision in page :
code = mwparserfromhell . parse ( revision . text )
text = code . strip_code (
normalize = True , collapse = True , keep_template_params = False
)
metadata = { " source " : page . title }
docs . append ( Document ( page_content = text , metadata = metadata ) )
yield self . _load_single_page_from_dump ( page )
except Exception as e :
logger . error ( " Parsing error: {} " . format ( e ) )
if self . stop_on_error :
raise e
else :
continue
return docs