diff --git a/libs/community/langchain_community/document_loaders/mediawikidump.py b/libs/community/langchain_community/document_loaders/mediawikidump.py index dd18997430..360420c536 100644 --- a/libs/community/langchain_community/document_loaders/mediawikidump.py +++ b/libs/community/langchain_community/document_loaders/mediawikidump.py @@ -1,6 +1,6 @@ import logging from pathlib import Path -from typing import List, Optional, Sequence, Union +from typing import Iterator, List, Optional, Sequence, Union from langchain_core.documents import Document @@ -60,37 +60,55 @@ class MWDumpLoader(BaseLoader): self.skip_redirects = skip_redirects self.stop_on_error = stop_on_error - def load(self) -> List[Document]: - """Load from a file path.""" + def _load_dump_file(self): try: - import mwparserfromhell import mwxml except ImportError as e: raise ImportError( - "Unable to import 'mwparserfromhell' or 'mwxml'. Please install with" - " `pip install mwparserfromhell mwxml`." + "Unable to import 'mwxml'. Please install with" " `pip install mwxml`." ) from e - dump = mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding)) + return mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding)) + + def _load_single_page_from_dump(self, page) -> Document: + """Parse a single page.""" + try: + import mwparserfromhell + except ImportError as e: + raise ImportError( + "Unable to import 'mwparserfromhell'. Please install with" + " `pip install mwparserfromhell`." + ) from e + for revision in page: + code = mwparserfromhell.parse(revision.text) + text = code.strip_code( + normalize=True, collapse=True, keep_template_params=False + ) + metadata = {"source": page.title} + return Document(page_content=text, metadata=metadata) + + def load(self) -> List[Document]: + """Load from a file path.""" + + return [doc for doc in self.lazy_load()] + + def lazy_load( + self, + ) -> Iterator[Document]: + """Lazy load from a file path.""" + + dump = self._load_dump_file() - docs = [] for page in dump.pages: if self.skip_redirects and page.redirect: continue if self.namespaces and page.namespace not in self.namespaces: continue try: - for revision in page: - code = mwparserfromhell.parse(revision.text) - text = code.strip_code( - normalize=True, collapse=True, keep_template_params=False - ) - metadata = {"source": page.title} - docs.append(Document(page_content=text, metadata=metadata)) + yield self._load_single_page_from_dump(page) except Exception as e: logger.error("Parsing error: {}".format(e)) if self.stop_on_error: raise e else: continue - return docs