community: Lazy load Wikipedia dump file (#15111)

**Description:** The `MWDumpLoader` implementation currently does not support the `lazy_load` method, even though MediaWiki dump files are usually very large. This change refactors the `load` function by extracting two private methods, one that opens the dump file and one that parses a single page, so that the same code can be reused by the new `lazy_load` implementation.
8 months ago · be578f32be
parent 619cd3ce54
commit be578f32be
1 changed file with 34 additions and 16 deletions
--- a/libs/community/langchain_community/document_loaders/mediawikidump.py
+++ b/libs/community/langchain_community/document_loaders/mediawikidump.py
@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import List, Optional, Sequence, Union
+from typing import Iterator, List, Optional, Sequence, Union
 from langchain_core.documents import Document
@ -60,37 +60,55 @@ class MWDumpLoader(BaseLoader):
        self.skip_redirects = skip_redirects
        self.stop_on_error = stop_on_error
-    def load(self) -> List[Document]:
+    def _load_dump_file(self):
        """Load from a file path."""
        try:
            import mwparserfromhell
            import mwxml
        except ImportError as e:
            raise ImportError(
-                "Unable to import 'mwparserfromhell' or 'mwxml'. Please install with"
+                "Unable to import 'mwxml'. Please install with" " `pip install mwxml`."
                " `pip install mwparserfromhell mwxml`."
            ) from e
-        dump = mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
+        return mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
-        docs = []
+    def _load_single_page_from_dump(self, page) -> Document:
-        for page in dump.pages:
+        """Parse a single page."""
            if self.skip_redirects and page.redirect:
                continue
            if self.namespaces and page.namespace not in self.namespaces:
                continue
        try:
            import mwparserfromhell
        except ImportError as e:
            raise ImportError(
                "Unable to import 'mwparserfromhell'. Please install with"
                " `pip install mwparserfromhell`."
            ) from e
        for revision in page:
            code = mwparserfromhell.parse(revision.text)
            text = code.strip_code(
                normalize=True, collapse=True, keep_template_params=False
            )
            metadata = {"source": page.title}
-                    docs.append(Document(page_content=text, metadata=metadata))
+            return Document(page_content=text, metadata=metadata)
    def load(self) -> List[Document]:
        """Eagerly load every document from the dump.

        Returns:
            A list holding, in order, everything ``self.lazy_load()`` yields.
        """
        return list(self.lazy_load())
    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily yield the parsed document for each accepted dump page.

        The dump is obtained from ``self._load_dump_file()`` and its
        ``pages`` are consumed one at a time. A page is skipped when
        ``self.skip_redirects`` is truthy and the page is a redirect, or
        when ``self.namespaces`` is non-empty and the page's namespace is
        not in it.

        Yields:
            The value returned by ``self._load_single_page_from_dump(page)``
            for every page that passes the filters and parses without error.

        Raises:
            Exception: The error raised while parsing a page, re-raised
                unchanged when ``self.stop_on_error`` is truthy. Otherwise
                the error is logged and the page is skipped.
        """
        dump = self._load_dump_file()

        for page in dump.pages:
            if self.skip_redirects and page.redirect:
                continue
            if self.namespaces and page.namespace not in self.namespaces:
                continue

            # Keep only the parse call inside the ``try`` so that exceptions
            # thrown into the generator at the ``yield`` are not mistaken for
            # (and logged as) parsing errors.
            try:
                doc = self._load_single_page_from_dump(page)
            except Exception as e:
                logger.error("Parsing error: %s", e)
                if self.stop_on_error:
                    # Bare ``raise`` preserves the original traceback.
                    raise
                continue

            yield doc
        # The generator simply ends here. A trailing ``return docs`` would
        # reference a name that is never bound in this method.