community: Lazy load Wikipedia dump file (#15111)

**Description:** The `MWDumpLoader` implementation currently does not support the `lazy_load` method, even though the dump files it reads are usually very large. This change refactors the `load` function by extracting two private functions, one that loads the dump file and one that parses a single page, so that the same code can be reused in the `lazy_load` implementation.
6 months ago · be578f32be
parent 619cd3ce54
commit be578f32be
1 changed files with 34 additions and 16 deletions
--- a/libs/community/langchain_community/document_loaders/mediawikidump.py
+++ b/libs/community/langchain_community/document_loaders/mediawikidump.py
@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import List, Optional, Sequence, Union
+from typing import Iterator, List, Optional, Sequence, Union

 from langchain_core.documents import Document

@ -60,37 +60,55 @@ class MWDumpLoader(BaseLoader):
        self.skip_redirects = skip_redirects
        self.stop_on_error = stop_on_error

-    def load(self) -> List[Document]:
-        """Load from a file path."""
+    def _load_dump_file(self):
        try:
-            import mwparserfromhell
            import mwxml
        except ImportError as e:
            raise ImportError(
-                "Unable to import 'mwparserfromhell' or 'mwxml'. Please install with"
-                " `pip install mwparserfromhell mwxml`."
+                "Unable to import 'mwxml'. Please install with" " `pip install mwxml`."
            ) from e

-        dump = mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
+        return mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
+
+    def _load_single_page_from_dump(self, page) -> Document:
+        """Parse a single page."""
+        try:
+            import mwparserfromhell
+        except ImportError as e:
+            raise ImportError(
+                "Unable to import 'mwparserfromhell'. Please install with"
+                " `pip install mwparserfromhell`."
+            ) from e
+        for revision in page:
+            code = mwparserfromhell.parse(revision.text)
+            text = code.strip_code(
+                normalize=True, collapse=True, keep_template_params=False
+            )
+            metadata = {"source": page.title}
+            return Document(page_content=text, metadata=metadata)
+
+    def load(self) -> List[Document]:
+        """Load from a file path."""
+
+        return [doc for doc in self.lazy_load()]
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazy load from a file path."""
+
+        dump = self._load_dump_file()

-        docs = []
        for page in dump.pages:
            if self.skip_redirects and page.redirect:
                continue
            if self.namespaces and page.namespace not in self.namespaces:
                continue
            try:
-                for revision in page:
-                    code = mwparserfromhell.parse(revision.text)
-                    text = code.strip_code(
-                        normalize=True, collapse=True, keep_template_params=False
-                    )
-                    metadata = {"source": page.title}
-                    docs.append(Document(page_content=text, metadata=metadata))
+                yield self._load_single_page_from_dump(page)
            except Exception as e:
                logger.error("Parsing error: {}".format(e))
                if self.stop_on_error:
                    raise e
                else:
                    continue
-        return docs