community: Lazy load Wikipedia dump file (#15111)

**Description:** The `MWDumpLoader` implementation currently does not
support the `lazy_load` method, even though the dump files it reads are
usually very large. This change refactors the `load` function by
extracting two private functions, one that loads the dump file and one
that parses a single page, so that the same code can be reused in the
new `lazy_load` implementation.
pull/15102/head^2
Christian Janiake 8 months ago committed by GitHub
parent 619cd3ce54
commit be578f32be
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,6 +1,6 @@
import logging import logging
from pathlib import Path from pathlib import Path
from typing import List, Optional, Sequence, Union from typing import Iterator, List, Optional, Sequence, Union
from langchain_core.documents import Document from langchain_core.documents import Document
@ -60,37 +60,55 @@ class MWDumpLoader(BaseLoader):
self.skip_redirects = skip_redirects self.skip_redirects = skip_redirects
self.stop_on_error = stop_on_error self.stop_on_error = stop_on_error
def load(self) -> List[Document]: def _load_dump_file(self):
"""Load from a file path."""
try: try:
import mwparserfromhell
import mwxml import mwxml
except ImportError as e: except ImportError as e:
raise ImportError( raise ImportError(
"Unable to import 'mwparserfromhell' or 'mwxml'. Please install with" "Unable to import 'mwxml'. Please install with" " `pip install mwxml`."
" `pip install mwparserfromhell mwxml`."
) from e ) from e
dump = mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding)) return mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
docs = [] def _load_single_page_from_dump(self, page) -> Document:
for page in dump.pages: """Parse a single page."""
if self.skip_redirects and page.redirect:
continue
if self.namespaces and page.namespace not in self.namespaces:
continue
try: try:
import mwparserfromhell
except ImportError as e:
raise ImportError(
"Unable to import 'mwparserfromhell'. Please install with"
" `pip install mwparserfromhell`."
) from e
for revision in page: for revision in page:
code = mwparserfromhell.parse(revision.text) code = mwparserfromhell.parse(revision.text)
text = code.strip_code( text = code.strip_code(
normalize=True, collapse=True, keep_template_params=False normalize=True, collapse=True, keep_template_params=False
) )
metadata = {"source": page.title} metadata = {"source": page.title}
docs.append(Document(page_content=text, metadata=metadata)) return Document(page_content=text, metadata=metadata)
def load(self) -> List[Document]:
"""Load from a file path."""
return [doc for doc in self.lazy_load()]
def lazy_load(
self,
) -> Iterator[Document]:
"""Lazy load from a file path."""
dump = self._load_dump_file()
for page in dump.pages:
if self.skip_redirects and page.redirect:
continue
if self.namespaces and page.namespace not in self.namespaces:
continue
try:
yield self._load_single_page_from_dump(page)
except Exception as e: except Exception as e:
logger.error("Parsing error: {}".format(e)) logger.error("Parsing error: {}".format(e))
if self.stop_on_error: if self.stop_on_error:
raise e raise e
else: else:
continue continue
return docs

Loading…
Cancel
Save