langchain/langchain/document_loaders/mediawikidump.py
Harrison Chase 5a269d3175
Harrison/media wiki xml (#4072)
Co-authored-by: Géraud de Drouas <gdedrouas@users.noreply.github.com>
2023-05-03 20:45:33 -07:00

58 lines
1.7 KiB
Python

"""Load Data from a MediaWiki dump xml."""
from typing import List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class MWDumpLoader(BaseLoader):
"""
Load MediaWiki dump from XML file
Example:
.. code-block:: python
from langchain.document_loaders import MWDumpLoader
loader = MWDumpLoader(
file_path="myWiki.xml",
encoding="utf8"
)
docs = loader.load()
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000, chunk_overlap=0
)
texts = text_splitter.split_documents(docs)
:param file_path: XML local file path
:type file_path: str
:param encoding: Charset encoding, defaults to "utf8"
:type encoding: str, optional
"""
def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
"""Initialize with file path."""
self.file_path = file_path
self.encoding = encoding
def load(self) -> List[Document]:
"""Load from file path."""
import mwparserfromhell
import mwxml
dump = mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
docs = []
for page in dump.pages:
for revision in page:
code = mwparserfromhell.parse(revision.text)
text = code.strip_code(
normalize=True, collapse=True, keep_template_params=False
)
metadata = {"source": page.title}
docs.append(Document(page_content=text, metadata=metadata))
return docs