MediaWiki docloader improvements + unit tests (#5879)

Starting over from #5654 because I utterly borked the poetry.lock file.

Adds new parameters to the MWDumpLoader class:

* skip_redirects (bool) Tells the loader to skip articles that redirect
to other articles. False by default.
* stop_on_error (bool) Tells the parser to stop when a page causes a
parse error; when False, such pages are skipped instead. True by default.
* namespaces (List[int]) Tells the parser which namespaces to parse.
Contains namespaces from -2 to 15 by default.

Default values are chosen to preserve backwards compatibility.

Sample dump XML and full unit test coverage (with extended tests that
pass!) also included!

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/7755/head
Gordon Clark 11 months ago committed by GitHub
parent 4c8106311f
commit 96f3dff050
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,9 +1,13 @@
"""Load Data from a MediaWiki dump xml."""
from typing import List, Optional
import logging
from pathlib import Path
from typing import List, Optional, Sequence, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__)
class MWDumpLoader(BaseLoader):
"""
@ -29,34 +33,64 @@ class MWDumpLoader(BaseLoader):
:type file_path: str
:param encoding: Charset encoding, defaults to "utf8"
:type encoding: str, optional
:param namespaces: The namespace of pages you want to parse.
See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
for a list of all common namespaces
:type namespaces: List[int],optional
:param skip_redirects: True to skip pages that redirect to other pages,
False to keep them. False by default
:type skip_redirects: bool, optional
:param stop_on_error: False to skip over pages that cause parsing errors,
True to stop. True by default
:type stop_on_error: bool, optional
"""
def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
    """Initialize with a file path.
    Args:
        file_path: XML local file path
        encoding: Charset encoding, defaults to "utf8"
    """
    # NOTE(review): `encoding` is accepted but never stored on the
    # instance, yet load() reads self.encoding when opening the dump —
    # confirm this would not raise AttributeError in the pre-change code.
    self.file_path = file_path
def __init__(
    self,
    file_path: Union[str, Path],
    encoding: Optional[str] = "utf8",
    namespaces: Optional[Sequence[int]] = None,
    skip_redirects: Optional[bool] = False,
    stop_on_error: Optional[bool] = True,
):
    """Initialize the loader.

    Args:
        file_path: Local path (str or Path) to the MediaWiki XML dump.
        encoding: Charset encoding of the dump, defaults to "utf8".
        namespaces: Namespace keys of the pages to parse. Defaults to the
            common MediaWiki namespaces, -2 through 15 inclusive.
        skip_redirects: True to skip pages that redirect to other pages,
            False to keep them. False by default.
        stop_on_error: False to skip over pages that cause parsing errors,
            True to stop and re-raise. True by default.
    """
    # str() accepts both str and Path, so no isinstance branch is needed.
    self.file_path = str(file_path)
    self.encoding = encoding
    # Compare against None explicitly: the previous `namespaces or default`
    # silently replaced an explicitly-passed empty sequence with the
    # default. Copy to a list so a caller's sequence can't mutate ours.
    if namespaces is None:
        # MediaWiki's common namespaces range from -2 to 15, inclusive.
        self.namespaces: Sequence[int] = list(range(-2, 16))
    else:
        self.namespaces = list(namespaces)
    self.skip_redirects = skip_redirects
    self.stop_on_error = stop_on_error
def load(self) -> List[Document]:
    """Load and parse pages from the MediaWiki dump file.

    Returns:
        One Document per parsed revision, with the stripped wikicode as
        page content and the page title under the "source" metadata key.

    Raises:
        ImportError: If mwparserfromhell or mwxml is not installed.
        Exception: Re-raised from the parser when stop_on_error is True.
    """
    try:
        import mwparserfromhell
        import mwxml
    except ImportError as e:
        raise ImportError(
            "Unable to import 'mwparserfromhell' or 'mwxml'. Please install with"
            " `pip install mwparserfromhell mwxml`."
        ) from e

    docs = []
    # Open via a context manager so the dump's file handle is always
    # closed; the original passed a bare open() result and leaked it.
    with open(self.file_path, encoding=self.encoding) as dump_file:
        dump = mwxml.Dump.from_file(dump_file)
        for page in dump.pages:
            if self.skip_redirects and page.redirect:
                continue
            if page.namespace not in self.namespaces:
                continue
            try:
                for revision in page:
                    code = mwparserfromhell.parse(revision.text)
                    text = code.strip_code(
                        normalize=True, collapse=True, keep_template_params=False
                    )
                    metadata = {"source": page.title}
                    docs.append(Document(page_content=text, metadata=metadata))
            except Exception as e:
                # Lazy %-style args avoid formatting when the level is off.
                logger.error("Parsing error: %s", e)
                if self.stop_on_error:
                    # Bare raise preserves the original traceback.
                    raise
                else:
                    continue
    return docs

138
poetry.lock generated

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
[[package]]
name = "absl-py"
@ -2402,6 +2402,17 @@ websocket-client = ">=0.32.0"
[package.extras]
ssh = ["paramiko (>=2.4.3)"]
[[package]]
name = "docopt"
version = "0.6.2"
description = "Pythonic argument parser, that will make you smile"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"},
]
[[package]]
name = "docutils"
version = "0.17.1"
@ -4358,6 +4369,18 @@ files = [
{file = "jq-1.4.1.tar.gz", hash = "sha256:52284ee3cb51670e6f537b0ec813654c064c1c0705bd910097ea0fe17313516d"},
]
[[package]]
name = "jsonable"
version = "0.3.1"
description = "An abstract class that supports jsonserialization/deserialization."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "jsonable-0.3.1-py2.py3-none-any.whl", hash = "sha256:f7754dd27b4734e42e7f8a61c2336bc98082f715e31e29a061a95843b102dc3a"},
{file = "jsonable-0.3.1.tar.gz", hash = "sha256:137b676e8e5819fa58518678c3d1f5463cab7e8466f69b3641cbc438042eaee4"},
]
[[package]]
name = "jsonlines"
version = "3.1.0"
@ -4382,7 +4405,6 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
]
[[package]]
@ -5654,6 +5676,94 @@ files = [
{file = "murmurhash-1.0.9.tar.gz", hash = "sha256:fe7a38cb0d3d87c14ec9dddc4932ffe2dbc77d75469ab80fd5014689b0e07b58"},
]
[[package]]
name = "mwcli"
version = "0.0.3"
description = "Utilities for processing MediaWiki on the command line."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "mwcli-0.0.3-py2.py3-none-any.whl", hash = "sha256:24a7e53730e6fa7e55626e4f2a61a0b016d5e0a9798306c1d8c71bcead0ab239"},
{file = "mwcli-0.0.3.tar.gz", hash = "sha256:00331bd0ff16b5721c9c6274d91e25fd355f45ec0773c8a0e3926eac058719a0"},
]
[package.dependencies]
docopt = "*"
mwxml = "*"
para = "*"
[[package]]
name = "mwparserfromhell"
version = "0.6.4"
description = "MWParserFromHell is a parser for MediaWiki wikicode."
category = "main"
optional = true
python-versions = ">= 3.6"
files = [
{file = "mwparserfromhell-0.6.4-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:fc4f5718e761a3f5ad76eb9089e0792ed3a6786095abe098e37e7ac7af76afef"},
{file = "mwparserfromhell-0.6.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7f19e7d064c467f32e0704becd81c841a807335934134d6aa859d98d01c7cf3"},
{file = "mwparserfromhell-0.6.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:511ff847cddb8e7014b6afb0af5dbdb5cf05ada67e31fc39efa34fbbdccb8e8b"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:60d86c8d3501edc1331b37df72b74689ee392da077c36a8b453460b8e3714cdd"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54d9819c11530fc00b8a70fa3508898109b3df72336f7b8e52f8faffbe03ee88"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d983914c19dee5c2a13298b1ccd3a1ed2b65c81d322b7e7df99cd5386a460c6"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1c908c9738c5c6bce04b825b3f95592d971ff439ace294a86fc758070afc6d0c"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-win32.whl", hash = "sha256:abae1052b9c12a8814c76dd26ec9cfdd71102e7f89c28fb58a5fba7ee55ad1bc"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-win_amd64.whl", hash = "sha256:f8c450c39ef647678831ecf9a1f8236521d369afc4ae59a9c601d07f298eda35"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:5dfd7f57fa3d516b21790ef7f6094119082baa2e6072cef78fb9f999b77e674f"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd0a74474ed6e85808c874511d28a253ffd2d1e5a3abe915705a25804212ac73"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a344ceabde013aa2f9b23494e73af11b99795f63a30124e955c185de2c8ae397"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:82010e5b5da130cbcb002747f5592ffca73488e0e9cf1ebdfef6e8559c535c41"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-win32.whl", hash = "sha256:2d6e124396ee41c35ea12017a66c560abb1f7f51bee04e631a149318adaf15e2"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-win_amd64.whl", hash = "sha256:0519497b8a7472298324ef92e1e82c1ab5cab85b4d64462d7ae46c4464c8b872"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:dd81220d66bf829664a6f911b3f58e7af3061fd7fdee68c0fc9731f5bcd7519d"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:203ad9cd78dec7480fde45c9f49d0bc2a2eaa28fa1b585461fb9f56f6587f46c"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d63d76f576e133c14a29f1ad2f3fc2afa17b74945ebc017e8d7d3bcb59f5243c"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7f5682ab9e1a55b20e9fb669582493d196c76a512276456848153c39d726d7d2"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-win32.whl", hash = "sha256:e0c3d3bc409f8ac1221639ee2dab0dc830711d9a56a39014aad2824c2c98b3e2"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-win_amd64.whl", hash = "sha256:18dac5162471c38e5bbf6ee3698c49d2753e8dde372864112fdaf81047ce89d3"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:7c822985760b9e82857ecfb99dbb60ac35d0ebf7b2977a0215c7c56fe70c2b68"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d521a2e3787c83ecf607a7806ae655d32f3c3884b2dcf35a388183c6028ddce4"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42eb94a6bad20b7f8845fd2900a45373cb4d414d5a357b27457c7c6c259115c5"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:da2568e2a492dcc913f1c026434371af7a24e05dc56450c3ab063d9e580b48f2"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-win32.whl", hash = "sha256:3e5f4bb96b68557acd14c4baa62cbe440b6e6d0f5263cb4860d37e1ceeada2a7"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:0c2bbb36110410b5f6d6d8b2f35f65f8ec8f57c0477609d35bcaac4784a59e5a"},
{file = "mwparserfromhell-0.6.4.tar.gz", hash = "sha256:92bec9528ae34d272893ccaf2b527df85c314ff28cfbb3056340467b095d834c"},
]
[[package]]
name = "mwtypes"
version = "0.3.2"
description = "A set of types for processing MediaWiki data."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "mwtypes-0.3.2-py2.py3-none-any.whl", hash = "sha256:d6f3cae90eea4c88bc260101c8a082fb0ab22cca88e7474657b28cd9538794f3"},
{file = "mwtypes-0.3.2.tar.gz", hash = "sha256:dc1176c5965629c123e859b319ae6151d4e385531e9a781604c0d4ca3434e399"},
]
[package.dependencies]
jsonable = ">=0.3.0"
[[package]]
name = "mwxml"
version = "0.3.3"
description = "A set of utilities for processing MediaWiki XML dump data."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "mwxml-0.3.3-py2.py3-none-any.whl", hash = "sha256:9695848b8b6987b6f6addc2a8accba5b2bcbc543702598194e182b508ab568a9"},
{file = "mwxml-0.3.3.tar.gz", hash = "sha256:0848df0cf2e293718f554311acf4715bd679f639f4e52cbe47d8206589db1d31"},
]
[package.dependencies]
jsonschema = ">=2.5.1"
mwcli = ">=0.0.2"
mwtypes = ">=0.3.0"
para = ">=0.0.1"
[[package]]
name = "mypy"
version = "0.991"
@ -7061,6 +7171,18 @@ files = [
{file = "pandocfilters-1.5.0.tar.gz", hash = "sha256:0b679503337d233b4339a817bfc8c50064e2eff681314376a47cb582305a7a38"},
]
[[package]]
name = "para"
version = "0.0.8"
description = "a set utilities that ake advantage of python's 'multiprocessing' module to distribute CPU-intensive tasks"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "para-0.0.8-py3-none-any.whl", hash = "sha256:c63b030658cafd84f8fabfc000142324d51c7440e50ef5012fd1a54972ca25f4"},
{file = "para-0.0.8.tar.gz", hash = "sha256:46c3232ae9d8ea9d886cfd08cdd112892202bed8645f40b6255597ba4cfef217"},
]
[[package]]
name = "parso"
version = "0.8.3"
@ -11445,7 +11567,7 @@ files = [
]
[package.dependencies]
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\""}
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\" or extra == \"torch\""}
filelock = "*"
huggingface-hub = ">=0.14.1,<1.0"
numpy = ">=1.17"
@ -12702,15 +12824,15 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
all = ["anthropic", "clarifai", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "marqo", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "libdeeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb", "nebula3-python", "awadb", "esprima", "octoai-sdk", "rdflib"]
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-search-documents"]
all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "libdeeplake", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "octoai-sdk", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "spacy", "steamship", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"]
clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "jq", "pdfminer-six", "pgvector", "pypdf", "pymupdf", "pypdfium2", "tqdm", "lxml", "atlassian-python-api", "beautifulsoup4", "pandas", "telethon", "psychicapi", "zep-python", "gql", "requests-toolbelt", "html2text", "py-trello", "scikit-learn", "streamlit", "pyspark", "openai", "sympy", "rapidfuzz"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "mwparserfromhell", "mwxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "zep-python"]
javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "openai", "openllm", "openlm", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"]
qdrant = ["qdrant-client"]
text-helpers = ["chardet"]
@ -12718,4 +12840,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "7c3eeaa43dead997a66d01a3ba3799656d216d20011329f6a14fcf653cc658b7"
content-hash = "14b61a9483499285cf02b991eb176a540d76704c6859c525827e8e271e106c4a"

@ -108,6 +108,8 @@ pyspark = {version = "^3.4.0", optional = true}
clarifai = {version = ">=9.1.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true}
mwparserfromhell = {version = "^0.6.4", optional = true}
mwxml = {version = "^0.3.3", optional = true}
awadb = {version = "^0.3.3", optional = true}
azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
esprima = {version = "^4.0.1", optional = true}
@ -343,7 +345,8 @@ extended_testing = [
"tqdm",
"lxml",
"atlassian-python-api",
"beautifulsoup4",
"mwparserfromhell",
"mwxml",
"pandas",
"telethon",
"psychicapi",

@ -0,0 +1,211 @@
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="en">
<siteinfo>
<sitename>Text Wiki</sitename>
<dbname>test123</dbname>
<base>http://control.fandom.com/wiki/Control_Wiki</base>
<generator>MediaWiki 1.37.3</generator>
<case>first-letter</case>
<namespaces>
<namespace key="-2" case="first-letter">Media</namespace>
<namespace key="-1" case="first-letter">Special</namespace>
<namespace key="0" case="first-letter" />
<namespace key="1" case="first-letter">Talk</namespace>
<namespace key="2" case="first-letter">User</namespace>
<namespace key="3" case="first-letter">User talk</namespace>
<namespace key="4" case="first-letter">Text Wiki</namespace>
<namespace key="5" case="first-letter">Text Wiki talk</namespace>
<namespace key="6" case="first-letter">File</namespace>
<namespace key="7" case="first-letter">File talk</namespace>
<namespace key="8" case="first-letter">MediaWiki</namespace>
<namespace key="9" case="first-letter">MediaWiki talk</namespace>
<namespace key="10" case="first-letter">Template</namespace>
<namespace key="11" case="first-letter">Template talk</namespace>
<namespace key="12" case="first-letter">Help</namespace>
<namespace key="13" case="first-letter">Help talk</namespace>
<namespace key="14" case="first-letter">Category</namespace>
<namespace key="15" case="first-letter">Category talk</namespace>
<namespace key="110" case="first-letter">Forum</namespace>
<namespace key="111" case="first-letter">Forum talk</namespace>
<namespace key="420" case="first-letter">GeoJson</namespace>
<namespace key="421" case="first-letter">GeoJson talk</namespace>
<namespace key="500" case="first-letter">User blog</namespace>
<namespace key="501" case="first-letter">User blog comment</namespace>
<namespace key="502" case="first-letter">Blog</namespace>
<namespace key="503" case="first-letter">Blog talk</namespace>
<namespace key="710" case="first-letter">TimedText</namespace>
<namespace key="711" case="first-letter">TimedText talk</namespace>
<namespace key="828" case="first-letter">Module</namespace>
<namespace key="829" case="first-letter">Module talk</namespace>
<namespace key="1200" case="first-letter">Message Wall</namespace>
<namespace key="1201" case="first-letter">Thread</namespace>
<namespace key="1202" case="first-letter">Message Wall Greeting</namespace>
<namespace key="2000" case="first-letter">Board</namespace>
<namespace key="2001" case="first-letter">Board Thread</namespace>
<namespace key="2002" case="first-letter">Topic</namespace>
<namespace key="2900" case="first-letter">Map</namespace>
<namespace key="2901" case="first-letter">Map talk</namespace>
</namespaces>
</siteinfo>
<page>
<title>Whiskers the Cat</title>
<ns>0</ns>
<id>190</id>
<revision>
<id>14802</id>
<parentid>14312</parentid>
<timestamp>2022-04-30T04:37:40Z</timestamp>
<contributor>
<username>Test user</username>
<id>47482455</id>
</contributor>
<minor/>
<comment>/* External links */Unicode+Fixes</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="233" sha1="qa5cny8ozb0vw4ahvoxevpe0u8f03lw" xml:space="preserve">
{{Standard Animal
|title = Whiskers the Cat
|image =whiskers.jpg
|full_name = Sir Whiskers Whiskerington Whiskey Wiskerton III
|birth = May 9, 2018&lt;br&gt;Portland, Maine, U.S.
|age = {{Age|2018|5|09}}
|gender = Male
|nationality = American
|occupation = Cat
}}
'''Whiskers the Cat '''is an American cat.
== Gallery ==
&lt;gallery widths="200" spacing="small" position="left" captionalign="left" hideaddbutton="true"&gt;
whiskers-on-the-floor.jpg|Whiskers is known for taking long naps
&lt;/gallery&gt;
== Appearance and Personality ==
Whiskers is a medium-sized cat with a soft and fluffy coat. Its fur is predominantly white, with patches of orange and gray, giving it a unique and eye-catching appearance. But what truly sets Whiskers apart are its long, elegant whiskers that frame its adorable face.
Aside from its adorable appearance, Whiskers is known for its friendly and sociable personality. This cat is extremely affectionate and loves to curl up on laps for a cozy cuddle. It enjoys playing with toys, chasing laser pointers, and exploring its surroundings with curiosity.
== Popularity ==
Whiskers rose to fame through social media platforms, where its adorable pictures and videos quickly went viral. Many internet users were captivated by its cuteness and endearing antics, leading to a dedicated following of fans and admirers.
== Legacy ==
Whiskers has become an iconic internet sensation, symbolizing the charm and playfulness of cats. It has inspired countless memes, fan art, and even merchandise. Whiskers continues to bring joy and happiness to people around the world, reminding us of the beauty and companionship that pets can bring into our lives.
== References ==
{{Reflist}}
== External links ==
* [[Wikipedia:Cat]] on Wikipedia
{{WikiNavbox}}
[[Category:Cats]]</text>
<sha1>qa5cny8ozb0vw4ahvoxevpe0u8f03lw</sha1>
</revision>
</page>
<page>
<title>File:TestPics.jpg</title>
<ns>6</ns>
<id>123</id>
<revision>
<id>14331</id>
<parentid>14802</parentid>
<timestamp>2023-04-17T16:39:12Z</timestamp>
<contributor>
<username>user</username>
<id>123456</id>
</contributor>
<minor/>
<comment>user moved page [[File:920x920.jpg]] to [[File:test.jpg]] for reasons</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="25" sha1="1diptt6cjtef34rewxvcmnduax281kz40p" xml:space="preserve">
[[Category:Test images]]</text>
<sha1>1diptt6cjter6axvcmnduax281kz40p</sha1>
</revision>
</page>
<page>
<title>Gallagher</title>
<ns>0</ns>
<id>789</id>
<revision>
<id></id>
<parentid>14312</parentid>
<timestamp>2022-04-30T04:37:40Z</timestamp>
<contributor>
<username>Test User</username>
<id>474112455</id>
</contributor>
<minor/>
<comment>/* External links */Unicode+Fixes</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="942" sha1="qa5cny8ozb0vw4ahvoxevpe0u8f03lw" xml:space="preserve">
{{Standard Animal
|title = Gallagher
|image =gallagher-hockey.jpg
|full_name = Gallager Golden
|birth = November 13, 1994&lt;br&gt;Boston, Massachusetts, U.S.
|age = {{Age|1994|11|13}}
|gender = Male
|nationality = American
|occupation = Professional Hockey Player
}}
== Gallagher (Golden Retriever) ==
[[File:Gallagher the Golden Retriever.jpg|thumb|Gallagher, the talented golden retriever]]
'''Gallagher''' is a golden retriever who gained widespread attention as a professional hockey player for the Boston Bruins during the 1997-1998 season. This remarkable canine athlete captivated audiences with its exceptional skills on the ice and became an icon in the world of sports.
== Career ==
Gallagher's journey to becoming a professional hockey player began when it was discovered by Bruins' scouts during a charity event. Impressed by its agility, speed, and natural talent for puck handling, the team decided to sign Gallagher as an honorary member.
During the 1997-1998 season, Gallagher proved to be a remarkable asset to the Bruins. Although unconventional, its ability to navigate the rink, steal the puck, and assist in scoring goals astonished both teammates and opponents alike. Despite the challenges of being a dog in a human-dominated sport, Gallagher's determination and love for the game were unmatched.
== Statistics ==
Throughout the 1997-1998 season, Gallagher's statistics were nothing short of extraordinary:
- Goals: 25
- Assists: 40
- Penalties: 10
These impressive numbers placed Gallagher among the league's top performers and solidified its status as an unforgettable player.
== Legacy ==
Gallagher's incredible achievements as a golden retriever hockey player left an indelible mark on the sport. It inspired a generation of fans and players alike, proving that dedication, teamwork, and a little bit of canine charm can overcome any obstacle.
While Gallagher's hockey career was limited to a single season, its impact transcended the game. The dog's popularity skyrocketed, leading to numerous endorsements, appearances in movies and commercials, and even a line of dog-themed hockey merchandise.
== References ==
{{Reflist}}
[[Category: Animals]]
</text>
<sha1>qa5cny9y86ftfuyyuoxevpe0u8f03lw</sha1>
</revision>
</page>
<page>
<title>Sir Whiskers Whiskerington Whiskey Wiskerton III</title>
<ns>0</ns>
<id>101</id>
<redirect title="Whiskers" />
<revision>
<id>455</id>
<timestamp>2018-10-17T04:10:06Z</timestamp>
<contributor>
<username>Test User</username>
<id>35032394</id>
</contributor>
<comment>User moved page [[Sir Whiskers Whiskerington Whiskey Wiskerton III]] to [[Whiskers]]</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="22" sha1="e854tjjb9a233fsdfs6bp1ve4v8g2qnkma" xml:space="preserve">#REDIRECT [[Whiskers]]</text>
<sha1>e854tjjb9a233fsdfs6bp1ve4v8g2qnkma</sha1>
</revision>
</page>
</mediawiki>

@ -0,0 +1,48 @@
from pathlib import Path
import pytest
from langchain.document_loaders.mediawikidump import MWDumpLoader
PARENT_DIR = Path(__file__).parent / "sample_documents"
@pytest.mark.requires("mwparserfromhell", "mwxml")
def test_loading_flawed_xml() -> None:
    """With the default stop_on_error=True, a malformed page aborts loading."""
    dump_path = (PARENT_DIR / "mwtest_current_pages.xml").absolute()
    loader = MWDumpLoader(dump_path)
    with pytest.raises(TypeError):
        loader.load()
@pytest.mark.requires("mwparserfromhell", "mwxml")
def test_skipping_errors() -> None:
    """stop_on_error=False skips the malformed page, leaving three documents."""
    dump_path = (PARENT_DIR / "mwtest_current_pages.xml").absolute()
    documents = MWDumpLoader(file_path=dump_path, stop_on_error=False).load()
    assert len(documents) == 3
@pytest.mark.requires("mwparserfromhell", "mwxml")
def test_skipping_redirects() -> None:
    """skip_redirects=True drops the redirect page as well, leaving two docs."""
    dump_path = (PARENT_DIR / "mwtest_current_pages.xml").absolute()
    documents = MWDumpLoader(
        file_path=dump_path,
        skip_redirects=True,
        stop_on_error=False,
    ).load()
    assert len(documents) == 2
@pytest.mark.requires("mwparserfromhell", "mwxml")
def test_multiple_namespaces() -> None:
    """Restricting to namespaces 0 and 6 yields the two loadable pages."""
    loader = MWDumpLoader(
        file_path=(PARENT_DIR / "mwtest_current_pages.xml").absolute(),
        namespaces=[0, 6],
        skip_redirects=True,
        stop_on_error=False,
    )
    documents = loader.load()
    # Removed the leftover debug line `[print(doc) for doc in documents]`:
    # a comprehension used purely for side effects that spams test output.
    assert len(documents) == 2
Loading…
Cancel
Save