MediaWiki docloader improvements + unit tests (#5879)

Starting over from #5654 because I utterly borked the poetry.lock file.

Adds new parameters to the MWDumpLoader class:

* skip_redirects (bool) Tells the loader to skip articles that redirect
to other articles. False by default.
* stop_on_error (bool) Tells the parser to stop when a page causes a
parse error; when False, such pages are skipped instead. True by default.
* namespaces (List[int]) Tells the parser which namespaces to parse.
Contains namespaces from -2 to 15 by default.

Default values are chosen to preserve backwards compatibility.

Sample dump XML and full unit test coverage (with extended tests that
pass!) also included!

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/7755/head
Gordon Clark 11 months ago committed by GitHub
parent 4c8106311f
commit 96f3dff050
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,9 +1,13 @@
"""Load Data from a MediaWiki dump xml."""
from typing import List, Optional
import logging
from pathlib import Path
from typing import List, Optional, Sequence, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__)
class MWDumpLoader(BaseLoader):
"""
@ -29,34 +33,64 @@ class MWDumpLoader(BaseLoader):
:type file_path: str
:param encoding: Charset encoding, defaults to "utf8"
:type encoding: str, optional
:param namespaces: The namespace of pages you want to parse.
See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
for a list of all common namespaces
:type namespaces: List[int],optional
:param skip_redirects: True to skip pages that redirect to other pages,
False to keep them. False by default
:type skip_redirects: bool, optional
:param stop_on_error: False to skip over pages that cause parsing errors,
True to stop. True by default
:type stop_on_error: bool, optional
"""
def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
    """Initialize with a file path.
    Args:
        file_path: XML local file path
        encoding: Charset encoding, defaults to "utf8"
    """
    # NOTE(review): `encoding` is accepted but never stored on the
    # instance, yet load() reads self.encoding when opening the dump —
    # confirm this would not raise AttributeError in the pre-change code.
    self.file_path = file_path
def __init__(
    self,
    file_path: Union[str, Path],
    encoding: Optional[str] = "utf8",
    namespaces: Optional[Sequence[int]] = None,
    skip_redirects: Optional[bool] = False,
    stop_on_error: Optional[bool] = True,
):
    """Initialize the loader.

    Args:
        file_path: Local path (str or Path) to the MediaWiki XML dump.
        encoding: Charset encoding of the dump, defaults to "utf8".
        namespaces: Namespace keys of the pages to parse. Defaults to the
            common MediaWiki namespaces, -2 through 15 inclusive.
        skip_redirects: True to skip pages that redirect to other pages,
            False to keep them. False by default.
        stop_on_error: False to skip over pages that cause parsing errors,
            True to stop and re-raise. True by default.
    """
    # str() accepts both str and Path, so no isinstance branch is needed.
    self.file_path = str(file_path)
    self.encoding = encoding
    # Compare against None explicitly: the previous `namespaces or default`
    # silently replaced an explicitly-passed empty sequence with the
    # default. Copy to a list so a caller's sequence can't mutate ours.
    if namespaces is None:
        # MediaWiki's common namespaces range from -2 to 15, inclusive.
        self.namespaces: Sequence[int] = list(range(-2, 16))
    else:
        self.namespaces = list(namespaces)
    self.skip_redirects = skip_redirects
    self.stop_on_error = stop_on_error
def load(self) -> List[Document]:
    """Load and parse pages from the MediaWiki dump file.

    Returns:
        One Document per parsed revision, with the stripped wikicode as
        page content and the page title under the "source" metadata key.

    Raises:
        ImportError: If mwparserfromhell or mwxml is not installed.
        Exception: Re-raised from the parser when stop_on_error is True.
    """
    try:
        import mwparserfromhell
        import mwxml
    except ImportError as e:
        raise ImportError(
            "Unable to import 'mwparserfromhell' or 'mwxml'. Please install with"
            " `pip install mwparserfromhell mwxml`."
        ) from e

    docs = []
    # Open via a context manager so the dump's file handle is always
    # closed; the original passed a bare open() result and leaked it.
    with open(self.file_path, encoding=self.encoding) as dump_file:
        dump = mwxml.Dump.from_file(dump_file)
        for page in dump.pages:
            if self.skip_redirects and page.redirect:
                continue
            if page.namespace not in self.namespaces:
                continue
            try:
                for revision in page:
                    code = mwparserfromhell.parse(revision.text)
                    text = code.strip_code(
                        normalize=True, collapse=True, keep_template_params=False
                    )
                    metadata = {"source": page.title}
                    docs.append(Document(page_content=text, metadata=metadata))
            except Exception as e:
                # Lazy %-style args avoid formatting when the level is off.
                logger.error("Parsing error: %s", e)
                if self.stop_on_error:
                    # Bare raise preserves the original traceback.
                    raise
                else:
                    continue
    return docs

138
poetry.lock generated

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
[[package]]
name = "absl-py"
@ -2402,6 +2402,17 @@ websocket-client = ">=0.32.0"
[package.extras]
ssh = ["paramiko (>=2.4.3)"]
[[package]]
name = "docopt"
version = "0.6.2"
description = "Pythonic argument parser, that will make you smile"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"},
]
[[package]]
name = "docutils"
version = "0.17.1"
@ -4358,6 +4369,18 @@ files = [
{file = "jq-1.4.1.tar.gz", hash = "sha256:52284ee3cb51670e6f537b0ec813654c064c1c0705bd910097ea0fe17313516d"},
]
[[package]]
name = "jsonable"
version = "0.3.1"
description = "An abstract class that supports jsonserialization/deserialization."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "jsonable-0.3.1-py2.py3-none-any.whl", hash = "sha256:f7754dd27b4734e42e7f8a61c2336bc98082f715e31e29a061a95843b102dc3a"},
{file = "jsonable-0.3.1.tar.gz", hash = "sha256:137b676e8e5819fa58518678c3d1f5463cab7e8466f69b3641cbc438042eaee4"},
]
[[package]]
name = "jsonlines"
version = "3.1.0"
@ -4382,7 +4405,6 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
]
[[package]]
@ -5654,6 +5676,94 @@ files = [
{file = "murmurhash-1.0.9.tar.gz", hash = "sha256:fe7a38cb0d3d87c14ec9dddc4932ffe2dbc77d75469ab80fd5014689b0e07b58"},
]
[[package]]
name = "mwcli"
version = "0.0.3"
description = "Utilities for processing MediaWiki on the command line."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "mwcli-0.0.3-py2.py3-none-any.whl", hash = "sha256:24a7e53730e6fa7e55626e4f2a61a0b016d5e0a9798306c1d8c71bcead0ab239"},
{file = "mwcli-0.0.3.tar.gz", hash = "sha256:00331bd0ff16b5721c9c6274d91e25fd355f45ec0773c8a0e3926eac058719a0"},
]
[package.dependencies]
docopt = "*"
mwxml = "*"
para = "*"
[[package]]
name = "mwparserfromhell"
version = "0.6.4"
description = "MWParserFromHell is a parser for MediaWiki wikicode."
category = "main"
optional = true
python-versions = ">= 3.6"
files = [
{file = "mwparserfromhell-0.6.4-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:fc4f5718e761a3f5ad76eb9089e0792ed3a6786095abe098e37e7ac7af76afef"},
{file = "mwparserfromhell-0.6.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7f19e7d064c467f32e0704becd81c841a807335934134d6aa859d98d01c7cf3"},
{file = "mwparserfromhell-0.6.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:511ff847cddb8e7014b6afb0af5dbdb5cf05ada67e31fc39efa34fbbdccb8e8b"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:60d86c8d3501edc1331b37df72b74689ee392da077c36a8b453460b8e3714cdd"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54d9819c11530fc00b8a70fa3508898109b3df72336f7b8e52f8faffbe03ee88"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d983914c19dee5c2a13298b1ccd3a1ed2b65c81d322b7e7df99cd5386a460c6"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1c908c9738c5c6bce04b825b3f95592d971ff439ace294a86fc758070afc6d0c"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-win32.whl", hash = "sha256:abae1052b9c12a8814c76dd26ec9cfdd71102e7f89c28fb58a5fba7ee55ad1bc"},
{file = "mwparserfromhell-0.6.4-cp36-cp36m-win_amd64.whl", hash = "sha256:f8c450c39ef647678831ecf9a1f8236521d369afc4ae59a9c601d07f298eda35"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:5dfd7f57fa3d516b21790ef7f6094119082baa2e6072cef78fb9f999b77e674f"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd0a74474ed6e85808c874511d28a253ffd2d1e5a3abe915705a25804212ac73"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a344ceabde013aa2f9b23494e73af11b99795f63a30124e955c185de2c8ae397"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:82010e5b5da130cbcb002747f5592ffca73488e0e9cf1ebdfef6e8559c535c41"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-win32.whl", hash = "sha256:2d6e124396ee41c35ea12017a66c560abb1f7f51bee04e631a149318adaf15e2"},
{file = "mwparserfromhell-0.6.4-cp37-cp37m-win_amd64.whl", hash = "sha256:0519497b8a7472298324ef92e1e82c1ab5cab85b4d64462d7ae46c4464c8b872"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:dd81220d66bf829664a6f911b3f58e7af3061fd7fdee68c0fc9731f5bcd7519d"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:203ad9cd78dec7480fde45c9f49d0bc2a2eaa28fa1b585461fb9f56f6587f46c"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d63d76f576e133c14a29f1ad2f3fc2afa17b74945ebc017e8d7d3bcb59f5243c"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7f5682ab9e1a55b20e9fb669582493d196c76a512276456848153c39d726d7d2"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-win32.whl", hash = "sha256:e0c3d3bc409f8ac1221639ee2dab0dc830711d9a56a39014aad2824c2c98b3e2"},
{file = "mwparserfromhell-0.6.4-cp38-cp38-win_amd64.whl", hash = "sha256:18dac5162471c38e5bbf6ee3698c49d2753e8dde372864112fdaf81047ce89d3"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:7c822985760b9e82857ecfb99dbb60ac35d0ebf7b2977a0215c7c56fe70c2b68"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d521a2e3787c83ecf607a7806ae655d32f3c3884b2dcf35a388183c6028ddce4"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42eb94a6bad20b7f8845fd2900a45373cb4d414d5a357b27457c7c6c259115c5"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:da2568e2a492dcc913f1c026434371af7a24e05dc56450c3ab063d9e580b48f2"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-win32.whl", hash = "sha256:3e5f4bb96b68557acd14c4baa62cbe440b6e6d0f5263cb4860d37e1ceeada2a7"},
{file = "mwparserfromhell-0.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:0c2bbb36110410b5f6d6d8b2f35f65f8ec8f57c0477609d35bcaac4784a59e5a"},
{file = "mwparserfromhell-0.6.4.tar.gz", hash = "sha256:92bec9528ae34d272893ccaf2b527df85c314ff28cfbb3056340467b095d834c"},
]
[[package]]
name = "mwtypes"
version = "0.3.2"
description = "A set of types for processing MediaWiki data."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "mwtypes-0.3.2-py2.py3-none-any.whl", hash = "sha256:d6f3cae90eea4c88bc260101c8a082fb0ab22cca88e7474657b28cd9538794f3"},
{file = "mwtypes-0.3.2.tar.gz", hash = "sha256:dc1176c5965629c123e859b319ae6151d4e385531e9a781604c0d4ca3434e399"},
]
[package.dependencies]
jsonable = ">=0.3.0"
[[package]]
name = "mwxml"
version = "0.3.3"
description = "A set of utilities for processing MediaWiki XML dump data."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "mwxml-0.3.3-py2.py3-none-any.whl", hash = "sha256:9695848b8b6987b6f6addc2a8accba5b2bcbc543702598194e182b508ab568a9"},
{file = "mwxml-0.3.3.tar.gz", hash = "sha256:0848df0cf2e293718f554311acf4715bd679f639f4e52cbe47d8206589db1d31"},
]
[package.dependencies]
jsonschema = ">=2.5.1"
mwcli = ">=0.0.2"
mwtypes = ">=0.3.0"
para = ">=0.0.1"
[[package]]
name = "mypy"
version = "0.991"
@ -7061,6 +7171,18 @@ files = [
{file = "pandocfilters-1.5.0.tar.gz", hash = "sha256:0b679503337d233b4339a817bfc8c50064e2eff681314376a47cb582305a7a38"},
]
[[package]]
name = "para"
version = "0.0.8"
description = "a set utilities that ake advantage of python's 'multiprocessing' module to distribute CPU-intensive tasks"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "para-0.0.8-py3-none-any.whl", hash = "sha256:c63b030658cafd84f8fabfc000142324d51c7440e50ef5012fd1a54972ca25f4"},
{file = "para-0.0.8.tar.gz", hash = "sha256:46c3232ae9d8ea9d886cfd08cdd112892202bed8645f40b6255597ba4cfef217"},
]
[[package]]
name = "parso"
version = "0.8.3"
@ -11445,7 +11567,7 @@ files = [
]
[package.dependencies]
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\""}
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\" or extra == \"torch\""}
filelock = "*"
huggingface-hub = ">=0.14.1,<1.0"
numpy = ">=1.17"
@ -12702,15 +12824,15 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
all = ["anthropic", "clarifai", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "marqo", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "libdeeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb", "nebula3-python", "awadb", "esprima", "octoai-sdk", "rdflib"]
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-search-documents"]
all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "libdeeplake", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "octoai-sdk", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "spacy", "steamship", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"]
clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "jq", "pdfminer-six", "pgvector", "pypdf", "pymupdf", "pypdfium2", "tqdm", "lxml", "atlassian-python-api", "beautifulsoup4", "pandas", "telethon", "psychicapi", "zep-python", "gql", "requests-toolbelt", "html2text", "py-trello", "scikit-learn", "streamlit", "pyspark", "openai", "sympy", "rapidfuzz"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "gql", "html2text", "jq", "lxml", "mwparserfromhell", "mwxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "zep-python"]
javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "openai", "openllm", "openlm", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"]
qdrant = ["qdrant-client"]
text-helpers = ["chardet"]
@ -12718,4 +12840,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "7c3eeaa43dead997a66d01a3ba3799656d216d20011329f6a14fcf653cc658b7"
content-hash = "14b61a9483499285cf02b991eb176a540d76704c6859c525827e8e271e106c4a"

@ -108,6 +108,8 @@ pyspark = {version = "^3.4.0", optional = true}
clarifai = {version = ">=9.1.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true}
mwparserfromhell = {version = "^0.6.4", optional = true}
mwxml = {version = "^0.3.3", optional = true}
awadb = {version = "^0.3.3", optional = true}
azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
esprima = {version = "^4.0.1", optional = true}
@ -343,7 +345,8 @@ extended_testing = [
"tqdm",
"lxml",
"atlassian-python-api",
"beautifulsoup4",
"mwparserfromhell",
"mwxml",
"pandas",
"telethon",
"psychicapi",

@ -0,0 +1,211 @@
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="en">
<siteinfo>
<sitename>Text Wiki</sitename>
<dbname>test123</dbname>
<base>http://control.fandom.com/wiki/Control_Wiki</base>
<generator>MediaWiki 1.37.3</generator>
<case>first-letter</case>
<namespaces>
<namespace key="-2" case="first-letter">Media</namespace>
<namespace key="-1" case="first-letter">Special</namespace>
<namespace key="0" case="first-letter" />
<namespace key="1" case="first-letter">Talk</namespace>
<namespace key="2" case="first-letter">User</namespace>
<namespace key="3" case="first-letter">User talk</namespace>
<namespace key="4" case="first-letter">Text Wiki</namespace>
<namespace key="5" case="first-letter">Text Wiki talk</namespace>
<namespace key="6" case="first-letter">File</namespace>
<namespace key="7" case="first-letter">File talk</namespace>
<namespace key="8" case="first-letter">MediaWiki</namespace>
<namespace key="9" case="first-letter">MediaWiki talk</namespace>
<namespace key="10" case="first-letter">Template</namespace>
<namespace key="11" case="first-letter">Template talk</namespace>
<namespace key="12" case="first-letter">Help</namespace>
<namespace key="13" case="first-letter">Help talk</namespace>
<namespace key="14" case="first-letter">Category</namespace>
<namespace key="15" case="first-letter">Category talk</namespace>
<namespace key="110" case="first-letter">Forum</namespace>
<namespace key="111" case="first-letter">Forum talk</namespace>
<namespace key="420" case="first-letter">GeoJson</namespace>
<namespace key="421" case="first-letter">GeoJson talk</namespace>
<namespace key="500" case="first-letter">User blog</namespace>
<namespace key="501" case="first-letter">User blog comment</namespace>
<namespace key="502" case="first-letter">Blog</namespace>
<namespace key="503" case="first-letter">Blog talk</namespace>
<namespace key="710" case="first-letter">TimedText</namespace>
<namespace key="711" case="first-letter">TimedText talk</namespace>
<namespace key="828" case="first-letter">Module</namespace>
<namespace key="829" case="first-letter">Module talk</namespace>
<namespace key="1200" case="first-letter">Message Wall</namespace>
<namespace key="1201" case="first-letter">Thread</namespace>
<namespace key="1202" case="first-letter">Message Wall Greeting</namespace>
<namespace key="2000" case="first-letter">Board</namespace>
<namespace key="2001" case="first-letter">Board Thread</namespace>
<namespace key="2002" case="first-letter">Topic</namespace>
<namespace key="2900" case="first-letter">Map</namespace>
<namespace key="2901" case="first-letter">Map talk</namespace>
</namespaces>
</siteinfo>
<page>
<title>Whiskers the Cat</title>
<ns>0</ns>
<id>190</id>
<revision>
<id>14802</id>
<parentid>14312</parentid>
<timestamp>2022-04-30T04:37:40Z</timestamp>
<contributor>
<username>Test user</username>
<id>47482455</id>
</contributor>
<minor/>
<comment>/* External links */Unicode+Fixes</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="233" sha1="qa5cny8ozb0vw4ahvoxevpe0u8f03lw" xml:space="preserve">
{{Standard Animal
|title = Whiskers the Cat
|image =whiskers.jpg
|full_name = Sir Whiskers Whiskerington Whiskey Wiskerton III
|birth = May 9, 2018&lt;br&gt;Portland, Maine, U.S.
|age = {{Age|2018|5|09}}
|gender = Male
|nationality = American
|occupation = Cat
}}
'''Whiskers the Cat '''is an American cat.
== Gallery ==
&lt;gallery widths="200" spacing="small" position="left" captionalign="left" hideaddbutton="true"&gt;
whiskers-on-the-floor.jpg|Whiskers is known for taking long naps
&lt;/gallery&gt;
== Appearance and Personality ==
Whiskers is a medium-sized cat with a soft and fluffy coat. Its fur is predominantly white, with patches of orange and gray, giving it a unique and eye-catching appearance. But what truly sets Whiskers apart are its long, elegant whiskers that frame its adorable face.
Aside from its adorable appearance, Whiskers is known for its friendly and sociable personality. This cat is extremely affectionate and loves to curl up on laps for a cozy cuddle. It enjoys playing with toys, chasing laser pointers, and exploring its surroundings with curiosity.
== Popularity ==
Whiskers rose to fame through social media platforms, where its adorable pictures and videos quickly went viral. Many internet users were captivated by its cuteness and endearing antics, leading to a dedicated following of fans and admirers.
== Legacy ==
Whiskers has become an iconic internet sensation, symbolizing the charm and playfulness of cats. It has inspired countless memes, fan art, and even merchandise. Whiskers continues to bring joy and happiness to people around the world, reminding us of the beauty and companionship that pets can bring into our lives.
== References ==
{{Reflist}}
== External links ==
* [[Wikipedia:Cat]] on Wikipedia
{{WikiNavbox}}
[[Category:Cats]]</text>
<sha1>qa5cny8ozb0vw4ahvoxevpe0u8f03lw</sha1>
</revision>
</page>
<page>
<title>File:TestPics.jpg</title>
<ns>6</ns>
<id>123</id>
<revision>
<id>14331</id>
<parentid>14802</parentid>
<timestamp>2023-04-17T16:39:12Z</timestamp>
<contributor>
<username>user</username>
<id>123456</id>
</contributor>
<minor/>
<comment>user moved page [[File:920x920.jpg]] to [[File:test.jpg]] for reasons</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="25" sha1="1diptt6cjtef34rewxvcmnduax281kz40p" xml:space="preserve">
[[Category:Test images]]</text>
<sha1>1diptt6cjter6axvcmnduax281kz40p</sha1>
</revision>
</page>
<page>
<title>Gallagher</title>
<ns>0</ns>
<id>789</id>
<revision>
<id></id>
<parentid>14312</parentid>
<timestamp>2022-04-30T04:37:40Z</timestamp>
<contributor>
<username>Test User</username>
<id>474112455</id>
</contributor>
<minor/>
<comment>/* External links */Unicode+Fixes</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="942" sha1="qa5cny8ozb0vw4ahvoxevpe0u8f03lw" xml:space="preserve">
{{Standard Animal
|title = Gallagher
|image =gallagher-hockey.jpg
|full_name = Gallager Golden
|birth = November 13, 1994&lt;br&gt;Boston, Massachusetts, U.S.
|age = {{Age|1994|11|13}}
|gender = Male
|nationality = American
|occupation = Professional Hockey Player
}}
== Gallagher (Golden Retriever) ==
[[File:Gallagher the Golden Retriever.jpg|thumb|Gallagher, the talented golden retriever]]
'''Gallagher''' is a golden retriever who gained widespread attention as a professional hockey player for the Boston Bruins during the 1997-1998 season. This remarkable canine athlete captivated audiences with its exceptional skills on the ice and became an icon in the world of sports.
== Career ==
Gallagher's journey to becoming a professional hockey player began when it was discovered by Bruins' scouts during a charity event. Impressed by its agility, speed, and natural talent for puck handling, the team decided to sign Gallagher as an honorary member.
During the 1997-1998 season, Gallagher proved to be a remarkable asset to the Bruins. Although unconventional, its ability to navigate the rink, steal the puck, and assist in scoring goals astonished both teammates and opponents alike. Despite the challenges of being a dog in a human-dominated sport, Gallagher's determination and love for the game were unmatched.
== Statistics ==
Throughout the 1997-1998 season, Gallagher's statistics were nothing short of extraordinary:
- Goals: 25
- Assists: 40
- Penalties: 10
These impressive numbers placed Gallagher among the league's top performers and solidified its status as an unforgettable player.
== Legacy ==
Gallagher's incredible achievements as a golden retriever hockey player left an indelible mark on the sport. It inspired a generation of fans and players alike, proving that dedication, teamwork, and a little bit of canine charm can overcome any obstacle.
While Gallagher's hockey career was limited to a single season, its impact transcended the game. The dog's popularity skyrocketed, leading to numerous endorsements, appearances in movies and commercials, and even a line of dog-themed hockey merchandise.
== References ==
{{Reflist}}
[[Category: Animals]]
</text>
<sha1>qa5cny9y86ftfuyyuoxevpe0u8f03lw</sha1>
</revision>
</page>
<page>
<title>Sir Whiskers Whiskerington Whiskey Wiskerton III</title>
<ns>0</ns>
<id>101</id>
<redirect title="Whiskers" />
<revision>
<id>455</id>
<timestamp>2018-10-17T04:10:06Z</timestamp>
<contributor>
<username>Test User</username>
<id>35032394</id>
</contributor>
<comment>User moved page [[Sir Whiskers Whiskerington Whiskey Wiskerton III]] to [[Whiskers]]</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<text bytes="22" sha1="e854tjjb9a233fsdfs6bp1ve4v8g2qnkma" xml:space="preserve">#REDIRECT [[Whiskers]]</text>
<sha1>e854tjjb9a233fsdfs6bp1ve4v8g2qnkma</sha1>
</revision>
</page>
</mediawiki>

@ -0,0 +1,48 @@
from pathlib import Path
import pytest
from langchain.document_loaders.mediawikidump import MWDumpLoader
PARENT_DIR = Path(__file__).parent / "sample_documents"
@pytest.mark.requires("mwparserfromhell", "mwxml")
def test_loading_flawed_xml() -> None:
    """With the default stop_on_error=True, a malformed page aborts loading."""
    dump_path = (PARENT_DIR / "mwtest_current_pages.xml").absolute()
    loader = MWDumpLoader(dump_path)
    with pytest.raises(TypeError):
        loader.load()
@pytest.mark.requires("mwparserfromhell", "mwxml")
def test_skipping_errors() -> None:
    """stop_on_error=False skips the malformed page, leaving three documents."""
    dump_path = (PARENT_DIR / "mwtest_current_pages.xml").absolute()
    documents = MWDumpLoader(file_path=dump_path, stop_on_error=False).load()
    assert len(documents) == 3
@pytest.mark.requires("mwparserfromhell", "mwxml")
def test_skipping_redirects() -> None:
    """skip_redirects=True drops the redirect page as well, leaving two docs."""
    dump_path = (PARENT_DIR / "mwtest_current_pages.xml").absolute()
    documents = MWDumpLoader(
        file_path=dump_path,
        skip_redirects=True,
        stop_on_error=False,
    ).load()
    assert len(documents) == 2
@pytest.mark.requires("mwparserfromhell", "mwxml")
def test_multiple_namespaces() -> None:
    """Restricting to namespaces 0 and 6 yields the two loadable pages."""
    loader = MWDumpLoader(
        file_path=(PARENT_DIR / "mwtest_current_pages.xml").absolute(),
        namespaces=[0, 6],
        skip_redirects=True,
        stop_on_error=False,
    )
    documents = loader.load()
    # Removed the leftover debug line `[print(doc) for doc in documents]`:
    # a comprehension used purely for side effects that spams test output.
    assert len(documents) == 2
Loading…
Cancel
Save