Harrison/media wiki xml (#4072)

Co-authored-by: Géraud de Drouas <gdedrouas@users.noreply.github.com>
1 year ago · 5a269d3175
parent c186f18aab
commit 5a269d3175
4 changed files with 4939 additions and 0 deletions
--- a/docs/modules/indexes/document_loaders/examples/example_data/testmw_pages_current.xml
+++ b/docs/modules/indexes/document_loaders/examples/example_data/testmw_pages_current.xml
--- a/docs/modules/indexes/document_loaders/examples/mediawikidump.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/mediawikidump.ipynb
@ -0,0 +1,122 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MediaWikiDump\n",
+    "\n",
+    "This covers how to load a MediaWiki XML dump file into a document format that we can use downstream.\n",
+    "\n",
+    "It uses mwxml from mediawiki-utilities to dump and mwparserfromhell from earwig to parse MediaWiki wikicode.\n",
+    "\n",
+    "Dump files can be obtained with dumpBackup.php or on the Special:Statistics page of the Wiki."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "IXigDil0pANf"
+   },
+   "outputs": [],
+   "source": [
+    "#mediawiki-utilities supports XML schema 0.11 in unmerged branches\n",
+    "!pip install -qU git+https://github.com/mediawiki-utilities/python-mwtypes@updates_schema_0.11\n",
+    "#mediawiki-utilities mwxml has a bug, fix PR pending\n",
+    "!pip install -qU git+https://github.com/gdedrouas/python-mwxml@xml_format_0.11\n",
+    "!pip install -qU mwparserfromhell"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "8-vB5XGHsE85"
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import MWDumpLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "id": "i6e42MSkqEeH"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "You have 177 document(s) in your data \n"
+     ]
+    }
+   ],
+   "source": [
+    "loader = MWDumpLoader(\"example_data/testmw_pages_current.xml\", encoding=\"utf8\")\n",
+    "documents = loader.load()\n",
+    "print (f'You have {len(documents)} document(s) in your data ')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "id": "C2qbBVrjFK_H"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='\\t\\n\\t\\n\\tArtist\\n\\tReleased\\n\\tRecorded\\n\\tLength\\n\\tLabel\\n\\tProducer', metadata={'source': 'Album'}),\n",
+       " Document(page_content='{| class=\"article-table plainlinks\" style=\"width:100%;\"\\n|- style=\"font-size:18px;\"\\n! style=\"padding:0px;\" | Template documentation\\n|-\\n| Note: portions of the template sample may not be visible without values provided.\\n|-\\n| View or edit this documentation. (About template documentation)\\n|-\\n| Editors can experiment in this template\\'s [ sandbox] and [ test case] pages.\\n|}Category:Documentation templates', metadata={'source': 'Documentation'}),\n",
+       " Document(page_content='Description\\nThis template is used to insert descriptions on template pages.\\n\\nSyntax\\nAdd <noinclude></noinclude> at the end of the template page.\\n\\nAdd <noinclude></noinclude> to transclude an alternative page from the /doc subpage.\\n\\nUsage\\n\\nOn the Template page\\nThis is the normal format when used:\\n\\nTEMPLATE CODE\\n<includeonly>Any categories to be inserted into articles by the template</includeonly>\\n<noinclude>{{Documentation}}</noinclude>\\n\\nIf your template is not a completed div or table, you may need to close the tags just before {{Documentation}} is inserted (within the noinclude tags).\\n\\nA line break right before {{Documentation}} can also be useful as it helps prevent the documentation template \"running into\" previous code.\\n\\nOn the documentation page\\nThe documentation page is usually located on the /doc subpage for a template, but a different page can be specified with the first parameter of the template (see Syntax).\\n\\nNormally, you will want to write something like the following on the documentation page:\\n\\n==Description==\\nThis template is used to do something.\\n\\n==Syntax==\\nType <code>{{t|templatename}}</code> somewhere.\\n\\n==Samples==\\n<code><nowiki>{{templatename|input}}</nowiki></code> \\n\\nresults in...\\n\\n{{templatename|input}}\\n\\n<includeonly>Any categories for the template itself</includeonly>\\n<noinclude>[[Category:Template documentation]]</noinclude>\\n\\nUse any or all of the above description/syntax/sample output sections. You may also want to add \"see also\" or other sections.\\n\\nNote that the above example also uses the Template:T template.\\n\\nCategory:Documentation templatesCategory:Template documentation', metadata={'source': 'Documentation/doc'}),\n",
+       " Document(page_content='Description\\nA template link with a variable number of parameters (0-20).\\n\\nSyntax\\n \\n\\nSource\\nImproved version not needing t/piece subtemplate developed on Templates wiki see the list of authors. Copied here via CC-By-SA 3.0 license.\\n\\nExample\\n\\nCategory:General wiki templates\\nCategory:Template documentation', metadata={'source': 'T/doc'}),\n",
+       " Document(page_content='\\t\\n\\t\\t    \\n\\t\\n\\t\\t    Aliases\\n\\t    Relatives\\n\\t    Affiliation\\n        Occupation\\n    \\n            Biographical information\\n        Marital status\\n    \\tDate of birth\\n        Place of birth\\n        Date of death\\n        Place of death\\n    \\n            Physical description\\n        Species\\n        Gender\\n        Height\\n        Weight\\n        Eye color\\n\\t\\n           Appearances\\n       Portrayed by\\n       Appears in\\n       Debut\\n    ', metadata={'source': 'Character'})]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "documents[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
--- a/langchain/document_loaders/init.py
+++ b/langchain/document_loaders/init.py
@ -46,6 +46,7 @@ from langchain.document_loaders.image import UnstructuredImageLoader
 from langchain.document_loaders.image_captions import ImageCaptionLoader
 from langchain.document_loaders.imsdb import IMSDbLoader
 from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
+from langchain.document_loaders.mediawikidump import MWDumpLoader
 from langchain.document_loaders.modern_treasury import ModernTreasuryLoader
 from langchain.document_loaders.notebook import NotebookLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
@ -142,6 +143,7 @@ __all__ = [
    "IMSDbLoader",
    "ImageCaptionLoader",
    "ModernTreasuryLoader",
+    "MWDumpLoader",
    "NotebookLoader",
    "NotionDBLoader",
    "NotionDirectoryLoader",
--- a/langchain/document_loaders/mediawikidump.py
+++ b/langchain/document_loaders/mediawikidump.py
@ -0,0 +1,57 @@
+"""Load Data from a MediaWiki dump xml."""
+from typing import List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class MWDumpLoader(BaseLoader):
+    """
+    Load MediaWiki dump from XML file
+    Example:
+        .. code-block:: python
+
+            from langchain.document_loaders import MWDumpLoader
+
+            loader = MWDumpLoader(
+                file_path="myWiki.xml",
+                encoding="utf8"
+            )
+            docs = loader.load()
+            from langchain.text_splitter import RecursiveCharacterTextSplitter
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=1000, chunk_overlap=0
+            )
+            texts = text_splitter.split_documents(docs)
+
+
+    :param file_path: XML local file path
+    :type file_path: str
+    :param encoding: Charset encoding, defaults to "utf8"
+    :type encoding: str, optional
+    """
+
+    def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
+        """Initialize with file path."""
+        self.file_path = file_path
+        self.encoding = encoding
+
+    def load(self) -> List[Document]:
+        """Load from file path."""
+        import mwparserfromhell
+        import mwxml
+
+        dump = mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
+
+        docs = []
+
+        for page in dump.pages:
+            for revision in page:
+                code = mwparserfromhell.parse(revision.text)
+                text = code.strip_code(
+                    normalize=True, collapse=True, keep_template_params=False
+                )
+                metadata = {"source": page.title}
+                docs.append(Document(page_content=text, metadata=metadata))
+
+        return docs