You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
wikiteam/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py

128 lines
4.6 KiB
Python

from lxml import etree
from lxml.builder import E
from wikiteam3.dumpgenerator.exceptions import PageMissingError
def makeXmlPageFromRaw(xml, arvcontinue) -> str:
"""Discard the metadata around a <page> element in <mediawiki> string"""
root = etree.XML(xml)
find = etree.XPath("//*[local-name() = 'page']")
page = find(root)[0]
if arvcontinue is not None:
page.attrib["arvcontinue"] = arvcontinue
# The tag will inherit the namespace, like:
# <page xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
# FIXME: pretty_print doesn't seem to work, only adds a newline
return etree.tostring(page, pretty_print=True, encoding="unicode")
def makeXmlFromPage(page: dict, arvcontinue) -> str:
"""Output an XML document as a string from a page as in the API JSON"""
try:
p = E.page(
E.title(str(page["title"])),
E.ns(str(page["ns"])),
E.id(str(page["pageid"])),
)
if arvcontinue is not None:
p.attrib["arvcontinue"] = arvcontinue
for rev in page["revisions"]:
# Older releases like MediaWiki 1.16 do not return all fields.
userid = rev["userid"] if "userid" in rev else 0
size = rev["size"] if "size" in rev else 0
# Create rev object
revision = [
E.id(str(rev["revid"])),
E.timestamp(rev["timestamp"]),
]
# The text, user, comment, sha1 may be deleted/suppressed
if ("texthidden" in rev) or ("textmissing" in rev):
print(
"Warning: text missing/hidden in pageid %d revid %d"
% (page["pageid"], rev["revid"])
)
revision.append(
E.text(
**{
"bytes": str(size),
"deleted": "deleted",
}
)
)
else:
text = str(rev["*"])
revision.append(
E.text(
text,
**{
"bytes": str(size),
"{http://www.w3.org/XML/1998/namespace}space": "preserve",
}
)
)
if "user" not in rev:
if "userhidden" not in rev:
print(
"Warning: user not hidden but missing user in pageid %d revid %d"
% (page["pageid"], rev["revid"])
)
revision.append(E.contributor(deleted="deleted"))
else:
revision.append(
E.contributor(
E.username(str(rev["user"])),
E.id(str(userid)),
)
)
if "sha1" in rev:
revision.append(E.sha1(rev["sha1"]))
elif "sha1hidden" in rev:
revision.append(E.sha1()) # stub
if "commenthidden" in rev:
revision.append(E.comment(deleted="deleted"))
elif "comment" in rev and rev["comment"]:
revision.append(E.comment(str(rev["comment"])))
if "contentmodel" in rev:
revision.append(E.model(rev["contentmodel"]))
if "contentformat" in rev:
revision.append(E.format(rev["contentformat"]))
# Sometimes a missing parentid is not replaced with a 0 as it should.
if "parentid" in rev:
revision.append(E.parentid(str(rev["parentid"])))
if "minor" in rev:
revision.append(E.minor())
# mwcli's dump.xml order
revisionTags = [
"id",
"parentid",
"timestamp",
"contributor",
"minor",
"comment",
"origin",
"model",
"format",
"text",
"sha1",
]
revisionElementsDict = {elem.tag: elem for elem in revision}
_revision = E.revision()
for tag in revisionTags:
if tag in revisionElementsDict:
_revision.append(revisionElementsDict.pop(tag))
for elem in revisionElementsDict.values():
_revision.append(elem)
p.append(_revision)
except KeyError as e:
print(e)
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")