Refactor XML dump related code and improve xmlrevision logic (#103)

pull/475/head
NyaMisty 1 year ago committed by GitHub
parent 0fb53ffdde
commit be983b0814

@@ -3,9 +3,10 @@ import sys
from urllib.parse import urlparse
import mwclient
from file_read_backwards import FileReadBackwards
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.dump.xmlrev.namespaces import getNamespacesAPI, getNamespacesScraper
from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI, getNamespacesScraper
from wikiteam3.utils import domain2prefix, cleanHTML, undoHTMLEntities
from wikiteam3.dumpgenerator.config import Config
@@ -208,9 +209,32 @@ def getPageTitles(config: Config=None, session=None):
print("%d page titles loaded" % (c))
return titlesfilename
def checkTitleOk(config: Config=None, ):
try:
with FileReadBackwards(
"%s/%s-%s-titles.txt"
% (
config.path,
domain2prefix(config=config),
config.date,
),
encoding="utf-8",
) as frb:
lasttitle = frb.readline().strip()
if lasttitle == "":
lasttitle = frb.readline().strip()
except:
lasttitle = "" # probably file does not exists
def readTitles(config: Config=None, start=None, batch=False):
if lasttitle != "--END--":
return False
return True
def readTitles(config: Config=None, session=None, start=None, batch=False):
"""Read title list from a file, from the title "start" """
if not checkTitleOk(config):
getPageTitles(config=config)
titlesfilename = "{}-{}-titles.txt".format(
domain2prefix(config=config), config.date

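For reference, the new checkTitleOk helper simply verifies that the last non-empty line of the titles file is the --END-- sentinel, so an interrupted title listing gets regenerated instead of being resumed half-finished. A minimal standalone sketch of the same check, with a hypothetical file name in place of the Config-derived path:

from file_read_backwards import FileReadBackwards

def titles_file_complete(path: str) -> bool:
    # Read the file backwards; the last non-empty line must be the end marker.
    try:
        with FileReadBackwards(path, encoding="utf-8") as frb:
            last = frb.readline().strip()
            if last == "":
                last = frb.readline().strip()
    except Exception:
        return False  # the file probably does not exist yet
    return last == "--END--"

# Hypothetical usage, mirroring readTitles above:
# if not titles_file_complete("wiki.example.org-20230101-titles.txt"):
#     getPageTitles(config=config)
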
@@ -78,6 +78,11 @@ def getArgumentParser():
action="store_true",
help="download all revisions from an API generator. MediaWiki 1.27+ only.",
)
groupDownload.add_argument(
"--xmlrevisions_page",
action="store_true",
help="download all revisions from an API generator, but query page by page MediaWiki 1.27+ only.",
)
groupDownload.add_argument(
"--images", action="store_true", help="generates an image dump"
)
@@ -91,6 +96,9 @@ def getArgumentParser():
metavar="1,2,3",
help="comma-separated value of namespaces to exclude",
)
parser.add_argument(
"--api_chunksize", metavar="50", default=50, help="Chunk size for MediaWiki API (arvlimit, ailimit, etc.)"
)
# Meta info params
groupMeta = parser.add_argument_group(
@@ -141,7 +149,9 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
# Courtesy datashaman https://stackoverflow.com/a/35504626
__retries__ = Retry(
total=int(args.retries), backoff_factor=2, status_forcelist=[500, 502, 503, 504, 429]
total=int(args.retries), backoff_factor=2,
status_forcelist=[500, 502, 503, 504, 429],
allowed_methods=['DELETE', 'PUT', 'GET', 'OPTIONS', 'TRACE', 'HEAD', 'POST']
)
session.mount("https://", HTTPAdapter(max_retries=__retries__))
session.mount("http://", HTTPAdapter(max_retries=__retries__))
@@ -288,11 +298,13 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
"api": api,
"failfast": args.failfast,
"http_method": "POST",
"api_chunksize": args.api_chunksize,
"index": index,
"images": args.images,
"logs": False,
"xml": args.xml,
"xmlrevisions": args.xmlrevisions,
"xmlrevisions": args.xmlrevisions or args.xmlrevisions_page,
"xmlrevisions_page": args.xmlrevisions_page,
"namespaces": namespaces,
"exnamespaces": exnamespaces,
"path": args.path and os.path.normpath(args.path) or "",

@@ -53,10 +53,12 @@ class Config:
xml: bool = False
curonly: bool = False
xmlrevisions: bool = False
xmlrevisions_page: bool = False
images: bool = False
namespaces: List[int] = None
exnamespaces: List[int] = None
api_chunksize: int = 0 # arvlimit, ailimit, etc
export: str = '' # Special:Export page name
http_method: str = ''

@@ -30,14 +30,13 @@ from wikiteam3.utils import truncateFilename
from wikiteam3.utils import undoHTMLEntities
from wikiteam3.utils import avoidWikimediaProjects
from .page.image import Image
from .misc.index_php import saveIndexPHP
from .misc.special_logs import saveLogs
from .misc.special_version import saveSpecialVersion
from .page.page_titles import getPageTitles, readTitles
from .misc.site_info import saveSiteInfo
from .xmlrev.xml_dump import generateXMLDump
from .xmlrev.xml_integrity import checkXMLIntegrity
from wikiteam3.dumpgenerator.dump.image.image import Image
from wikiteam3.dumpgenerator.dump.misc.index_php import saveIndexPHP
from wikiteam3.dumpgenerator.dump.misc.special_logs import saveLogs
from wikiteam3.dumpgenerator.dump.misc.special_version import saveSpecialVersion
from wikiteam3.dumpgenerator.dump.misc.site_info import saveSiteInfo
from wikiteam3.dumpgenerator.dump.xmldump.xml_dump import generateXMLDump
from wikiteam3.dumpgenerator.dump.xmldump.xml_integrity import checkXMLIntegrity
# From https://stackoverflow.com/a/57008707
class Tee(object):
@@ -122,13 +121,12 @@ class DumpGenerator:
@staticmethod
def createNewDump(config: Config=None, other: Dict=None):
# we do lazy title dumping here :)
images = []
print("Trying generating a new dump into a new directory...")
if config.xml:
getPageTitles(config=config, session=other["session"])
titles = readTitles(config)
generateXMLDump(config=config, titles=titles, session=other["session"])
checkXMLIntegrity(config=config, titles=titles, session=other["session"])
generateXMLDump(config=config, session=other["session"])
checkXMLIntegrity(config=config, session=other["session"])
if config.images:
images += Image.getImageNames(config=config, session=other["session"])
Image.saveImageNames(config=config, images=images, session=other["session"])
@@ -143,34 +141,11 @@ class DumpGenerator:
images = []
print("Resuming previous dump process...")
if config.xml:
titles = readTitles(config)
try:
with FileReadBackwards(
"%s/%s-%s-titles.txt"
% (
config.path,
domain2prefix(config=config, session=other["session"]),
config.date,
),
encoding="utf-8",
) as frb:
lasttitle = frb.readline().strip()
if lasttitle == "":
lasttitle = frb.readline().strip()
except:
lasttitle = "" # probably file does not exists
if lasttitle == "--END--":
# titles list is complete
print("Title list was completed in the previous session")
else:
print("Title list is incomplete. Reloading...")
# do not resume, reload, to avoid inconsistences, deleted pages or
# so
getPageTitles(config=config, session=other["session"])
# checking xml dump
xmliscomplete = False
lastxmltitle = None
lastxmlrevid = None
try:
with FileReadBackwards(
"%s/%s-%s-%s.xml"
@@ -188,10 +163,14 @@ class DumpGenerator:
xmliscomplete = True
break
xmlrevid = re.search(r" <id>([^<]+)</id>", l)
if xmlrevid:
lastxmlrevid = int(xmlrevid.group(1))
xmltitle = re.search(r"<title>([^<]+)</title>", l)
if xmltitle:
lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
break
except:
pass # probably file does not exists
@@ -199,19 +178,16 @@ class DumpGenerator:
print("XML dump was completed in the previous session")
elif lastxmltitle:
# resuming...
print('Resuming XML dump from "%s"' % (lastxmltitle))
titles = readTitles(config, start=lastxmltitle)
print('Resuming XML dump from "%s" (revision id %s)' % (lastxmltitle, lastxmlrevid))
generateXMLDump(
config=config,
titles=titles,
start=lastxmltitle,
session=other["session"],
resume=True,
)
else:
# corrupt? only has XML header?
print("XML is corrupt? Regenerating...")
titles = readTitles(config)
generateXMLDump(config=config, titles=titles, session=other["session"])
generateXMLDump(config=config, session=other["session"])
if config.images:
# load images list

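The resume probe above now records the last revision <id> alongside the last <title> when scanning the partial XML dump backwards. A condensed sketch of that scan, assuming the dump ends with </mediawiki> only when it is complete and using a hypothetical file path (the real code also runs the title through undoHTMLEntities):

import re
from file_read_backwards import FileReadBackwards

def last_dump_position(xml_path: str):
    complete, last_title, last_revid = False, None, None
    try:
        with FileReadBackwards(xml_path, encoding="utf-8") as frb:
            for line in frb:
                if "</mediawiki>" in line:
                    complete = True  # footer reached: nothing to resume
                    break
                revid = re.search(r" <id>([^<]+)</id>", line)
                if revid:
                    last_revid = int(revid.group(1))
                title = re.search(r"<title>([^<]+)</title>", line)
                if title:
                    last_title = title.group(1)
                    break
    except FileNotFoundError:
        pass  # no previous dump yet
    return complete, last_title, last_revid

# complete, title, revid = last_dump_position("wiki.example.org-20230101-history.xml")
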
@@ -6,11 +6,11 @@ from typing import *
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSha1Error, FileSizeError
from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSizeError
from wikiteam3.dumpgenerator.api import getJSON
from wikiteam3.dumpgenerator.api import handleStatusCode
from wikiteam3.dumpgenerator.log import logerror
from .page_xml import getXMLPage
from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
from wikiteam3.utils import truncateFilename, sha1File
from wikiteam3.utils import cleanHTML, undoHTMLEntities
from wikiteam3.dumpgenerator.config import Config
@@ -317,7 +317,7 @@ class Image:
"aiprop": "url|user|size|sha1",
"aifrom": aifrom,
"format": "json",
"ailimit": 50,
"ailimit": config.api_chunksize,
}
# FIXME Handle HTTP Errors HERE
r = session.get(url=config.api, params=params, timeout=30)
@@ -398,7 +398,7 @@ class Image:
"action": "query",
"generator": "allpages",
"gapnamespace": 6,
"gaplimit": 50, # The value must be between 1 and 500.
"gaplimit": config.api_chunksize, # The value must be between 1 and 500.
# TODO: Is it OK to set it higher, for speed?
"gapfrom": gapfrom,
"prop": "imageinfo",

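Both image listings now take their batch size from config.api_chunksize instead of a hard-coded 50. For context, a bare-bones allimages request with the same parameters looks roughly like this (hypothetical wiki URL, continuation handling omitted):

import requests

session = requests.Session()
params = {
    "action": "query",
    "list": "allimages",
    "aiprop": "url|user|size|sha1",
    "aifrom": "!",
    "ailimit": 50,  # config.api_chunksize in the code above
    "format": "json",
}
r = session.get(url="https://wiki.example.org/w/api.php", params=params, timeout=30)
for image in r.json().get("query", {}).get("allimages", []):
    print(image["name"], image["url"])
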
@@ -4,8 +4,6 @@ import sys
import time
import requests
from lxml import etree
from lxml.builder import E
from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingError
from wikiteam3.dumpgenerator.api import handleStatusCode
@@ -196,69 +194,6 @@ def getXMLPage(config: Config=None, title="", verbose=True, session=None):
else:
uprint(" %s, %d edits" % (title.strip(), edit_count))
def makeXmlPageFromRaw(xml) -> str:
"""Discard the metadata around a <page> element in <mediawiki> string"""
root = etree.XML(xml)
find = etree.XPath("//*[local-name() = 'page']")
# The tag will inherit the namespace, like:
# <page xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
# FIXME: pretty_print doesn't seem to work, only adds a newline
return etree.tostring(find(root)[0], pretty_print=True, encoding="unicode")
def makeXmlFromPage(page: dict) -> str:
"""Output an XML document as a string from a page as in the API JSON"""
try:
p = E.page(
E.title(str(page["title"])),
E.ns(str(page["ns"])),
E.id(str(page["pageid"])),
)
for rev in page["revisions"]:
# Older releases like MediaWiki 1.16 do not return all fields.
if "userid" in rev:
userid = rev["userid"]
else:
userid = 0
if "size" in rev:
size = rev["size"]
else:
size = 0
text_element = E.text(str(rev["*"]), bytes=str(size))
text_element.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
revision = E.revision(
E.id(str(rev["revid"])),
E.timestamp(rev["timestamp"]),
text_element,
)
# The username may be deleted/suppressed
if "user" in rev:
revision.append(
E.contributor(
E.username(str(rev["user"])),
E.id(str(userid)),
)
)
else:
revision.append(E.contributor(deleted="deleted"))
if "comment" in rev and rev["comment"]:
revision.append(E.comment(str(rev["comment"])))
if "contentmodel" in rev:
revision.append(E.model(rev["contentmodel"]))
# Sometimes a missing parentid is not replaced with a 0 as it should.
if "parentid" in rev:
revision.append(E.parentid(str(rev["parentid"])))
# The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
if "sha1" in rev:
revision.append(E.sha1(rev["sha1"]))
p.append(revision)
except KeyError as e:
print(e)
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")
def fixBOM(request):
"""Strip Unicode BOM"""
if request.text.startswith("\ufeff"):

@@ -0,0 +1,398 @@
from datetime import datetime
from typing import *
import sys
import time
from urllib.parse import urlparse
import lxml.etree
import mwclient
import requests
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.api.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import makeXmlFromPage, makeXmlPageFromRaw
from wikiteam3.dumpgenerator.config import Config
def getXMLRevisionsByAllRevisions(config: Config=None, session=None, site: mwclient.Site=None, nscontinue=None, arvcontinue=None):
if "all" not in config.namespaces:
namespaces = config.namespaces
else:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
_nscontinue = nscontinue
_arvcontinue = arvcontinue
for namespace in namespaces:
if _nscontinue is not None:
if namespace != _nscontinue:
print("Skipping already exported namespace: %d" % namespace)
continue
_nscontinue = None
print("Trying to export all revisions from namespace %s" % namespace)
# arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
arvparams = {
"action": "query",
"list": "allrevisions",
"arvlimit": config.api_chunksize,
"arvnamespace": namespace,
}
if _arvcontinue is not None:
arvparams['arvcontinue'] = _arvcontinue
if not config.curonly:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams[
"arvprop"
] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content"
print(
"Trying to get wikitext from the allrevisions API and to build the XML"
)
while True:
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
else:
raise
except requests.exceptions.ReadTimeout as err:
# Hopefully temporary, just wait a bit and continue with the same request.
# No point putting a limit to retries, we'd need to abort everything.
# TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
# to use the retry adapter we use for our own requests session?
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
continue
for page in arvrequest["query"]["allrevisions"]:
yield makeXmlFromPage(page, arvparams.get("arvcontinue", ""))
if "continue" in arvrequest:
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
else:
# End of continuation. We are done with this namespace.
break
else:
# FIXME: this is not curonly, just different strategy to do all revisions
# Just cycle through revision IDs and use the XML as is
print("Trying to list the revisions and to export them one by one")
# We only need the revision ID, all the rest will come from the raw export
arvparams["arvprop"] = "ids"
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
else:
raise
exportparams = {
"action": "query",
"export": "1",
}
# Skip the namespace if it's empty
if len(arvrequest["query"]["allrevisions"]) < 1:
continue
# Repeat the arvrequest with new arvparams until done
while True:
# Reset revision IDs from the previous batch from arv
revids = []
for page in arvrequest["query"]["allrevisions"]:
for revision in page["revisions"]:
revids.append(str(revision["revid"]))
print(
" %d more revisions listed, until %s"
% (len(revids), revids[-1])
)
# We can now get the XML for one revision at a time
# FIXME: we can actually get them in batches as we used to
# but need to figure out the continuation and avoid that the API
# chooses to give us only the latest for each page
for revid in revids:
exportparams["revids"] = revid
try:
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print(
"POST request to the API failed, retrying with GET"
)
config.http_method = "GET"
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
else:
raise
# This gives us a self-standing <mediawiki> element
# but we only need the inner <page>: we can live with
# duplication and non-ordering of page titles, but the
# repeated header is confusing and would not even be valid
xml = exportrequest["query"]["export"]["*"] # type(xml) == str
yield makeXmlPageFromRaw(xml, arvparams.get("arvcontinue", ""))
if "continue" in arvrequest:
# Get the new ones
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print(
"POST request to the API failed, retrying with GET"
)
config.http_method = "GET"
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.ReadTimeout as err:
# As above
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
# But avoid rewriting the same revisions
arvrequest["query"]["allrevisions"] = []
continue
else:
# End of continuation. We are done with this namespace.
break
def getXMLRevisionsByTitles(config: Config=None, session=None, site: mwclient.Site=None, start=None):
if config.curonly:
# The raw XML export in the API gets a title and gives the latest revision.
# We could also use the allpages API as generator but let's be consistent.
print("Getting titles to export the latest revision for each")
c = 0
for title in readTitles(config, session=session, start=start):
# TODO: respect verbose flag, reuse output from getXMLPage
print(f" {title}")
# TODO: as we're doing one page and revision at a time, we might
# as well use xml format and exportnowrap=1 to use the string of,
# XML as is, but need to check how well the library handles it.
exportparams = {
"action": "query",
"titles": title,
"export": "1",
}
try:
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
else:
raise
xml = str(exportrequest["query"]["export"]["*"])
c += 1
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
# Because we got the fancy XML from the JSON format, clean it:
yield makeXmlPageFromRaw(xml, None)
else:
# This is the closest to what we usually do with Special:Export:
# take one title at a time and try to get all revisions exported.
# It differs from the allrevisions method because it actually needs
# to be input the page titles; otherwise, the requests are similar.
# The XML needs to be made manually because the export=1 option
# refuses to return an arbitrary number of revisions (see above).
print("Getting titles to export all the revisions of each")
c = 0
titlelist = []
# TODO: Decide a suitable size for batched requests. Careful:
# batched responses may not return all revisions.
for titlelist in readTitles(config, session=session, start=start, batch=False):
if type(titlelist) is not list:
titlelist = [titlelist]
for title in titlelist:
print(f" {title}")
# Try and ask everything. At least on MediaWiki 1.16, unknown props are discarded:
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
pparams = {
"action": "query",
"titles": "|".join(titlelist),
"prop": "revisions",
# 'rvlimit': 50,
"rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content",
}
try:
prequest = site.api(http_method=config.http_method, **pparams)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
prequest = site.api(
http_method=config.http_method, **pparams
)
else:
raise
except mwclient.errors.InvalidResponse:
logerror(
config=config, to_stdout=True,
text="Error: page inaccessible? Could not export page: %s"
% ("; ".join(titlelist)),
)
continue
# Be ready to iterate if there is continuation.
while True:
# Get the revision data returned by the API: prequest is the initial request
# or the new one after continuation at the bottom of this while loop.
# The array is called "pages" even if there's only one.
try:
pages = prequest["query"]["pages"]
except KeyError:
logerror(
config=config, to_stdout=True,
text="Error: page inaccessible? Could not export page: %s"
% ("; ".join(titlelist)),
)
break
# Go through the data we got to build the XML.
for pageid in pages:
try:
xml = makeXmlFromPage(pages[pageid], None)
yield xml
except PageMissingError:
logerror(
config=config, to_stdout=True,
text="Error: empty revision from API. Could not export page: %s"
% ("; ".join(titlelist)),
)
continue
# Get next batch of revisions if there's more.
if "continue" in prequest.keys():
print("Getting more revisions for the page")
for key, value in prequest["continue"]:
pparams[key] = value
elif "query-continue" in prequest.keys():
rvstartid = prequest["query-continue"]["revisions"]["rvstartid"]
pparams["rvstartid"] = rvstartid
else:
break
try:
prequest = site.api(
http_method=config.http_method, **pparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
prequest = site.api(
http_method=config.http_method, **pparams
)
# We're done iterating for this title or titles.
c += len(titlelist)
# Reset for the next batch.
titlelist = []
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
def getXMLRevisions(config: Config=None, session=None, useAllrevision=True, lastPage=None):
# FIXME: actually figure out the various strategies for each MediaWiki version
apiurl = urlparse(config.api)
# FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
# https://github.com/WikiTeam/wikiteam/issues/358
site = mwclient.Site(
apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session
)
if useAllrevision:
# Find last title
if lastPage:
try:
lastNs = int(lastPage.find('ns').text)
if False:
lastRevision = lastPage.find('revision')
lastTimestamp = lastRevision.find('timestamp').text
lastRevid = int(lastRevision.find('id').text)
lastDatetime = datetime.fromisoformat(lastTimestamp.rstrip('Z'))
lastArvcontinue = lastDatetime.strftime("%Y%m%d%H%M%S") + '|' + str(lastRevid)
else:
lastArvcontinue = lastPage.attrib['arvcontinue']
except Exception:
print("Failed to find title in last trunk XML: %s" % (lxml.etree.tostring(lastPage)))
raise
nscontinue = lastNs
arvcontinue = lastArvcontinue
if not arvcontinue:
arvcontinue = None
else:
nscontinue = None
arvcontinue = None
try:
return getXMLRevisionsByAllRevisions(config, session, site, nscontinue, arvcontinue)
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
print("Warning. Could not use allrevisions. Wiki too old? Try to use --xmlrevisions_page")
sys.exit()
else:
# Find last title
if lastPage:
try:
start = lastPage.find('title')
except Exception:
print("Failed to find title in last trunk XML: %s" % (lxml.etree.tostring(lastPage)))
raise
else:
start = None
try:
# # Uncomment these lines to raise a KeyError for testing
# raise KeyError(999999)
# # DO NOT UNCOMMENT IN RELEASE
return getXMLRevisionsByTitles(config, session, site, start)
except mwclient.errors.MwClientError as e:
print(e)
print("This mwclient version seems not to work for us. Exiting.")
sys.exit()

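The core of getXMLRevisionsByAllRevisions is the arvcontinue loop: each allrevisions batch is converted into <page> chunks stamped with the current arvcontinue token, and the token from the API's continue block drives the next request. A condensed sketch against a hypothetical wiki, using GET only and skipping the error handling shown above:

import mwclient

site = mwclient.Site("wiki.example.org", path="/w/")  # hypothetical wiki
arvparams = {
    "action": "query",
    "list": "allrevisions",
    "arvnamespace": 0,
    "arvlimit": 50,  # config.api_chunksize
    "arvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content",
}
while True:
    result = site.api(http_method="GET", **arvparams)
    for page in result["query"]["allrevisions"]:
        pass  # the real code yields makeXmlFromPage(page, arvparams.get("arvcontinue", ""))
    if "continue" in result:
        # The same token is written onto the emitted <page arvcontinue="..."> elements,
        # which is what later makes the dump resumable.
        arvparams["arvcontinue"] = result["continue"]["arvcontinue"]
    else:
        break  # namespace finished
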
@@ -0,0 +1,104 @@
from lxml import etree
from lxml.builder import E
from wikiteam3.dumpgenerator.exceptions import PageMissingError
def makeXmlPageFromRaw(xml, arvcontinue) -> str:
"""Discard the metadata around a <page> element in <mediawiki> string"""
root = etree.XML(xml)
find = etree.XPath("//*[local-name() = 'page']")
page = find(root)[0]
if arvcontinue is not None:
page.attrib['arvcontinue'] = arvcontinue
# The tag will inherit the namespace, like:
# <page xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
# FIXME: pretty_print doesn't seem to work, only adds a newline
return etree.tostring(page, pretty_print=True, encoding="unicode")
def makeXmlFromPage(page: dict, arvcontinue) -> str:
"""Output an XML document as a string from a page as in the API JSON"""
try:
p = E.page(
E.title(str(page["title"])),
E.ns(str(page["ns"])),
E.id(str(page["pageid"])),
)
if arvcontinue is not None:
p.attrib['arvcontinue'] = arvcontinue
for rev in page["revisions"]:
# Older releases like MediaWiki 1.16 do not return all fields.
if "userid" in rev:
userid = rev["userid"]
else:
userid = 0
if "size" in rev:
size = rev["size"]
else:
size = 0
# Create rev object
revision = [E.id(str(rev["revid"])),
E.timestamp(rev["timestamp"]),]
# The text, user, comment, sha1 may be deleted/suppressed
if 'texthidden' in rev:
revision.append(E.text(**{
'bytes': str(size),
'deleted': 'deleted',
}))
else:
text = str(rev["*"])
revision.append(E.text(text, **{
'bytes': str(size),
'{http://www.w3.org/XML/1998/namespace}space': 'preserve',
}))
if not "user" in rev:
if not "userhidden" in rev:
print("Warning: user not hidden but missing user in pageid %d revid %d" % (page['pageid'], rev['revid']))
revision.append(E.contributor(deleted="deleted"))
else:
revision.append(
E.contributor(
E.username(str(rev["user"])),
E.id(str(userid)),
)
)
if not "sha1" in rev:
if "sha1hidden" in rev:
revision.append(E.sha1()) # stub
else:
# The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
pass
elif "sha1" in rev:
revision.append(E.sha1(rev["sha1"]))
if 'commenthidden' in rev:
revision.append(E.comment(deleted="deleted"))
elif "comment" in rev and rev["comment"]:
revision.append(E.comment(str(rev["comment"])))
if "contentmodel" in rev:
revision.append(E.model(rev["contentmodel"]))
# Sometimes a missing parentid is not replaced with a 0 as it should.
if "parentid" in rev:
revision.append(E.parentid(str(rev["parentid"])))
# mwcli's dump.xml order
revisionTags = ['id', 'parentid', 'timestamp', 'contributor', 'comment', 'origin', 'model', 'format', 'text', 'sha1']
revisionElementsDict = {elem.tag: elem for elem in revision}
_revision = E.revision()
for tag in revisionTags:
if tag in revisionElementsDict:
_revision.append(revisionElementsDict.pop(tag))
for elem in revisionElementsDict.values():
_revision.append(elem)
p.append(_revision)
except KeyError as e:
print(e)
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")

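To make the expected input concrete, here is a hypothetical revisions payload (all field values invented) fed through makeXmlFromPage; the arvcontinue argument ends up as an attribute on the emitted <page> element so a later run can resume from it:

from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import makeXmlFromPage

sample_page = {
    "title": "Main Page",
    "ns": 0,
    "pageid": 1,
    "revisions": [
        {
            "revid": 42,
            "parentid": 0,
            "timestamp": "2023-01-01T00:00:00Z",
            "user": "Example",
            "userid": 2,
            "size": 11,
            "sha1": "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed",
            "contentmodel": "wikitext",
            "comment": "initial",
            "*": "hello world",
        }
    ],
}

xml_chunk = makeXmlFromPage(sample_page, "20230101000000|42")
# xml_chunk is a '<page arvcontinue="20230101000000|42">...</page>' string whose
# revision children are ordered to match MediaWiki's dump.xml layout.
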
@@ -0,0 +1,144 @@
import re
import sys
from typing import *
import lxml.etree
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.api.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils import cleanXML, undoHTMLEntities
from wikiteam3.dumpgenerator.dump.xmldump.xml_header import getXMLHeader
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions import getXMLRevisions
from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import truncateXMLDump, parseLastPageChunk
def doXMLRevisionDump(config: Config=None, session=None, xmlfile=None, lastPage=None, useAllrevisions=False):
try:
r_timestamp = "<timestamp>([^<]+)</timestamp>"
r_arvcontinue = '<page arvcontinue="(.*?)">'
lastArvcontinue = None
for xml in getXMLRevisions(config=config, session=session, lastPage=lastPage, useAllrevision=useAllrevisions):
numrevs = len(re.findall(r_timestamp, xml))
arvcontinueRe = re.findall(r_arvcontinue, xml)
if arvcontinueRe:
curArvcontinue = arvcontinueRe[0]
if lastArvcontinue != curArvcontinue:
Delay(config=config, session=session)
lastArvcontinue = curArvcontinue
# Due to how generators work, it's expected this may be less
xml = cleanXML(xml=xml)
xmlfile.write(xml)
xmltitle = re.search(r"<title>([^<]+)</title>", xml)
title = undoHTMLEntities(text=xmltitle.group(1))
print(f'{title}, {numrevs} edits (--xmlrevisions)')
# Delay(config=config, session=session)
except AttributeError as e:
print(e)
print("This API library version is not working")
sys.exit()
except UnicodeEncodeError as e:
print(e)
def doXMLExportDump(config: Config=None, session=None, xmlfile=None, lastPage=None):
print(
'\nRetrieving the XML for every page\n'
)
lock = True
start = None
if lastPage:
try:
start = lastPage.find('title')
except Exception:
print("Failed to find title in last trunk XML: %s" % (lxml.etree.tostring(lastPage)))
raise
else:
# requested complete xml dump
lock = False
c = 1
for title in readTitles(config, session=session, start=start):
if not title:
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
Delay(config=config, session=session)
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml)
except PageMissingError:
logerror(
config=config, to_stdout=True,
text='The page "%s" was missing in the wiki (probably deleted)'
% title,
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
def generateXMLDump(config: Config=None, resume=False, session=None):
"""Generates a XML dump for a list of titles or from revision IDs"""
header, config = getXMLHeader(config=config, session=session)
footer = "</mediawiki>\n" # new line at the end
xmlfilename = "{}-{}-{}.xml".format(
domain2prefix(config=config),
config.date,
"current" if config.curonly else "history",
)
xmlfile = None
lastPage = None
lastPageChunk = None
# start != None, means we are resuming a XML dump
if resume:
print(
"Removing the last chunk of past XML dump: it is probably incomplete."
)
# truncate XML dump if it already exists
lastPageChunk = truncateXMLDump("{}/{}".format(config.path, xmlfilename))
if not lastPageChunk.strip():
print("Last page chunk is NULL, we'll directly start a new dump!")
resume = False
lastPage = None
else:
lastPage = parseLastPageChunk(lastPageChunk)
if lastPage is None:
print("Failed to parse last page chunk: \n%s" % lastPageChunk)
print("Cannot resume, exiting now!")
sys.exit(1)
print(f"WARNING: will try to start the download...")
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8"
)
else:
print("\nRetrieving the XML for every page from the beginning\n")
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8"
)
xmlfile.write(header)
if config.xmlrevisions and not config.xmlrevisions_page:
doXMLRevisionDump(config, session, xmlfile, lastPage, useAllrevisions=True)
elif config.xmlrevisions and config.xmlrevisions_page:
doXMLRevisionDump(config, session, xmlfile, lastPage, useAllrevisions=False)
else: # --xml
doXMLExportDump(config, session, xmlfile, lastPage)
xmlfile.write(footer)
xmlfile.close()
print("XML dump saved at...", xmlfilename)

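Resuming now hinges on two helpers: truncateXMLDump returns the trailing incomplete <page> fragment it cut off, and parseLastPageChunk turns that fragment into an lxml element whose arvcontinue attribute (allrevisions strategy) or <title> child (the other strategies) marks where to pick up. A sketch of that flow with a hypothetical dump file:

from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import truncateXMLDump, parseLastPageChunk

dump_path = "wiki.example.org-20230101-history.xml"  # hypothetical

last_chunk = truncateXMLDump(dump_path)  # drops the incomplete tail and returns it
if not last_chunk.strip():
    print("Nothing to resume from; starting a fresh dump")
else:
    last_page = parseLastPageChunk(last_chunk)
    if last_page is None:
        raise SystemExit("Could not parse the last page chunk")
    resume_token = last_page.attrib.get("arvcontinue")  # --xmlrevisions resume point
    resume_title = last_page.findtext("title")          # title-based resume point
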
@@ -7,7 +7,7 @@ import requests
from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.dump.page.page_xml import getXMLPage
from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
from wikiteam3.dumpgenerator.config import Config
def getXMLHeader(config: Config=None, session=None) -> Tuple[str, Config]:

@@ -3,6 +3,7 @@ from wikiteam3.dumpgenerator.config import Config
def checkXMLIntegrity(config: Config=None, titles: Iterable[str]=None, session=None):
"""Check XML dump integrity, to detect broken XML chunks"""
# TODO: Fix XML Integrity Check
return
print("Verifying dump...")

@@ -1,5 +1,8 @@
from io import StringIO
from typing import *
import os
import lxml.etree
from file_read_backwards import FileReadBackwards
@@ -21,17 +24,17 @@ def addNewline(filename: str) -> None:
f.write("\n")
def truncateXMLDump(filename: str) -> None:
def truncateXMLDump(filename: str) -> str:
"""Removes incomplete <page> elements from the end of XML dump files"""
with FileReadBackwards(filename, encoding="utf-8") as frb:
incomplete_segment: str = ""
xml_line: str = frb.readline()
while xml_line and "</title>" not in xml_line:
incomplete_segment += xml_line
incomplete_segment = xml_line + incomplete_segment
xml_line = frb.readline()
while xml_line and "</page>" not in xml_line:
incomplete_segment += xml_line
incomplete_segment = xml_line + incomplete_segment
xml_line = frb.readline()
incomplete_segment_size = len(incomplete_segment.encode("utf-8"))
file_size = os.path.getsize(filename)
@@ -56,3 +59,12 @@ def truncateXMLDump(filename: str) -> None:
print(
f"WARNING: {filename} has {endsWithNewlines(filename)} newlines"
)
return incomplete_segment
def parseLastPageChunk(chunk) -> Optional[lxml.etree._ElementTree]:
try:
parser = lxml.etree.XMLParser(recover=True)
tree = lxml.etree.parse(StringIO(chunk), parser)
return tree.getroot()
except lxml.etree.LxmlError:
return None

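parseLastPageChunk relies on lxml's recover=True mode, so even a fragment whose closing tags were lost to truncation still yields a usable element. A small hypothetical example:

from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import parseLastPageChunk

fragment = (
    '<page arvcontinue="20230101000000|42">\n'
    "  <title>Main Page</title>\n"
    "  <ns>0</ns>\n"
    "  <revision>\n"
    "    <id>42</id>\n"
)  # cut off mid-page, as after an interrupted dump

root = parseLastPageChunk(fragment)
if root is not None:
    print(root.attrib.get("arvcontinue"))  # 20230101000000|42
    print(root.findtext("title"))          # Main Page
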
@@ -1,117 +0,0 @@
import re
import sys
from typing import *
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.dump.page.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.page_xml import getXMLPage
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils import cleanXML, undoHTMLEntities
from .xml_header import getXMLHeader
from .xml_revisions import getXMLRevisions
from .xml_truncate import truncateXMLDump
def generateXMLDump(config: Config=None, titles: Iterable[str]=None, start=None, session=None):
"""Generates a XML dump for a list of titles or from revision IDs"""
# TODO: titles is now unused.
header, config = getXMLHeader(config=config, session=session)
footer = "</mediawiki>\n" # new line at the end
xmlfilename = "{}-{}-{}.xml".format(
domain2prefix(config=config),
config.date,
"current" if config.curonly else "history",
)
xmlfile = ""
lock = True
# start != None, means we are resuming a XML dump
if start:
print(
"Removing the last chunk of past XML dump: it is probably incomplete."
)
# truncate XML dump if it already exists
truncateXMLDump("{}/{}".format(config.path, xmlfilename))
if config.xmlrevisions:
if start:
print(f"WARNING: will try to start the download from title: {start}")
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8"
)
else:
print("\nRetrieving the XML for every page from the beginning\n")
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8"
)
xmlfile.write(header)
try:
r_timestamp = "<timestamp>([^<]+)</timestamp>"
for xml in getXMLRevisions(config=config, session=session, start=start):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, it's expected this may be less
xml = cleanXML(xml=xml)
xmlfile.write(xml)
xmltitle = re.search(r"<title>([^<]+)</title>", xml)
title = undoHTMLEntities(text=xmltitle.group(1))
print(f'{title}, {numrevs} edits (--xmlrevisions)')
Delay(config=config, session=session)
except AttributeError as e:
print(e)
print("This API library version is not working")
sys.exit()
except UnicodeEncodeError as e:
print(e)
else: # --xml
print(
'\nRetrieving the XML for every page from "%s"\n'
% (start if start else "start")
)
if not start:
# requested complete xml dump
lock = False
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8"
)
xmlfile.write(header)
xmlfile.close()
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8"
)
c = 1
for title in readTitles(config, start):
if not title:
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
Delay(config=config, session=session)
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml)
except PageMissingError:
logerror(
config=config, to_stdout=True,
text='The page "%s" was missing in the wiki (probably deleted)'
% title,
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
xmlfile.write(footer)
xmlfile.close()
print("XML dump saved at...", xmlfilename)

@@ -1,342 +0,0 @@
import sys
import time
from urllib.parse import urlparse
import mwclient
import requests
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from .namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.dump.page.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.page_xml import makeXmlFromPage, makeXmlPageFromRaw
from wikiteam3.dumpgenerator.config import Config
def getXMLRevisions(config: Config=None, session=None, allpages=False, start=None):
# FIXME: actually figure out the various strategies for each MediaWiki version
apiurl = urlparse(config.api)
# FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
# https://github.com/WikiTeam/wikiteam/issues/358
site = mwclient.Site(
apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session
)
if "all" not in config.namespaces:
namespaces = config.namespaces
else:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
try:
# # Uncomment these lines to raise an KeyError for testing
# raise KeyError(999999)
# # DO NOT UNCOMMMENT IN RELEASE
for namespace in namespaces:
print("Trying to export all revisions from namespace %s" % namespace)
# arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
arvparams = {
"action": "query",
"list": "allrevisions",
"arvlimit": 50,
"arvnamespace": namespace,
}
if not config.curonly:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams[
"arvprop"
] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content"
print(
"Trying to get wikitext from the allrevisions API and to build the XML"
)
while True:
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
else:
raise
except requests.exceptions.ReadTimeout as err:
# Hopefully temporary, just wait a bit and continue with the same request.
# No point putting a limit to retries, we'd need to abort everything.
# TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
# to use the retry adapter we use for our own requests session?
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
continue
for page in arvrequest["query"]["allrevisions"]:
yield makeXmlFromPage(page)
if "continue" in arvrequest:
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
else:
# End of continuation. We are done with this namespace.
break
else:
# FIXME: this is not curonly, just different strategy to do all revisions
# Just cycle through revision IDs and use the XML as is
print("Trying to list the revisions and to export them one by one")
# We only need the revision ID, all the rest will come from the raw export
arvparams["arvprop"] = "ids"
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
else:
raise
exportparams = {
"action": "query",
"export": "1",
}
# Skip the namespace if it's empty
if len(arvrequest["query"]["allrevisions"]) < 1:
continue
# Repeat the arvrequest with new arvparams until done
while True:
# Reset revision IDs from the previous batch from arv
revids = []
for page in arvrequest["query"]["allrevisions"]:
for revision in page["revisions"]:
revids.append(str(revision["revid"]))
print(
" %d more revisions listed, until %s"
% (len(revids), revids[-1])
)
# We can now get the XML for one revision at a time
# FIXME: we can actually get them in batches as we used to
# but need to figure out the continuation and avoid that the API
# chooses to give us only the latest for each page
for revid in revids:
exportparams["revids"] = revid
try:
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print(
"POST request to the API failed, retrying with GET"
)
config.http_method = "GET"
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
else:
raise
# This gives us a self-standing <mediawiki> element
# but we only need the inner <page>: we can live with
# duplication and non-ordering of page titles, but the
# repeated header is confusing and would not even be valid
xml = exportrequest["query"]["export"]["*"] # type(xml) == str
yield makeXmlPageFromRaw(xml)
if "continue" in arvrequest:
# Get the new ones
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print(
"POST request to the API failed, retrying with GET"
)
config.http_method = "GET"
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.ReadTimeout as err:
# As above
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
# But avoid rewriting the same revisions
arvrequest["query"]["allrevisions"] = []
continue
else:
# End of continuation. We are done with this namespace.
break
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
print("Warning. Could not use allrevisions. Wiki too old?")
if config.curonly:
# The raw XML export in the API gets a title and gives the latest revision.
# We could also use the allpages API as generator but let's be consistent.
print("Getting titles to export the latest revision for each")
c = 0
for title in readTitles(config, start=start):
# TODO: respect verbose flag, reuse output from getXMLPage
print(f" {title}")
# TODO: as we're doing one page and revision at a time, we might
# as well use xml format and exportnowrap=1 to use the string of,
# XML as is, but need to check how well the library handles it.
exportparams = {
"action": "query",
"titles": title,
"export": "1",
}
try:
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
else:
raise
xml = str(exportrequest["query"]["export"]["*"])
c += 1
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
# Because we got the fancy XML from the JSON format, clean it:
yield makeXmlPageFromRaw(xml)
else:
# This is the closest to what we usually do with Special:Export:
# take one title at a time and try to get all revisions exported.
# It differs from the allrevisions method because it actually needs
# to be input the page titles; otherwise, the requests are similar.
# The XML needs to be made manually because the export=1 option
# refuses to return an arbitrary number of revisions (see above).
print("Getting titles to export all the revisions of each")
c = 0
titlelist = []
# TODO: Decide a suitable number of a batched request. Careful:
# batched responses may not return all revisions.
for titlelist in readTitles(config, start=start, batch=False):
if type(titlelist) is not list:
titlelist = [titlelist]
for title in titlelist:
print(f" {title}")
# Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
pparams = {
"action": "query",
"titles": "|".join(titlelist),
"prop": "revisions",
# 'rvlimit': 50,
"rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content",
}
try:
prequest = site.api(http_method=config.http_method, **pparams)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
prequest = site.api(
http_method=config.http_method, **pparams
)
else:
raise
except mwclient.errors.InvalidResponse:
logerror(
config=config, to_stdout=True,
text="Error: page inaccessible? Could not export page: %s"
% ("; ".join(titlelist)),
)
continue
# Be ready to iterate if there is continuation.
while True:
# Get the revision data returned by the API: prequest is the initial request
# or the new one after continuation at the bottom of this while loop.
# The array is called "pages" even if there's only one.
try:
pages = prequest["query"]["pages"]
except KeyError:
logerror(
config=config, to_stdout=True,
text="Error: page inaccessible? Could not export page: %s"
% ("; ".join(titlelist)),
)
break
# Go through the data we got to build the XML.
for pageid in pages:
try:
xml = makeXmlFromPage(pages[pageid])
yield xml
except PageMissingError:
logerror(
config=config, to_stdout=True,
text="Error: empty revision from API. Could not export page: %s"
% ("; ".join(titlelist)),
)
continue
# Get next batch of revisions if there's more.
if "continue" in prequest.keys():
print("Getting more revisions for the page")
for key, value in prequest["continue"]:
pparams[key] = value
elif "query-continue" in prequest.keys():
rvstartid = prequest["query-continue"]["revisions"]["rvstartid"]
pparams["rvstartid"] = rvstartid
else:
break
try:
prequest = site.api(
http_method=config.http_method, **pparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
prequest = site.api(
http_method=config.http_method, **pparams
)
# We're done iterating for this title or titles.
c += len(titlelist)
# Reset for the next batch.
titlelist = []
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
except mwclient.errors.MwClientError as e:
print(e)
print("This mwclient version seems not to work for us. Exiting.")
sys.exit()