Refactor XML dump related code and improve xmlrevision logic (#103)

pull/475/head
NyaMisty 1 year ago committed by GitHub
parent 0fb53ffdde
commit be983b0814

@@ -3,9 +3,10 @@ import sys
from urllib.parse import urlparse
import mwclient
from file_read_backwards import FileReadBackwards
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.dump.xmlrev.namespaces import getNamespacesAPI, getNamespacesScraper
from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI, getNamespacesScraper
from wikiteam3.utils import domain2prefix, cleanHTML, undoHTMLEntities
from wikiteam3.dumpgenerator.config import Config
@@ -208,9 +209,32 @@ def getPageTitles(config: Config=None, session=None):
print("%d page titles loaded" % (c))
return titlesfilename
def checkTitleOk(config: Config=None, ):
try:
with FileReadBackwards(
"%s/%s-%s-titles.txt"
% (
config.path,
domain2prefix(config=config),
config.date,
),
encoding="utf-8",
) as frb:
lasttitle = frb.readline().strip()
if lasttitle == "":
lasttitle = frb.readline().strip()
except:
lasttitle = "" # probably file does not exists
def readTitles(config: Config=None, start=None, batch=False):
if lasttitle != "--END--":
return False
return True
def readTitles(config: Config=None, session=None, start=None, batch=False):
"""Read title list from a file, from the title "start" """
if not checkTitleOk(config):
getPageTitles(config=config)
titlesfilename = "{}-{}-titles.txt".format(
domain2prefix(config=config), config.date

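For reference, the new checkTitleOk helper simply verifies that the last non-empty line of the titles file is the --END-- sentinel, so an interrupted title listing gets regenerated instead of being resumed half-finished. A minimal standalone sketch of the same check, with a hypothetical file name in place of the Config-derived path:

from file_read_backwards import FileReadBackwards

def titles_file_complete(path: str) -> bool:
    # Read the file backwards; the last non-empty line must be the end marker.
    try:
        with FileReadBackwards(path, encoding="utf-8") as frb:
            last = frb.readline().strip()
            if last == "":
                last = frb.readline().strip()
    except Exception:
        return False  # the file probably does not exist yet
    return last == "--END--"

# Hypothetical usage, mirroring readTitles above:
# if not titles_file_complete("wiki.example.org-20230101-titles.txt"):
#     getPageTitles(config=config)
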
@@ -78,6 +78,11 @@ def getArgumentParser():
action="store_true",
help="download all revisions from an API generator. MediaWiki 1.27+ only.",
)
groupDownload.add_argument(
"--xmlrevisions_page",
action="store_true",
help="download all revisions from an API generator, but query page by page MediaWiki 1.27+ only.",
)
groupDownload.add_argument(
"--images", action="store_true", help="generates an image dump"
)
@@ -91,6 +96,9 @@ def getArgumentParser():
metavar="1,2,3",
help="comma-separated value of namespaces to exclude",
)
parser.add_argument(
"--api_chunksize", metavar="50", default=50, help="Chunk size for MediaWiki API (arvlimit, ailimit, etc.)"
)
# Meta info params
groupMeta = parser.add_argument_group(
@@ -141,7 +149,9 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
# Courtesy datashaman https://stackoverflow.com/a/35504626
__retries__ = Retry(
total=int(args.retries), backoff_factor=2, status_forcelist=[500, 502, 503, 504, 429]
total=int(args.retries), backoff_factor=2,
status_forcelist=[500, 502, 503, 504, 429],
allowed_methods=['DELETE', 'PUT', 'GET', 'OPTIONS', 'TRACE', 'HEAD', 'POST']
)
session.mount("https://", HTTPAdapter(max_retries=__retries__))
session.mount("http://", HTTPAdapter(max_retries=__retries__))
@@ -288,11 +298,13 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
"api": api,
"failfast": args.failfast,
"http_method": "POST",
"api_chunksize": args.api_chunksize,
"index": index,
"images": args.images,
"logs": False,
"xml": args.xml,
"xmlrevisions": args.xmlrevisions,
"xmlrevisions": args.xmlrevisions or args.xmlrevisions_page,
"xmlrevisions_page": args.xmlrevisions_page,
"namespaces": namespaces,
"exnamespaces": exnamespaces,
"path": args.path and os.path.normpath(args.path) or "",

@@ -53,10 +53,12 @@ class Config:
xml: bool = False
curonly: bool = False
xmlrevisions: bool = False
xmlrevisions_page: bool = False
images: bool = False
namespaces: List[int] = None
exnamespaces: List[int] = None
api_chunksize: int = 0 # arvlimit, ailimit, etc
export: str = '' # Special:Export page name
http_method: str = ''

@@ -30,14 +30,13 @@ from wikiteam3.utils import truncateFilename
from wikiteam3.utils import undoHTMLEntities
from wikiteam3.utils import avoidWikimediaProjects
from .page.image import Image
from .misc.index_php import saveIndexPHP
from .misc.special_logs import saveLogs
from .misc.special_version import saveSpecialVersion
from .page.page_titles import getPageTitles, readTitles
from .misc.site_info import saveSiteInfo
from .xmlrev.xml_dump import generateXMLDump
from .xmlrev.xml_integrity import checkXMLIntegrity
from wikiteam3.dumpgenerator.dump.image.image import Image
from wikiteam3.dumpgenerator.dump.misc.index_php import saveIndexPHP
from wikiteam3.dumpgenerator.dump.misc.special_logs import saveLogs
from wikiteam3.dumpgenerator.dump.misc.special_version import saveSpecialVersion
from wikiteam3.dumpgenerator.dump.misc.site_info import saveSiteInfo
from wikiteam3.dumpgenerator.dump.xmldump.xml_dump import generateXMLDump
from wikiteam3.dumpgenerator.dump.xmldump.xml_integrity import checkXMLIntegrity
# From https://stackoverflow.com/a/57008707
class Tee(object):
@@ -122,13 +121,12 @@ class DumpGenerator:
@staticmethod
def createNewDump(config: Config=None, other: Dict=None):
# we do lazy title dumping here :)
images = []
print("Trying generating a new dump into a new directory...")
if config.xml:
getPageTitles(config=config, session=other["session"])
titles = readTitles(config)
generateXMLDump(config=config, titles=titles, session=other["session"])
checkXMLIntegrity(config=config, titles=titles, session=other["session"])
generateXMLDump(config=config, session=other["session"])
checkXMLIntegrity(config=config, session=other["session"])
if config.images:
images += Image.getImageNames(config=config, session=other["session"])
Image.saveImageNames(config=config, images=images, session=other["session"])
@@ -143,34 +141,11 @@ class DumpGenerator:
images = []
print("Resuming previous dump process...")
if config.xml:
titles = readTitles(config)
try:
with FileReadBackwards(
"%s/%s-%s-titles.txt"
% (
config.path,
domain2prefix(config=config, session=other["session"]),
config.date,
),
encoding="utf-8",
) as frb:
lasttitle = frb.readline().strip()
if lasttitle == "":
lasttitle = frb.readline().strip()
except:
lasttitle = "" # probably file does not exists
if lasttitle == "--END--":
# titles list is complete
print("Title list was completed in the previous session")
else:
print("Title list is incomplete. Reloading...")
# do not resume, reload, to avoid inconsistences, deleted pages or
# so
getPageTitles(config=config, session=other["session"])
# checking xml dump
xmliscomplete = False
lastxmltitle = None
lastxmlrevid = None
try:
with FileReadBackwards(
"%s/%s-%s-%s.xml"
@@ -188,10 +163,14 @@ class DumpGenerator:
xmliscomplete = True
break
xmlrevid = re.search(r" <id>([^<]+)</id>", l)
if xmlrevid:
lastxmlrevid = int(xmlrevid.group(1))
xmltitle = re.search(r"<title>([^<]+)</title>", l)
if xmltitle:
lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
break
except:
pass # probably file does not exists
@@ -199,19 +178,16 @@ class DumpGenerator:
print("XML dump was completed in the previous session")
elif lastxmltitle:
# resuming...
print('Resuming XML dump from "%s"' % (lastxmltitle))
titles = readTitles(config, start=lastxmltitle)
print('Resuming XML dump from "%s" (revision id %s)' % (lastxmltitle, lastxmlrevid))
generateXMLDump(
config=config,
titles=titles,
start=lastxmltitle,
session=other["session"],
resume=True,
)
else:
# corrupt? only has XML header?
print("XML is corrupt? Regenerating...")
titles = readTitles(config)
generateXMLDump(config=config, titles=titles, session=other["session"])
generateXMLDump(config=config, session=other["session"])
if config.images:
# load images list

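The resume probe above now records the last revision <id> alongside the last <title> when scanning the partial XML dump backwards. A condensed sketch of that scan, assuming the dump ends with </mediawiki> only when it is complete and using a hypothetical file path (the real code also runs the title through undoHTMLEntities):

import re
from file_read_backwards import FileReadBackwards

def last_dump_position(xml_path: str):
    complete, last_title, last_revid = False, None, None
    try:
        with FileReadBackwards(xml_path, encoding="utf-8") as frb:
            for line in frb:
                if "</mediawiki>" in line:
                    complete = True  # footer reached: nothing to resume
                    break
                revid = re.search(r" <id>([^<]+)</id>", line)
                if revid:
                    last_revid = int(revid.group(1))
                title = re.search(r"<title>([^<]+)</title>", line)
                if title:
                    last_title = title.group(1)
                    break
    except FileNotFoundError:
        pass  # no previous dump yet
    return complete, last_title, last_revid

# complete, title, revid = last_dump_position("wiki.example.org-20230101-history.xml")
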
@@ -6,11 +6,11 @@ from typing import *
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSha1Error, FileSizeError
from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSizeError
from wikiteam3.dumpgenerator.api import getJSON
from wikiteam3.dumpgenerator.api import handleStatusCode
from wikiteam3.dumpgenerator.log import logerror
from .page_xml import getXMLPage
from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
from wikiteam3.utils import truncateFilename, sha1File
from wikiteam3.utils import cleanHTML, undoHTMLEntities
from wikiteam3.dumpgenerator.config import Config
@@ -317,7 +317,7 @@ class Image:
"aiprop": "url|user|size|sha1",
"aifrom": aifrom,
"format": "json",
"ailimit": 50,
"ailimit": config.api_chunksize,
}
# FIXME Handle HTTP Errors HERE
r = session.get(url=config.api, params=params, timeout=30)
@@ -398,7 +398,7 @@ class Image:
"action": "query",
"generator": "allpages",
"gapnamespace": 6,
"gaplimit": 50, # The value must be between 1 and 500.
"gaplimit": config.api_chunksize, # The value must be between 1 and 500.
# TODO: Is it OK to set it higher, for speed?
"gapfrom": gapfrom,
"prop": "imageinfo",

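Both image listings now take their batch size from config.api_chunksize instead of a hard-coded 50. For context, a bare-bones allimages request with the same parameters looks roughly like this (hypothetical wiki URL, continuation handling omitted):

import requests

session = requests.Session()
params = {
    "action": "query",
    "list": "allimages",
    "aiprop": "url|user|size|sha1",
    "aifrom": "!",
    "ailimit": 50,  # config.api_chunksize in the code above
    "format": "json",
}
r = session.get(url="https://wiki.example.org/w/api.php", params=params, timeout=30)
for image in r.json().get("query", {}).get("allimages", []):
    print(image["name"], image["url"])
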
@@ -4,8 +4,6 @@ import sys
import time
import requests
from lxml import etree
from lxml.builder import E
from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingError
from wikiteam3.dumpgenerator.api import handleStatusCode
@@ -196,69 +194,6 @@ def getXMLPage(config: Config=None, title="", verbose=True, session=None):
else:
uprint(" %s, %d edits" % (title.strip(), edit_count))
def makeXmlPageFromRaw(xml) -> str:
"""Discard the metadata around a <page> element in <mediawiki> string"""
root = etree.XML(xml)
find = etree.XPath("//*[local-name() = 'page']")
# The tag will inherit the namespace, like:
# <page xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
# FIXME: pretty_print doesn't seem to work, only adds a newline
return etree.tostring(find(root)[0], pretty_print=True, encoding="unicode")
def makeXmlFromPage(page: dict) -> str:
"""Output an XML document as a string from a page as in the API JSON"""
try:
p = E.page(
E.title(str(page["title"])),
E.ns(str(page["ns"])),
E.id(str(page["pageid"])),
)
for rev in page["revisions"]:
# Older releases like MediaWiki 1.16 do not return all fields.
if "userid" in rev:
userid = rev["userid"]
else:
userid = 0
if "size" in rev:
size = rev["size"]
else:
size = 0
text_element = E.text(str(rev["*"]), bytes=str(size))
text_element.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
revision = E.revision(
E.id(str(rev["revid"])),
E.timestamp(rev["timestamp"]),
text_element,
)
# The username may be deleted/suppressed
if "user" in rev:
revision.append(
E.contributor(
E.username(str(rev["user"])),
E.id(str(userid)),
)
)
else:
revision.append(E.contributor(deleted="deleted"))
if "comment" in rev and rev["comment"]:
revision.append(E.comment(str(rev["comment"])))
if "contentmodel" in rev:
revision.append(E.model(rev["contentmodel"]))
# Sometimes a missing parentid is not replaced with a 0 as it should.
if "parentid" in rev:
revision.append(E.parentid(str(rev["parentid"])))
# The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
if "sha1" in rev:
revision.append(E.sha1(rev["sha1"]))
p.append(revision)
except KeyError as e:
print(e)
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")
def fixBOM(request):
"""Strip Unicode BOM"""
if request.text.startswith("\ufeff"):

@@ -0,0 +1,398 @@
from datetime import datetime
from typing import *
import sys
import time
from urllib.parse import urlparse
import lxml.etree
import mwclient
import requests
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.api.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import makeXmlFromPage, makeXmlPageFromRaw
from wikiteam3.dumpgenerator.config import Config
def getXMLRevisionsByAllRevisions(config: Config=None, session=None, site: mwclient.Site=None, nscontinue=None, arvcontinue=None):
if "all" not in config.namespaces:
namespaces = config.namespaces
else:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
_nscontinue = nscontinue
_arvcontinue = arvcontinue
for namespace in namespaces:
if _nscontinue is not None:
if namespace != _nscontinue:
print("Skipping already exported namespace: %d" % namespace)
continue
_nscontinue = None
print("Trying to export all revisions from namespace %s" % namespace)
# arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
arvparams = {
"action": "query",
"list": "allrevisions",
"arvlimit": config.api_chunksize,
"arvnamespace": namespace,
}
if _arvcontinue is not None:
arvparams['arvcontinue'] = _arvcontinue
if not config.curonly:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams[
"arvprop"
] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content"
print(
"Trying to get wikitext from the allrevisions API and to build the XML"
)
while True:
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
else:
raise
except requests.exceptions.ReadTimeout as err:
# Hopefully temporary, just wait a bit and continue with the same request.
# No point putting a limit to retries, we'd need to abort everything.
# TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
# to use the retry adapter we use for our own requests session?
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
continue
for page in arvrequest["query"]["allrevisions"]:
yield makeXmlFromPage(page, arvparams.get("arvcontinue", ""))
if "continue" in arvrequest:
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
else:
# End of continuation. We are done with this namespace.
break
else:
# FIXME: this is not curonly, just different strategy to do all revisions
# Just cycle through revision IDs and use the XML as is
print("Trying to list the revisions and to export them one by one")
# We only need the revision ID, all the rest will come from the raw export
arvparams["arvprop"] = "ids"
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
else:
raise
exportparams = {
"action": "query",
"export": "1",
}
# Skip the namespace if it's empty
if len(arvrequest["query"]["allrevisions"]) < 1:
continue
# Repeat the arvrequest with new arvparams until done
while True:
# Reset revision IDs from the previous batch from arv
revids = []
for page in arvrequest["query"]["allrevisions"]:
for revision in page["revisions"]:
revids.append(str(revision["revid"]))
print(
" %d more revisions listed, until %s"
% (len(revids), revids[-1])
)
# We can now get the XML for one revision at a time
# FIXME: we can actually get them in batches as we used to
# but need to figure out the continuation and avoid that the API
# chooses to give us only the latest for each page
for revid in revids:
exportparams["revids"] = revid
try:
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print(
"POST request to the API failed, retrying with GET"
)
config.http_method = "GET"
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
else:
raise
# This gives us a self-standing <mediawiki> element
# but we only need the inner <page>: we can live with
# duplication and non-ordering of page titles, but the
# repeated header is confusing and would not even be valid
xml = exportrequest["query"]["export"]["*"] # type(xml) == str
yield makeXmlPageFromRaw(xml, arvparams.get("arvcontinue", ""))
if "continue" in arvrequest:
# Get the new ones
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print(
"POST request to the API failed, retrying with GET"
)
config.http_method = "GET"
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.ReadTimeout as err:
# As above
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
# But avoid rewriting the same revisions
arvrequest["query"]["allrevisions"] = []
continue
else:
# End of continuation. We are done with this namespace.
break
def getXMLRevisionsByTitles(config: Config=None, session=None, site: mwclient.Site=None, start=None):
if config.curonly:
# The raw XML export in the API gets a title and gives the latest revision.
# We could also use the allpages API as generator but let's be consistent.
print("Getting titles to export the latest revision for each")
c = 0
for title in readTitles(config, session=session, start=start):
# TODO: respect verbose flag, reuse output from getXMLPage
print(f" {title}")
# TODO: as we're doing one page and revision at a time, we might
# as well use xml format and exportnowrap=1 to use the string of,
# XML as is, but need to check how well the library handles it.
exportparams = {
"action": "query",
"titles": title,
"export": "1",
}
try:
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
else:
raise
xml = str(exportrequest["query"]["export"]["*"])
c += 1
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
# Because we got the fancy XML from the JSON format, clean it:
yield makeXmlPageFromRaw(xml, None)
else:
# This is the closest to what we usually do with Special:Export:
# take one title at a time and try to get all revisions exported.
# It differs from the allrevisions method because it actually needs
# to be input the page titles; otherwise, the requests are similar.
# The XML needs to be made manually because the export=1 option
# refuses to return an arbitrary number of revisions (see above).
print("Getting titles to export all the revisions of each")
c = 0
titlelist = []
# TODO: Decide a suitable size for batched requests. Careful:
# batched responses may not return all revisions.
for titlelist in readTitles(config, session=session, start=start, batch=False):
if type(titlelist) is not list:
titlelist = [titlelist]
for title in titlelist:
print(f" {title}")
# Try and ask everything. At least on MediaWiki 1.16, unknown props are discarded:
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
pparams = {
"action": "query",
"titles": "|".join(titlelist),
"prop": "revisions",
# 'rvlimit': 50,
"rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content",
}
try:
prequest = site.api(http_method=config.http_method, **pparams)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
prequest = site.api(
http_method=config.http_method, **pparams
)
else:
raise
except mwclient.errors.InvalidResponse:
logerror(
config=config, to_stdout=True,
text="Error: page inaccessible? Could not export page: %s"
% ("; ".join(titlelist)),
)
continue
# Be ready to iterate if there is continuation.
while True:
# Get the revision data returned by the API: prequest is the initial request
# or the new one after continuation at the bottom of this while loop.
# The array is called "pages" even if there's only one.
try:
pages = prequest["query"]["pages"]
except KeyError:
logerror(
config=config, to_stdout=True,
text="Error: page inaccessible? Could not export page: %s"
% ("; ".join(titlelist)),
)
break
# Go through the data we got to build the XML.
for pageid in pages:
try:
xml = makeXmlFromPage(pages[pageid], None)
yield xml
except PageMissingError:
logerror(
config=config, to_stdout=True,
text="Error: empty revision from API. Could not export page: %s"
% ("; ".join(titlelist)),
)
continue
# Get next batch of revisions if there's more.
if "continue" in prequest.keys():
print("Getting more revisions for the page")
for key, value in prequest["continue"]:
pparams[key] = value
elif "query-continue" in prequest.keys():
rvstartid = prequest["query-continue"]["revisions"]["rvstartid"]
pparams["rvstartid"] = rvstartid
else:
break
try:
prequest = site.api(
http_method=config.http_method, **pparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
prequest = site.api(
http_method=config.http_method, **pparams
)
# We're done iterating for this title or titles.
c += len(titlelist)
# Reset for the next batch.
titlelist = []
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
def getXMLRevisions(config: Config=None, session=None, useAllrevision=True, lastPage=None):
# FIXME: actually figure out the various strategies for each MediaWiki version
apiurl = urlparse(config.api)
# FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
# https://github.com/WikiTeam/wikiteam/issues/358
site = mwclient.Site(
apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session
)
if useAllrevision:
# Find last title
if lastPage:
try:
lastNs = int(lastPage.find('ns').text)
if False:
lastRevision = lastPage.find('revision')
lastTimestamp = lastRevision.find('timestamp').text
lastRevid = int(lastRevision.find('id').text)
lastDatetime = datetime.fromisoformat(lastTimestamp.rstrip('Z'))
lastArvcontinue = lastDatetime.strftime("%Y%m%d%H%M%S") + '|' + str(lastRevid)
else:
lastArvcontinue = lastPage.attrib['arvcontinue']
except Exception:
print("Failed to find title in last trunk XML: %s" % (lxml.etree.tostring(lastPage)))
raise
nscontinue = lastNs
arvcontinue = lastArvcontinue
if not arvcontinue:
arvcontinue = None
else:
nscontinue = None
arvcontinue = None
try:
return getXMLRevisionsByAllRevisions(config, session, site, nscontinue, arvcontinue)
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
print("Warning. Could not use allrevisions. Wiki too old? Try to use --xmlrevisions_page")
sys.exit()
else:
# Find last title
if lastPage:
try:
start = lastPage.find('title')
except Exception:
print("Failed to find title in last trunk XML: %s" % (lxml.etree.tostring(lastPage)))
raise
else:
start = None
try:
# # Uncomment these lines to raise a KeyError for testing
# raise KeyError(999999)
# # DO NOT UNCOMMENT IN RELEASE
return getXMLRevisionsByTitles(config, session, site, start)
except mwclient.errors.MwClientError as e:
print(e)
print("This mwclient version seems not to work for us. Exiting.")
sys.exit()

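The core of getXMLRevisionsByAllRevisions is the arvcontinue loop: each allrevisions batch is converted into <page> chunks stamped with the current arvcontinue token, and the token from the API's continue block drives the next request. A condensed sketch against a hypothetical wiki, using GET only and skipping the error handling shown above:

import mwclient

site = mwclient.Site("wiki.example.org", path="/w/")  # hypothetical wiki
arvparams = {
    "action": "query",
    "list": "allrevisions",
    "arvnamespace": 0,
    "arvlimit": 50,  # config.api_chunksize
    "arvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content",
}
while True:
    result = site.api(http_method="GET", **arvparams)
    for page in result["query"]["allrevisions"]:
        pass  # the real code yields makeXmlFromPage(page, arvparams.get("arvcontinue", ""))
    if "continue" in result:
        # The same token is written onto the emitted <page arvcontinue="..."> elements,
        # which is what later makes the dump resumable.
        arvparams["arvcontinue"] = result["continue"]["arvcontinue"]
    else:
        break  # namespace finished
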
@@ -0,0 +1,104 @@
from lxml import etree
from lxml.builder import E
from wikiteam3.dumpgenerator.exceptions import PageMissingError
def makeXmlPageFromRaw(xml, arvcontinue) -> str:
"""Discard the metadata around a <page> element in <mediawiki> string"""
root = etree.XML(xml)
find = etree.XPath("//*[local-name() = 'page']")
page = find(root)[0]
if arvcontinue is not None:
page.attrib['arvcontinue'] = arvcontinue
# The tag will inherit the namespace, like:
# <page xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
# FIXME: pretty_print doesn't seem to work, only adds a newline
return etree.tostring(page, pretty_print=True, encoding="unicode")
def makeXmlFromPage(page: dict, arvcontinue) -> str:
"""Output an XML document as a string from a page as in the API JSON"""
try:
p = E.page(
E.title(str(page["title"])),
E.ns(str(page["ns"])),
E.id(str(page["pageid"])),
)
if arvcontinue is not None:
p.attrib['arvcontinue'] = arvcontinue
for rev in page["revisions"]:
# Older releases like MediaWiki 1.16 do not return all fields.
if "userid" in rev:
userid = rev["userid"]
else:
userid = 0
if "size" in rev:
size = rev["size"]
else:
size = 0
# Create rev object
revision = [E.id(str(rev["revid"])),
E.timestamp(rev["timestamp"]),]
# The text, user, comment, sha1 may be deleted/suppressed
if 'texthidden' in rev:
revision.append(E.text(**{
'bytes': str(size),
'deleted': 'deleted',
}))
else:
text = str(rev["*"])
revision.append(E.text(text, **{
'bytes': str(size),
'{http://www.w3.org/XML/1998/namespace}space': 'preserve',
}))
if not "user" in rev:
if not "userhidden" in rev:
print("Warning: user not hidden but missing user in pageid %d revid %d" % (page['pageid'], rev['revid']))
revision.append(E.contributor(deleted="deleted"))
else:
revision.append(
E.contributor(
E.username(str(rev["user"])),
E.id(str(userid)),
)
)
if not "sha1" in rev:
if "sha1hidden" in rev:
revision.append(E.sha1()) # stub
else:
# The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
pass
elif "sha1" in rev:
revision.append(E.sha1(rev["sha1"]))
if 'commenthidden' in rev:
revision.append(E.comment(deleted="deleted"))
elif "comment" in rev and rev["comment"]:
revision.append(E.comment(str(rev["comment"])))
if "contentmodel" in rev:
revision.append(E.model(rev["contentmodel"]))
# Sometimes a missing parentid is not replaced with a 0 as it should.
if "parentid" in rev:
revision.append(E.parentid(str(rev["parentid"])))
# mwcli's dump.xml order
revisionTags = ['id', 'parentid', 'timestamp', 'contributor', 'comment', 'origin', 'model', 'format', 'text', 'sha1']
revisionElementsDict = {elem.tag: elem for elem in revision}
_revision = E.revision()
for tag in revisionTags:
if tag in revisionElementsDict:
_revision.append(revisionElementsDict.pop(tag))
for elem in revisionElementsDict.values():
_revision.append(elem)
p.append(_revision)
except KeyError as e:
print(e)
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")

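To make the expected input concrete, here is a hypothetical revisions payload (all field values invented) fed through makeXmlFromPage; the arvcontinue argument ends up as an attribute on the emitted <page> element so a later run can resume from it:

from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import makeXmlFromPage

sample_page = {
    "title": "Main Page",
    "ns": 0,
    "pageid": 1,
    "revisions": [
        {
            "revid": 42,
            "parentid": 0,
            "timestamp": "2023-01-01T00:00:00Z",
            "user": "Example",
            "userid": 2,
            "size": 11,
            "sha1": "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed",
            "contentmodel": "wikitext",
            "comment": "initial",
            "*": "hello world",
        }
    ],
}

xml_chunk = makeXmlFromPage(sample_page, "20230101000000|42")
# xml_chunk is a '<page arvcontinue="20230101000000|42">...</page>' string whose
# revision children are ordered to match MediaWiki's dump.xml layout.
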
@@ -0,0 +1,144 @@
import re
import sys
from typing import *
import lxml.etree
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.api.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils import cleanXML, undoHTMLEntities
from wikiteam3.dumpgenerator.dump.xmldump.xml_header import getXMLHeader
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions import getXMLRevisions
from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import truncateXMLDump, parseLastPageChunk
def doXMLRevisionDump(config: Config=None, session=None, xmlfile=None, lastPage=None, useAllrevisions=False):
try:
r_timestamp = "<timestamp>([^<]+)</timestamp>"
r_arvcontinue = '<page arvcontinue="(.*?)">'
lastArvcontinue = None
for xml in getXMLRevisions(config=config, session=session, lastPage=lastPage, useAllrevision=useAllrevisions):
numrevs = len(re.findall(r_timestamp, xml))
arvcontinueRe = re.findall(r_arvcontinue, xml)
if arvcontinueRe:
curArvcontinue = arvcontinueRe[0]
if lastArvcontinue != curArvcontinue:
Delay(config=config, session=session)
lastArvcontinue = curArvcontinue
# Due to how generators work, it's expected this may be less
xml = cleanXML(xml=xml)
xmlfile.write(xml)
xmltitle = re.search(r"<title>([^<]+)</title>", xml)
title = undoHTMLEntities(text=xmltitle.group(1))
print(f'{title}, {numrevs} edits (--xmlrevisions)')
# Delay(config=config, session=session)
except AttributeError as e:
print(e)
print("This API library version is not working")
sys.exit()
except UnicodeEncodeError as e:
print(e)
def doXMLExportDump(config: Config=None, session=None, xmlfile=None, lastPage=None):
print(
'\nRetrieving the XML for every page\n'
)
lock = True
start = None
if lastPage:
try:
start = lastPage.find('title')
except Exception:
print("Failed to find title in last trunk XML: %s" % (lxml.etree.tostring(lastPage)))
raise
else:
# requested complete xml dump
lock = False
c = 1
for title in readTitles(config, session=session, start=start):
if not title:
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
Delay(config=config, session=session)
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml)
except PageMissingError:
logerror(
config=config, to_stdout=True,
text='The page "%s" was missing in the wiki (probably deleted)'
% title,
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
def generateXMLDump(config: Config=None, resume=False, session=None):
"""Generates a XML dump for a list of titles or from revision IDs"""
header, config = getXMLHeader(config=config, session=session)
footer = "</mediawiki>\n" # new line at the end
xmlfilename = "{}-{}-{}.xml".format(
domain2prefix(config=config),
config.date,
"current" if config.curonly else "history",
)
xmlfile = None
lastPage = None
lastPageChunk = None
# start != None, means we are resuming a XML dump
if resume:
print(
"Removing the last chunk of past XML dump: it is probably incomplete."
)
# truncate XML dump if it already exists
lastPageChunk = truncateXMLDump("{}/{}".format(config.path, xmlfilename))
if not lastPageChunk.strip():
print("Last page chunk is NULL, we'll directly start a new dump!")
resume = False
lastPage = None
else:
lastPage = parseLastPageChunk(lastPageChunk)
if lastPage is None:
print("Failed to parse last page chunk: \n%s" % lastPageChunk)
print("Cannot resume, exiting now!")
sys.exit(1)
print(f"WARNING: will try to start the download...")
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8"
)
else:
print("\nRetrieving the XML for every page from the beginning\n")
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8"
)
xmlfile.write(header)
if config.xmlrevisions and not config.xmlrevisions_page:
doXMLRevisionDump(config, session, xmlfile, lastPage, useAllrevisions=True)
elif config.xmlrevisions and config.xmlrevisions_page:
doXMLRevisionDump(config, session, xmlfile, lastPage, useAllrevisions=False)
else: # --xml
doXMLExportDump(config, session, xmlfile, lastPage)
xmlfile.write(footer)
xmlfile.close()
print("XML dump saved at...", xmlfilename)

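Resuming now hinges on two helpers: truncateXMLDump returns the trailing incomplete <page> fragment it cut off, and parseLastPageChunk turns that fragment into an lxml element whose arvcontinue attribute (allrevisions strategy) or <title> child (the other strategies) marks where to pick up. A sketch of that flow with a hypothetical dump file:

from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import truncateXMLDump, parseLastPageChunk

dump_path = "wiki.example.org-20230101-history.xml"  # hypothetical

last_chunk = truncateXMLDump(dump_path)  # drops the incomplete tail and returns it
if not last_chunk.strip():
    print("Nothing to resume from; starting a fresh dump")
else:
    last_page = parseLastPageChunk(last_chunk)
    if last_page is None:
        raise SystemExit("Could not parse the last page chunk")
    resume_token = last_page.attrib.get("arvcontinue")  # --xmlrevisions resume point
    resume_title = last_page.findtext("title")          # title-based resume point
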
@@ -7,7 +7,7 @@ import requests
from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.dump.page.page_xml import getXMLPage
from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
from wikiteam3.dumpgenerator.config import Config
def getXMLHeader(config: Config=None, session=None) -> Tuple[str, Config]:

@@ -3,6 +3,7 @@ from wikiteam3.dumpgenerator.config import Config
def checkXMLIntegrity(config: Config=None, titles: Iterable[str]=None, session=None):
"""Check XML dump integrity, to detect broken XML chunks"""
# TODO: Fix XML Integrity Check
return
print("Verifying dump...")

@@ -1,5 +1,8 @@
from io import StringIO
from typing import *
import os
import lxml.etree
from file_read_backwards import FileReadBackwards
@@ -21,17 +24,17 @@ def addNewline(filename: str) -> None:
f.write("\n")
def truncateXMLDump(filename: str) -> None:
def truncateXMLDump(filename: str) -> str:
"""Removes incomplete <page> elements from the end of XML dump files"""
with FileReadBackwards(filename, encoding="utf-8") as frb:
incomplete_segment: str = ""
xml_line: str = frb.readline()
while xml_line and "</title>" not in xml_line:
incomplete_segment += xml_line
incomplete_segment = xml_line + incomplete_segment
xml_line = frb.readline()
while xml_line and "</page>" not in xml_line:
incomplete_segment += xml_line
incomplete_segment = xml_line + incomplete_segment
xml_line = frb.readline()
incomplete_segment_size = len(incomplete_segment.encode("utf-8"))
file_size = os.path.getsize(filename)
@@ -56,3 +59,12 @@ def truncateXMLDump(filename: str) -> None:
print(
f"WARNING: {filename} has {endsWithNewlines(filename)} newlines"
)
return incomplete_segment
def parseLastPageChunk(chunk) -> Optional[lxml.etree._ElementTree]:
try:
parser = lxml.etree.XMLParser(recover=True)
tree = lxml.etree.parse(StringIO(chunk), parser)
return tree.getroot()
except lxml.etree.LxmlError:
return None

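parseLastPageChunk relies on lxml's recover=True mode, so even a fragment whose closing tags were lost to truncation still yields a usable element. A small hypothetical example:

from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import parseLastPageChunk

fragment = (
    '<page arvcontinue="20230101000000|42">\n'
    "  <title>Main Page</title>\n"
    "  <ns>0</ns>\n"
    "  <revision>\n"
    "    <id>42</id>\n"
)  # cut off mid-page, as after an interrupted dump

root = parseLastPageChunk(fragment)
if root is not None:
    print(root.attrib.get("arvcontinue"))  # 20230101000000|42
    print(root.findtext("title"))          # Main Page
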
@@ -1,117 +0,0 @@
import re
import sys
from typing import *
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.dump.page.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.page_xml import getXMLPage
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils import cleanXML, undoHTMLEntities
from .xml_header import getXMLHeader
from .xml_revisions import getXMLRevisions
from .xml_truncate import truncateXMLDump
def generateXMLDump(config: Config=None, titles: Iterable[str]=None, start=None, session=None):
"""Generates a XML dump for a list of titles or from revision IDs"""
# TODO: titles is now unused.
header, config = getXMLHeader(config=config, session=session)
footer = "</mediawiki>\n" # new line at the end
xmlfilename = "{}-{}-{}.xml".format(
domain2prefix(config=config),
config.date,
"current" if config.curonly else "history",
)
xmlfile = ""
lock = True
# start != None, means we are resuming a XML dump
if start:
print(
"Removing the last chunk of past XML dump: it is probably incomplete."
)
# truncate XML dump if it already exists
truncateXMLDump("{}/{}".format(config.path, xmlfilename))
if config.xmlrevisions:
if start:
print(f"WARNING: will try to start the download from title: {start}")
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8"
)
else:
print("\nRetrieving the XML for every page from the beginning\n")
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8"
)
xmlfile.write(header)
try:
r_timestamp = "<timestamp>([^<]+)</timestamp>"
for xml in getXMLRevisions(config=config, session=session, start=start):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, it's expected this may be less
xml = cleanXML(xml=xml)
xmlfile.write(xml)
xmltitle = re.search(r"<title>([^<]+)</title>", xml)
title = undoHTMLEntities(text=xmltitle.group(1))
print(f'{title}, {numrevs} edits (--xmlrevisions)')
Delay(config=config, session=session)
except AttributeError as e:
print(e)
print("This API library version is not working")
sys.exit()
except UnicodeEncodeError as e:
print(e)
else: # --xml
print(
'\nRetrieving the XML for every page from "%s"\n'
% (start if start else "start")
)
if not start:
# requested complete xml dump
lock = False
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8"
)
xmlfile.write(header)
xmlfile.close()
xmlfile = open(
"{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8"
)
c = 1
for title in readTitles(config, start):
if not title:
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
Delay(config=config, session=session)
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml)
except PageMissingError:
logerror(
config=config, to_stdout=True,
text='The page "%s" was missing in the wiki (probably deleted)'
% title,
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
xmlfile.write(footer)
xmlfile.close()
print("XML dump saved at...", xmlfilename)

@@ -1,342 +0,0 @@
import sys
import time
from urllib.parse import urlparse
import mwclient
import requests
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from .namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.dump.page.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.page_xml import makeXmlFromPage, makeXmlPageFromRaw
from wikiteam3.dumpgenerator.config import Config
def getXMLRevisions(config: Config=None, session=None, allpages=False, start=None):
# FIXME: actually figure out the various strategies for each MediaWiki version
apiurl = urlparse(config.api)
# FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
# https://github.com/WikiTeam/wikiteam/issues/358
site = mwclient.Site(
apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session
)
if "all" not in config.namespaces:
namespaces = config.namespaces
else:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
try:
# # Uncomment these lines to raise an KeyError for testing
# raise KeyError(999999)
# # DO NOT UNCOMMMENT IN RELEASE
for namespace in namespaces:
print("Trying to export all revisions from namespace %s" % namespace)
# arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
arvparams = {
"action": "query",
"list": "allrevisions",
"arvlimit": 50,
"arvnamespace": namespace,
}
if not config.curonly:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams[
"arvprop"
] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content"
print(
"Trying to get wikitext from the allrevisions API and to build the XML"
)
while True:
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
else:
raise
except requests.exceptions.ReadTimeout as err:
# Hopefully temporary, just wait a bit and continue with the same request.
# No point putting a limit to retries, we'd need to abort everything.
# TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
# to use the retry adapter we use for our own requests session?
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
continue
for page in arvrequest["query"]["allrevisions"]:
yield makeXmlFromPage(page)
if "continue" in arvrequest:
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
else:
# End of continuation. We are done with this namespace.
break
else:
# FIXME: this is not curonly, just different strategy to do all revisions
# Just cycle through revision IDs and use the XML as is
print("Trying to list the revisions and to export them one by one")
# We only need the revision ID, all the rest will come from the raw export
arvparams["arvprop"] = "ids"
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
else:
raise
exportparams = {
"action": "query",
"export": "1",
}
# Skip the namespace if it's empty
if len(arvrequest["query"]["allrevisions"]) < 1:
continue
# Repeat the arvrequest with new arvparams until done
while True:
# Reset revision IDs from the previous batch from arv
revids = []
for page in arvrequest["query"]["allrevisions"]:
for revision in page["revisions"]:
revids.append(str(revision["revid"]))
print(
" %d more revisions listed, until %s"
% (len(revids), revids[-1])
)
# We can now get the XML for one revision at a time
# FIXME: we can actually get them in batches as we used to
# but need to figure out the continuation and avoid that the API
# chooses to give us only the latest for each page
for revid in revids:
exportparams["revids"] = revid
try:
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print(
"POST request to the API failed, retrying with GET"
)
config.http_method = "GET"
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
else:
raise
# This gives us a self-standing <mediawiki> element
# but we only need the inner <page>: we can live with
# duplication and non-ordering of page titles, but the
# repeated header is confusing and would not even be valid
xml = exportrequest["query"]["export"]["*"] # type(xml) == str
yield makeXmlPageFromRaw(xml)
if "continue" in arvrequest:
# Get the new ones
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
try:
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print(
"POST request to the API failed, retrying with GET"
)
config.http_method = "GET"
arvrequest = site.api(
http_method=config.http_method, **arvparams
)
except requests.exceptions.ReadTimeout as err:
# As above
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
# But avoid rewriting the same revisions
arvrequest["query"]["allrevisions"] = []
continue
else:
# End of continuation. We are done with this namespace.
break
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
print("Warning. Could not use allrevisions. Wiki too old?")
if config.curonly:
# The raw XML export in the API gets a title and gives the latest revision.
# We could also use the allpages API as generator but let's be consistent.
print("Getting titles to export the latest revision for each")
c = 0
for title in readTitles(config, start=start):
# TODO: respect verbose flag, reuse output from getXMLPage
print(f" {title}")
# TODO: as we're doing one page and revision at a time, we might
# as well use xml format and exportnowrap=1 to use the string of,
# XML as is, but need to check how well the library handles it.
exportparams = {
"action": "query",
"titles": title,
"export": "1",
}
try:
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
exportrequest = site.api(
http_method=config.http_method, **exportparams
)
else:
raise
xml = str(exportrequest["query"]["export"]["*"])
c += 1
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
# Because we got the fancy XML from the JSON format, clean it:
yield makeXmlPageFromRaw(xml)
else:
# This is the closest to what we usually do with Special:Export:
# take one title at a time and try to get all revisions exported.
# It differs from the allrevisions method because it actually needs
# to be input the page titles; otherwise, the requests are similar.
# The XML needs to be made manually because the export=1 option
# refuses to return an arbitrary number of revisions (see above).
print("Getting titles to export all the revisions of each")
c = 0
titlelist = []
# TODO: Decide a suitable number of a batched request. Careful:
# batched responses may not return all revisions.
for titlelist in readTitles(config, start=start, batch=False):
if type(titlelist) is not list:
titlelist = [titlelist]
for title in titlelist:
print(f" {title}")
# Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
pparams = {
"action": "query",
"titles": "|".join(titlelist),
"prop": "revisions",
# 'rvlimit': 50,
"rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content",
}
try:
prequest = site.api(http_method=config.http_method, **pparams)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
prequest = site.api(
http_method=config.http_method, **pparams
)
else:
raise
except mwclient.errors.InvalidResponse:
logerror(
config=config, to_stdout=True,
text="Error: page inaccessible? Could not export page: %s"
% ("; ".join(titlelist)),
)
continue
# Be ready to iterate if there is continuation.
while True:
# Get the revision data returned by the API: prequest is the initial request
# or the new one after continuation at the bottom of this while loop.
# The array is called "pages" even if there's only one.
try:
pages = prequest["query"]["pages"]
except KeyError:
logerror(
config=config, to_stdout=True,
text="Error: page inaccessible? Could not export page: %s"
% ("; ".join(titlelist)),
)
break
# Go through the data we got to build the XML.
for pageid in pages:
try:
xml = makeXmlFromPage(pages[pageid])
yield xml
except PageMissingError:
logerror(
config=config, to_stdout=True,
text="Error: empty revision from API. Could not export page: %s"
% ("; ".join(titlelist)),
)
continue
# Get next batch of revisions if there's more.
if "continue" in prequest.keys():
print("Getting more revisions for the page")
for key, value in prequest["continue"]:
pparams[key] = value
elif "query-continue" in prequest.keys():
rvstartid = prequest["query-continue"]["revisions"]["rvstartid"]
pparams["rvstartid"] = rvstartid
else:
break
try:
prequest = site.api(
http_method=config.http_method, **pparams
)
except requests.exceptions.HTTPError as e:
if (
e.response.status_code == 405
and config.http_method == "POST"
):
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
prequest = site.api(
http_method=config.http_method, **pparams
)
# We're done iterating for this title or titles.
c += len(titlelist)
# Reset for the next batch.
titlelist = []
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
except mwclient.errors.MwClientError as e:
print(e)
print("This mwclient version seems not to work for us. Exiting.")
sys.exit()