Merge branch 'master' into wikia

pull/373/head
nemobis committed 4 years ago via GitHub
commit 8a2116699e

@@ -715,16 +715,20 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     lock = True
     if config['xmlrevisions']:
-        print 'Retrieving the XML for every page from the beginning'
-        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
-        xmlfile.write(header.encode('utf-8'))
+        if start:
+            print("WARNING: will try to start the download from title: {}".format(start))
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
+        else:
+            print 'Retrieving the XML for every page from the beginning'
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+            xmlfile.write(header.encode('utf-8'))
         try:
             r_timestamp = r'<timestamp>([^<]+)</timestamp>'
-            for xml in getXMLRevisions(config=config, session=session):
+            for xml in getXMLRevisions(config=config, session=session, start=start):
                 numrevs = len(re.findall(r_timestamp, xml))
                 # Due to how generators work, it's expected this may be less
                 # TODO: get the page title and reuse the usual format "X title, y edits"
-                print "%d more revisions exported" % numrevs
+                print " %d more revisions exported" % numrevs
                 xml = cleanXML(xml=xml)
                 xmlfile.write(xml.encode('utf-8'))
         except AttributeError as e:
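This hunk is what makes --xmlrevisions dumps resumable: with a start title, the dump file is reopened in append mode and the XML header is not rewritten. A minimal sketch of that open-for-resume decision, with hypothetical names (dump_path, header) standing in for the script's own variables, written in Python 3 for brevity while the script itself is Python 2:

    def open_dump(dump_path, header, start=None):
        if start:
            # Resuming: append to the existing dump and do not rewrite the
            # <mediawiki> header, which was already written on the first run.
            print("WARNING: will try to start the download from title: {}".format(start))
            return open(dump_path, 'a')
        # Fresh dump: truncate the file and write the header once.
        xmlfile = open(dump_path, 'w')
        xmlfile.write(header)
        return xmlfile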
@@ -776,12 +780,13 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename
 
-def getXMLRevisions(config={}, session=None, allpages=False):
+def getXMLRevisions(config={}, session=None, allpages=False, start=None):
     # FIXME: actually figure out the various strategies for each MediaWiki version
     apiurl = urlparse(config['api'])
     # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
     # https://github.com/WikiTeam/wikiteam/issues/358
     site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme)
     if not 'all' in config['namespaces']:
         namespaces = config['namespaces']
     else:
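For reference, the mwclient.Site call above splits a full api.php URL into the pieces mwclient expects. A self-contained sketch (the wiki URL is a placeholder; urllib.parse is the Python 3 spelling of the urlparse import used here):

    from urllib.parse import urlparse
    import mwclient

    apiurl = urlparse("https://wiki.example.org/w/api.php")
    # mwclient takes a host and a script path rather than a full URL;
    # passing scheme= keeps whatever protocol the user asked for.
    site = mwclient.Site(apiurl.netloc,
                         apiurl.path.replace("api.php", ""),
                         scheme=apiurl.scheme)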
@@ -789,12 +794,12 @@ def getXMLRevisions(config={}, session=None, allpages=False):
     try:
         for namespace in namespaces:
-            print "Trying to export all revisions from namespace %s" % namespace
+            print("Trying to export all revisions from namespace %s" % namespace)
             # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
             arvparams = {
                 'action': 'query',
                 'list': 'allrevisions',
-                'arvlimit': 500,
+                'arvlimit': 50,
                 'arvnamespace': namespace
             }
             if not config['curonly']:
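The arvlimit drop from 500 to 50 requests smaller batches per call, presumably to be gentler on small or fragile wikis. A minimal sketch of the continuation loop this request implies, using plain requests against a placeholder wiki (MediaWiki 1.27+ returns a 'continue' object carrying arvcontinue while results remain):

    import requests

    API = "https://wiki.example.org/w/api.php"  # placeholder
    params = {
        'action': 'query',
        'list': 'allrevisions',
        'arvlimit': 50,
        'arvnamespace': 0,
        'format': 'json',
    }
    session = requests.Session()
    while True:
        data = session.get(API, params=params, timeout=30).json()
        for page in data.get('query', {}).get('allrevisions', []):
            for rev in page['revisions']:
                print(page['title'], rev['revid'])
        if 'continue' not in data:
            break
        params.update(data['continue'])  # carries arvcontinue forward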
@@ -856,7 +861,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 for page in arvrequest['query']['allrevisions']:
                     for revision in page['revisions']:
                         revids.append(str(revision['revid']))
-                print "%d more revisions listed, until %s" % (len(revids), revids[-1])
+                print " %d more revisions listed, until %s" % (len(revids), revids[-1])
 
                 # We can now get the XML for one revision at a time
                 # FIXME: we can actually get them in batches as we used to
@@ -909,7 +914,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
         # We could also use the allpages API as generator but let's be consistent.
         print("Getting titles to export the latest revision for each")
         c = 0
-        for title in readTitles(config):
+        for title in readTitles(config, start=start):
             # TODO: respect verbose flag, reuse output from getXMLPage
             print(' {}'.format(title.strip()))
             # TODO: as we're doing one page and revision at a time, we might
@@ -943,7 +948,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
         # refuses to return an arbitrary number of revisions (see above).
         print("Getting titles to export all the revisions of each")
         c = 0
-        for title in readTitles(config):
+        for title in readTitles(config, start=start):
             print(' {}'.format(title.strip()))
             # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
             # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
@@ -977,10 +982,16 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     continue
 
                 # Be ready to iterate if there is continuation.
                 while True:
-                    pages = prequest['query']['pages']
+                    # The array is called "pages" even if there's only one.
+                    # TODO: we could actually batch titles a bit here if desired. How many?
+                    try:
+                        pages = prequest['query']['pages']
+                    except KeyError:
+                        raise PageMissingError(title, xml='')
                     # Go through the data we got to build the XML.
-                    for page in pages:
+                    for pageid in pages:
                         try:
-                            xml = makeXmlFromPage(pages[page])
+                            xml = makeXmlFromPage(pages[pageid])
                             yield xml
                         except PageMissingError:
                             logerror(
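The rename from page to pageid matters because, in the MediaWiki JSON output (before formatversion=2), 'pages' is an object keyed by page id, so iterating over it yields ids rather than page records. A toy illustration with fabricated data:

    prequest = {'query': {'pages': {
        '42': {'pageid': 42, 'title': 'Example', 'revisions': []},
    }}}
    pages = prequest['query']['pages']
    for pageid in pages:        # iterates over keys such as '42'
        page = pages[pageid]    # the page record itself
        print(pageid, page['title'])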
@@ -993,19 +1004,24 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     if 'continue' in prequest.keys():
                         print("Getting more revisions for page {}".format(title))
                         pparams['rvcontinue'] = prequest['continue']['rvcontinue']
-                        try:
-                            prequest = site.api(http_method=config['http_method'], **pparams)
-                        except requests.exceptions.HTTPError as e:
-                            if e.response.status_code == 405 and config['http_method'] == "POST":
-                                print("POST request to the API failed, retrying with GET")
-                                config['http_method'] = "GET"
-                                prequest = site.api(http_method=config['http_method'], **pparams)
-                    # mwclient seems to rewrite query-continue
-                    #if 'query-continue' in prequest.keys():
-                    #    pparams['rvcontinue'] = prequest['query-continue']['revisions']['rvcontinue']
+                    elif 'query-continue' in prequest.keys():
+                        rvstartid = prequest['query-continue']['revisions']['rvstartid']
+                        pparams['rvstartid'] = rvstartid
                     else:
                         break
+                    try:
+                        prequest = site.api(http_method=config['http_method'], **pparams)
+                    except requests.exceptions.HTTPError as e:
+                        if e.response.status_code == 405 and config['http_method'] == "POST":
+                            print("POST request to the API failed, retrying with GET")
+                            config['http_method'] = "GET"
+                            prequest = site.api(http_method=config['http_method'], **pparams)
 
                 c += 1
+                if c % 10 == 0:
+                    print('Downloaded {} pages'.format(c))
             except mwclient.errors.MwClientError as e:
                 print(e)
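The try/except moved below the if/elif/else so that every continuation request, not only the 'continue' branch, gets the 405 fallback. A generalized sketch of that fallback outside mwclient (URL and parameters are placeholders; the script itself persists the switch in config['http_method']):

    import requests

    def api_call(session, url, params, http_method="POST"):
        if http_method == "POST":
            r = session.post(url, data=params)
            if r.status_code == 405:
                # Some servers refuse POST to api.php: retry once with GET.
                print("POST request to the API failed, retrying with GET")
                return api_call(session, url, params, http_method="GET")
        else:
            r = session.get(url, params=params)
        r.raise_for_status()
        return r.json()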
@@ -1034,12 +1050,16 @@ def makeXmlFromPage(page):
                 E.id(to_unicode(rev['revid'])),
                 E.parentid(to_unicode(rev['parentid'])),
                 E.timestamp(rev['timestamp']),
-                E.contributor(
-                    E.id(to_unicode(userid)),
-                    E.username(to_unicode(rev['user'])),
-                ),
-                E.text(rev['*'], space="preserve", bytes=to_unicode(size)),
+                E.text(to_unicode(rev['*']), space="preserve", bytes=to_unicode(size)),
             )
+            # The username may be deleted/suppressed
+            if 'user' in rev:
+                revision.append(E.contributor(
+                    E.username(to_unicode(rev['user'])),
+                    E.id(to_unicode(userid)),
+                ))
+            else:
+                revision.append(E.contributor(deleted="deleted"))
             if 'comment' in rev:
                 revision.append(E.comment(to_unicode(rev['comment'])))
             if 'contentmodel' in rev:
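The contributor element is now appended conditionally because revisions whose username was removed with RevisionDelete carry no 'user' key, and the XML export format marks them as <contributor deleted="deleted" />. A self-contained sketch with lxml's E factory and fabricated sample data:

    from lxml import etree
    from lxml.builder import E

    rev = {'revid': 1, 'user': 'Example', 'userid': 2}  # suppressed revisions lack 'user'
    revision = E.revision(E.id(str(rev['revid'])))
    if 'user' in rev:
        revision.append(E.contributor(
            E.username(rev['user']),
            E.id(str(rev['userid'])),
        ))
    else:
        revision.append(E.contributor(deleted="deleted"))
    print(etree.tostring(revision, pretty_print=True, encoding='unicode'))

Note that etree.tostring(..., encoding='unicode') returns a str rather than bytes, which is exactly what the next hunk switches makeXmlFromPage to.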
@@ -1051,7 +1071,7 @@ def makeXmlFromPage(page):
     except KeyError as e:
         print(e)
         raise PageMissingError(page['title'], e)
-    return etree.tostring(p, pretty_print=True)
+    return etree.tostring(p, pretty_print=True, encoding='unicode')
 
 def readTitles(config={}, start=None):
     """ Read title list from a file, from the title "start" """
@@ -1291,7 +1311,7 @@ def getImageNamesAPI(config={}, session=None):
                   'aiprop': 'url|user',
                   'aifrom': aifrom,
                   'format': 'json',
-                  'ailimit': 500}
+                  'ailimit': 50}
         # FIXME Handle HTTP Errors HERE
         r = session.get(url=config['api'], params=params, timeout=30)
         handleStatusCode(r)
@@ -1345,7 +1365,7 @@ def getImageNamesAPI(config={}, session=None):
             'action': 'query',
             'generator': 'allpages',
             'gapnamespace': 6,
-            'gaplimit': 500,
+            'gaplimit': 50,
             'gapfrom': gapfrom,
             'prop': 'imageinfo',
             'iiprop': 'user|url',
@@ -1737,23 +1757,13 @@ def getParameters(params=[]):
     index2 = None
 
     if api:
-        retry = 0
-        maxretries = args.retries
-        retrydelay = 20
-        check = None
-        while retry < maxretries:
-            try:
-                check = checkAPI(api=api, session=session)
-                break
-            except requests.exceptions.ConnectionError as e:
-                print 'Connection error: %s'%(str(e))
-                retry += 1
-                print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
-                time.sleep(retrydelay)
+        check, checkedapi = checkRetryAPI(api, args.retries, args.xmlrevisions, session)
 
     if api and check:
         # Replace the index URL we got from the API check
         index2 = check[1]
-        api = check[2]
-        print 'API is OK: ' + api
+        api = checkedapi
+        print 'API is OK: ' + checkedapi
     else:
         if index and not args.wiki:
             print 'API not available. Trying with index.php only.'
@@ -1867,6 +1877,42 @@ def getParameters(params=[]):
 
     return config, other
 
+def checkRetryAPI(api=None, retries=5, apiclient=False, session=None):
+    """ Call checkAPI and mwclient if necessary """
+    retry = 0
+    retrydelay = 20
+    check = None
+    while retry < retries:
+        try:
+            check = checkAPI(api, session=session)
+            break
+        except requests.exceptions.ConnectionError as e:
+            print 'Connection error: %s'%(str(e))
+            retry += 1
+            print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
+            time.sleep(retrydelay)
+
+    if check and apiclient:
+        apiurl = urlparse(api)
+        try:
+            site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme)
+        except KeyError:
+            # Probably KeyError: 'query'
+            if apiurl.scheme == "https":
+                newscheme = "http"
+                api = api.replace("https://", "http://")
+            else:
+                newscheme = "https"
+                api = api.replace("http://", "https://")
+            print("WARNING: The provided API URL did not work with mwclient. Switched protocol to: {}".format(newscheme))
+
+            try:
+                site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=newscheme)
+            except KeyError:
+                check = False
+
+    return check, api
+
 def checkAPI(api=None, session=None):
     """ Checking API availability """
     global cj
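checkRetryAPI folds the retry loop that previously lived inline in getParameters into a reusable helper. The core pattern, generalized (check is any callable that raises requests.exceptions.ConnectionError on failure; the names here are illustrative, not from the script):

    import time
    import requests

    def retry_call(check, retries=5, retrydelay=20):
        for attempt in range(1, retries + 1):
            try:
                return check()
            except requests.exceptions.ConnectionError as e:
                print('Connection error: %s' % str(e))
                if attempt < retries:
                    print("Start retry attempt %d in %d seconds." % (attempt + 1, retrydelay))
                    time.sleep(retrydelay)
        return None    # all attempts failed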

@@ -1,4 +1,5 @@
 argparse>=1.2.1
-requests>=2.3.0
 internetarchive
 kitchen
+mwclient
+requests>=2.3.0
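Assuming this last hunk is the project's requirements file, the new mwclient dependency (used by getXMLRevisions and checkRetryAPI above) is installed the usual way:

    pip install -r requirements.txt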
