First attempt of API-only export

6 years ago · a8cbb357ff
parent 142b48cc69
commit a8cbb357ff
1 changed files with 98 additions and 39 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@ -39,11 +39,16 @@ except ImportError:             # Python 2.4 compatibility
    from md5 import new as md5
 import os
 import re
+import subprocess
 try:
    import requests
 except ImportError:
    print "Please install or update the Requests module."
    sys.exit(1)
+try:
+    import wikitools
+except ImportError:
+    print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
 import time
 import urllib
 UTF8Writer = getwriter('utf8')
@ -514,7 +519,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
        if c > 0 and c < maxretries:
            wait = increment * c < maxseconds and increment * \
                c or maxseconds  # incremental until maxseconds
-            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['pages'], wait)
+            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
            time.sleep(wait)
            # reducing server load requesting smallest chunks (if curonly then
            # limit = 1 from mother function)
@ -677,10 +682,9 @@ def cleanXML(xml=''):


 def generateXMLDump(config={}, titles=[], start=None, session=None):
-    """ Generates a XML dump for a list of titles """
+    """ Generates a XML dump for a list of titles or from revision IDs """
    # TODO: titles is now unused.

-    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
    header, config = getXMLHeader(config=config, session=session)
    footer = '</mediawiki>\n'  # new line at the end
    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
@ -688,48 +692,100 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
                                    config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True
-    if start:
-        print "Removing the last chunk of past XML dump: it is probably incomplete."
-        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
-            pass
-    else:
-        # requested complete xml dump
-        lock = False
+
+    if config['xmlrevisions']:
+        print 'Retrieving the XML for every page from the beginning'
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header.encode('utf-8'))
-        xmlfile.close()
-
-    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
-    c = 1
-    for title in readTitles(config, start):
-        if not title.strip():
-            continue
-        if title == start:  # start downloading from start, included
-            lock = False
-        if lock:
-            continue
-        delay(config=config, session=session)
-        if c % 10 == 0:
-            print 'Downloaded %d pages' % (c)
        try:
-            for xml in getXMLPage(config=config, title=title, session=session):
+            for xml in getXMLRevisions(config=config, session=session):
                xml = cleanXML(xml=xml)
                xmlfile.write(xml.encode('utf-8'))
-        except PageMissingError:
-            logerror(
-                config=config,
-                text=u'The page "%s" was missing in the wiki (probably deleted)' %
-                (title.decode('utf-8'))
-            )
-        # here, XML is a correct <page> </page> chunk or
-        # an empty string due to a deleted page (logged in errors log) or
-        # an empty string due to an error while retrieving the page from server
-        # (logged in errors log)
-        c += 1
+        except AttributeError:
+            print "This wikitools module version is not working"
+            sys.exit()
+    else:
+        print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
+        if start:
+            print "Removing the last chunk of past XML dump: it is probably incomplete."
+            for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
+                pass
+        else:
+            # requested complete xml dump
+            lock = False
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+            xmlfile.write(header.encode('utf-8'))
+            xmlfile.close()
+
+        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
+        c = 1
+        for title in readTitles(config, start):
+            if not title.strip():
+                continue
+            if title == start:  # start downloading from start, included
+                lock = False
+            if lock:
+                continue
+            delay(config=config, session=session)
+            if c % 10 == 0:
+                print 'Downloaded %d pages' % (c)
+            try:
+                for xml in getXMLPage(config=config, title=title, session=session):
+                    xml = cleanXML(xml=xml)
+                    xmlfile.write(xml.encode('utf-8'))
+            except PageMissingError:
+                logerror(
+                    config=config,
+                    text=u'The page "%s" was missing in the wiki (probably deleted)' %
+                    (title.decode('utf-8'))
+                )
+            # here, XML is a correct <page> </page> chunk or
+            # an empty string due to a deleted page (logged in errors log) or
+            # an empty string due to an error while retrieving the page from server
+            # (logged in errors log)
+            c += 1
+
    xmlfile.write(footer)
    xmlfile.close()
    print 'XML dump saved at...', xmlfilename

+def getXMLRevisions(config={}, session=None):
+    site = wikitools.wiki.Wiki(config['api'])
+    if config['namespaces']:
+        namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
+    else:
+        namespaces = ['*']
+
+    for namespace in namespaces:
+        params = {
+            'action': 'query',
+            'generator': 'allrevisions',
+            'garvnamespace': namespace,
+            'garvlimit': 50,
+            'garvprop': 'ids',
+            'export': 1 # Just to make sure the parameter is passed. Empty is fine too.
+            }
+        request = wikitools.api.APIRequest(site, params)
+        results = request.queryGen()
+        try:
+            for result in results:
+                yield result['query']['export']['*']
+        except wikitools.api.APIError:
+            # Falling back to allpages generator, the wiki is too old
+            params = {
+                'action': 'query',
+                'generator': 'allpages',
+                'gaplimit': 50,
+                'export': 1 # Just to make sure the parameter is passed. Empty is fine too.
+            }
+            # allpages does not accept "*"
+            if namespace is not '*':
+                params['gapnamespace'] = namespace
+            request = wikitools.api.APIRequest(site, params)
+            results = request.queryGen()
+            for result in results:
+                yield result['query']['export']['*']
+
 def readTitles(config={}, start=None):
    """ Read title list from a file, from the title "start" """

@ -1303,7 +1359,9 @@ def getParameters(params=[]):
        action='store_true',
        help="generates a full history XML dump (--xml --curonly for current revisions only)")
    groupDownload.add_argument('--curonly', action='store_true',
-                               help='store only the current version of pages')
+        help='store only the current version of pages; incompatible with --xmlrevisions')
+    groupDownload.add_argument('--xmlrevisions', action='store_true',
+                               help='download all revisions from an API generator')
    groupDownload.add_argument(
        '--images', action='store_true', help="generates an image dump")
    groupDownload.add_argument(
@ -1502,6 +1560,7 @@ def getParameters(params=[]):
        'images': args.images,
        'logs': False,
        'xml': args.xml,
+        'xmlrevisions': args.xmlrevisions,
        'namespaces': namespaces,
        'exnamespaces': exnamespaces,
        'path': args.path and os.path.normpath(args.path) or '',
@ -1547,7 +1606,7 @@ def checkAPI(api=None, session=None):
    try:
        result = getJSON(r)
        index = None
-        if result['query']:
+        if result:
            try:
                index = result['query']['general']['server'] + \
                    result['query']['general']['script']
@ -1886,7 +1945,7 @@ def saveSiteInfo(config={}, session=None):
                        'action': 'query',
                        'meta': 'siteinfo',
                        'siprop': 'general|namespaces',
-                        'format': 'json'}
+                        'format': 'json'},
                    timeout=10)
            result = getJSON(r)
            delay(config=config, session=session)