pull/429/merge
Gernot Zacharias committed via GitHub
commit e80f4a41b3

@@ -23,10 +23,21 @@ try:
    from kitchen.text.converters import getwriter, to_unicode
except ImportError:
    print "Please install the kitchen module."
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import xml.dom.minidom as MD
import cookielib
import cPickle
import datetime
import sys
import io
import traceback
try:
    import argparse
except ImportError:
@@ -63,7 +74,7 @@ except ImportError:
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
__VERSION__ = '0.4.1-alpha' # major, minor, micro: semver.org
class PageMissingError(Exception):
def __init__(self, title, xml):
@@ -164,7 +175,7 @@ def getNamespacesScraper(config={}, session=None):
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
r = session.post(
url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
url=config['index'], params={'title': 'Special:Allpages'}, timeout=60)
raw = r.text
delay(config=config, session=session)
@@ -206,7 +217,7 @@ def getNamespacesAPI(config={}, session=None):
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'},
timeout=30
timeout=60
)
result = getJSON(r)
delay(config=config, session=session)
@@ -281,7 +292,7 @@ def getPageTitlesScraper(config={}, session=None):
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (
config['index'], namespace)
r = session.get(url=url, timeout=30)
r = session.get(url=url, timeout=60)
raw = r.text
raw = cleanHTML(raw)
@@ -455,7 +466,7 @@ def getXMLHeader(config={}, session=None):
else:
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
xml = "".join([x for x in selectXMLQueryMode(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
@@ -473,11 +484,11 @@
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'},
timeout=120
timeout=60
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
xml = "".join([x for x in selectXMLQueryMode(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
@@ -500,7 +511,7 @@ def getXMLHeader(config={}, session=None):
def getXMLFileDesc(config={}, title='', session=None):
""" Get XML for image description page """
config['curonly'] = 1 # tricky to get only the most recent desc
return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
return("".join([x for x in selectXMLQueryMode( config=config, title=title, verbose=False, session=session)]))
def getUserAgent():
@@ -521,7 +532,216 @@ def logerror(config={}, text=''):
output = u'%s: %s\n' % (
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
outfile.write(output.encode('utf-8'))
def reconstructRevisions(root=None):
    #print ET.tostring(rev)
    page = ET.Element('stub')
    edits = 0
    for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'):
        try:
            rev_ = ET.SubElement(page, 'revision')
            ET.SubElement(rev_, 'id').text = rev.attrib['revid']
            ET.SubElement(rev_, 'timestamp').text = rev.attrib['timestamp']
            contributor = ET.SubElement(rev_, 'contributor')
            if not rev.attrib.has_key('userhidden'):
                ET.SubElement(contributor, 'username').text = rev.attrib['user']
                ET.SubElement(contributor, 'id').text = rev.attrib['userid']
            else:
                contributor.set('deleted', 'deleted')
            comment = ET.SubElement(rev_, 'comment')
            if not rev.attrib.has_key('commenthidden'):
                comment.text = rev.attrib['comment']
            else:
                comment.set('deleted', 'deleted')
            # some revisions do not return model and format, so just hard-code them
            ET.SubElement(rev_, 'model').text = 'wikitext'
            ET.SubElement(rev_, 'format').text = 'text/x-wiki'
            text = ET.SubElement(rev_, 'text')
            if not rev.attrib.has_key('texthidden'):
                text.attrib['xml:space'] = "preserve"
                text.attrib['bytes'] = rev.attrib['size']
                text.text = rev.text
            else:
                text.set('deleted', 'deleted')
            # delete sha1 here :)
            #sha1 = ET.SubElement(rev_, 'sha1')
            #if not rev.attrib.has_key('sha1missing'):
            #    sha1.text = rev.attrib['sha1']
            if rev.attrib.has_key('minor'):
                ET.SubElement(rev_, 'minor')
            edits += 1
        except Exception as e:
            #logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev)))
            print ET.tostring(rev)
            traceback.print_exc()
            page = None
            edits = 0
            raise e
    return page, edits
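For reference, here is a minimal, self-contained sketch of the mapping reconstructRevisions performs: one invented prop=revisions <rev> element is fed through the same attribute-to-element translation. The page title, user and values are made up; the attribute names are the ones documented for the MediaWiki prop=revisions API.

import xml.etree.ElementTree as ET

sample = ET.fromstring(
    '<api><query><pages><page pageid="1" ns="0" title="Example">'
    '<revisions><rev revid="42" timestamp="2020-01-01T00:00:00Z" user="Alice" '
    'userid="7" comment="demo edit" size="11">Hello world</rev>'
    '</revisions></page></pages></query></api>')
rev = sample.find('query').find('pages').find('page').find('revisions').find('rev')

revision = ET.Element('revision')
ET.SubElement(revision, 'id').text = rev.attrib['revid']
ET.SubElement(revision, 'timestamp').text = rev.attrib['timestamp']
contributor = ET.SubElement(revision, 'contributor')
ET.SubElement(contributor, 'username').text = rev.attrib['user']
ET.SubElement(contributor, 'id').text = rev.attrib['userid']
text = ET.SubElement(revision, 'text')
text.set('bytes', rev.attrib['size'])
text.text = rev.text
print(ET.tostring(revision))  # export-style <revision>...</revision>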
def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
    """ Fetch one api.php response, retrying with a smaller rvlimit on failure """
    # just send the API request
    # if it fails, it will reduce params['rvlimit']
    xml = ''
    c = 0
    maxseconds = 100  # max seconds to wait in a single sleeping
    maxretries = config['retries']  # x retries and skip
    increment = 20  # increment every retry
    while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
        if c > 0 and c < maxretries:
            wait = increment * c < maxseconds and increment * \
                c or maxseconds  # incremental until maxseconds
            print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' % (c, params['titles' if config['apiquery'] else 'pages'], wait)
            time.sleep(wait)
            # reducing server load requesting smallest chunks (if curonly then
            # rvlimit = 1 from mother function)
            if 'rvlimit' in params and params['rvlimit'] > 1:
                params['rvlimit'] = params['rvlimit'] / 2  # half
        if c >= maxretries:
            print ' We have retried %d times' % (c)
            print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiquery'] else 'pages'])
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last,
            # params['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # getXMLPageCore
            # TODO: save only the last version when failed
            print ' Saving in the errors log, and skipping...'
            logerror(
                config=config,
                text=u'Error while retrieving the last revision of "%s". Skipping.' %
                (params['titles' if config['apiquery'] else 'pages']).decode('utf-8'))
            #raise ExportAbortedError(config['index'])
            return ''  # empty xml
        # FIXME HANDLE HTTP Errors HERE
        try:
            r = session.get(url=config['api'], params=params, headers=headers)
            handleStatusCode(r)
            xml = fixBOM(r)
            #print xml
        except requests.exceptions.ConnectionError as e:
            print ' Connection error: %s' % (str(e[0]))
            xml = ''
        c += 1
    return xml
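As a standalone illustration of the request this helper sends in full-history mode, the same query can be reproduced with requests alone; the endpoint URL below is only a placeholder, not something taken from this script.

import requests

params = {
    'action': 'query', 'format': 'xml', 'prop': 'revisions',
    'titles': 'Main Page', 'rvlimit': 2,
    'rvprop': 'timestamp|user|comment|content|ids|userid|size',
}
r = requests.get('https://en.wikipedia.org/w/api.php', params=params, timeout=60)
print(r.status_code)  # 200 on success; the body then ends with </api>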
def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
    """ Get the full history (or current only) of a page using API:Query;
        if config['curonly'] is set, use export&exportnowrap to export
    """
    title_ = title
    title_ = re.sub(' ', '_', title_)
    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
    # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
    # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
    #print 'current:%s' % (title_)
    if not config['curonly']:
        params = {'titles': title_, 'action': 'query', 'format': 'xml',
                  'prop': 'revisions',
                  'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
                  'rvcontinue': None,
                  'rvlimit': 10  # TODO: set this by commandline
                  }
    else:
        params = {'titles': title_, 'action': 'query', 'format': 'xml', 'export': 1, 'exportnowrap': 1}
    #print 'params:%s' % (params)
    if not config['curonly']:
        firstpartok = False
        lastcontinue = None
        numberofedits = 0
        ret = ''
        while True:
            # save the previous continuation token, so we can resume from it if the next request fails
            if not firstpartok:
                try:
                    lastcontinue = params['rvcontinue']
                except:
                    lastcontinue = None
            xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
            if xml == "":
                # just return so that we can continue; getXMLPageCoreWithApi has already logged the error
                return
            try:
                root = ET.fromstring(xml.encode('utf-8'))
            except:
                continue
            try:
                retpage = root.find('query').find('pages').find('page')
            except:
                continue
            if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
                print 'Page not found'
                raise PageMissingError(params['titles'], xml)
            if not firstpartok:
                try:
                    # build the <page> header ourselves to reduce memory usage
                    ret = '  <page>\n'
                    ret += '    <title>%s</title>\n' % (retpage.attrib['title'])
                    ret += '    <ns>%s</ns>\n' % (retpage.attrib['ns'])
                    ret += '    <id>%s</id>\n' % (retpage.attrib['pageid'])
                except:
                    firstpartok = False
                    continue
                else:
                    firstpartok = True
                    yield ret
            try:
                ret = ''
                edits = 0
                if config['curonly'] or root.find('continue') is None:
                    # transform the revisions
                    rev_, edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
                    # convert it into text in case it throws MemoryError
                    # drop the first three lines and the last two, which only exist to set the indent
                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
                    yield ret
                    numberofedits += edits
                    break
                else:
                    rev_, edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
                    params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
                    numberofedits += edits
                    yield ret
            except:
                traceback.print_exc()
                params['rvcontinue'] = lastcontinue
                ret = ''
        yield '  </page>\n'
    else:
        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
        if xml == "":
            raise ExportAbortedError(config['index'])
        if not "</page>" in xml:
            raise PageMissingError(params['titles'], xml)
        else:
            # strip these sha1s sums which keep showing up in the export and
            # which are invalid for the XML schema (they only apply to
            # revisions)
            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
        yield xml.split("</page>")[0]
        # just for looking good :)
        r_timestamp = r'<timestamp>([^<]+)</timestamp>'
        numberofedits = 0
        numberofedits += len(re.findall(r_timestamp, xml))
        yield "</page>\n"
    if verbose:
        if numberofedits == 1:
            print ' %s, 1 edit' % (title.strip())
        else:
            print ' %s, %d edits' % (title.strip(), numberofedits)
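The slicing of the toprettyxml() output above is the least obvious step, so here is an isolated demonstration of what it keeps; all names are local to this example.

import xml.dom.minidom as MD
import xml.etree.ElementTree as ET

stub = ET.Element('stub')  # same shape as the element reconstructRevisions returns
ET.SubElement(ET.SubElement(stub, 'revision'), 'id').text = '42'
pretty = MD.parseString(b'<stub1>' + ET.tostring(stub) + b'</stub1>').toprettyxml(indent='  ')
# Dropping the first three lines (XML declaration, <stub1>, <stub>) and the last
# two (</stub>, </stub1>) leaves only the indented <revision> block that is yielded.
print(''.join(pretty.splitlines(True)[3:-2]))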
def getXMLPageCore(headers={}, params={}, config={}, session=None):
""" """
@@ -587,7 +807,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
xml = ''
except requests.exceptions.ReadTimeout as e:
print ' Read timeout: %s'%(str(e[0]))
xml = ''
xml = ''
c += 1
return xml
@@ -694,7 +914,14 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
            print ' %s, 1 edit' % (title.strip())
        else:
            print ' %s, %d edits' % (title.strip(), numberofedits)
def selectXMLQueryMode(config={}, title='', verbose=True, session=None):
    if config['apiquery']:
        # use api.php?action=query instead of relying on Special:Export
        return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
    else:
        # use the traditional Special:Export method (default)
        return getXMLPage(config=config, title=title, verbose=verbose, session=session)
    return ''
def makeXmlPageFromRaw(xml):
""" Discard the metadata around a <page> element in <mediawiki> string"""
@@ -775,7 +1002,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
for xml in selectXMLQueryMode(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
@@ -902,7 +1129,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# repeated header is confusing and would not even be valid
xml = exportrequest['query']['export']['*']
yield makeXmlPageFromRaw(xml)
if 'continue' in arvrequest:
# Get the new ones
arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
@@ -924,7 +1151,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
else:
# End of continuation. We are done with this namespace.
break
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
@@ -1156,7 +1383,7 @@ def reverse_readline(filename, buf_size=8192, truncate=False):
if segment is not None:
# if the previous chunk starts right from the beginning of line
# do not concat the segment to the last line of new chunk
# instead, yield the segment first
# instead, yield the segment first
if buffer[-1] != '\n':
lines[-1] += segment
else:
@@ -1684,6 +1911,7 @@ def getParameters(params=[]):
action='store_true',
help='resumes previous incomplete dump (requires --path)')
parser.add_argument('--force', action='store_true', help='')
parser.add_argument('--ignore-api-check', action='store_true', help='do not abort if the API check fails')
parser.add_argument(
'--user', help='Username if authentication is required.')
parser.add_argument(
@@ -1715,6 +1943,10 @@ def getParameters(params=[]):
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages')
groupDownload.add_argument(
'--apiquery',
action='store_true',
help="EXPERIMENTAL: Using api.php?query instead of Special:Export to export pages, works with: --curonly,--xmlrevisions,--images")
groupDownload.add_argument('--xmlrevisions', action='store_true',
help='download all revisions from an API generator. MediaWiki 1.27+ only.')
groupDownload.add_argument(
@@ -1764,6 +1996,10 @@ def getParameters(params=[]):
print getWikiEngine(url=args.wiki)
sys.exit()
if args.apiquery and not (args.curonly or args.xmlrevisions or args.images):
print('ERROR: --apiquery requires either --curonly or --images or --xmlrevisions')
sys.exit()
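With this check in place, a typical experimental invocation combines the new switch with one of the allowed modes, for example: python dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --curonly --apiquery (the wiki URL here is a placeholder).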
# Create session
cj = cookielib.MozillaCookieJar()
if args.cookies:
@@ -1828,6 +2064,8 @@ def getParameters(params=[]):
index2 = check[1]
api = checkedapi
print 'API is OK: ' + checkedapi
elif args.ignore_api_check:
print 'Error in API. Ignoring.'
else:
if index and not args.wiki:
print 'API not available. Trying with index.php only.'
@@ -1925,6 +2163,7 @@ def getParameters(params=[]):
'cookies': args.cookies or '',
'delay': args.delay,
'retries': int(args.retries),
'apiquery': args.apiquery,
}
other = {
@@ -2385,7 +2624,7 @@ def getWikiEngine(url=''):
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url, timeout=30)
if r.status_code == 405 or r.text == '':
r = session.get(url=url, timeout=120)
r = session.get(url=url, timeout=60)
result = r.text
wikiengine = 'Unknown'
@@ -2468,7 +2707,7 @@ def mwGetAPIAndIndex(url=''):
index = ''
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url, timeout=120)
r = session.post(url=url, timeout=60)
result = r.text
# API
