Merge pull request #216 from makoshark/master

Issue #8: avoid MemoryError fatal on big histories, remove sha1 for Wikia
pull/217/head^2
PiRSquared17 9 years ago
commit ac72938d40

@ -45,6 +45,8 @@ import urllib
__VERSION__ = '0.3.0-alpha' # major, minor, micro: semver.org
class PageMissingError(Exception):
    """Raised when Special:Export returns no XML for a requested page.

    Raised by getXMLPage() when the export comes back empty; callers treat
    it as "page deleted / export broken" (see the handlers that print
    'XML export on this wiki is broken' and log 'was missing in the wiki').
    """
    pass
def getVersion():
    """Report the dumpgenerator version string (major.minor.micro, semver.org)."""
    return __VERSION__
@ -387,10 +389,10 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
xml = getXMLPage(
config=config, title=randomtitle, verbose=False, session=session)
header = xml.split('</mediawiki>')[0]
if not xml:
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
header = xml.split('</mediawiki>')[0]
except PageMissingError:
print 'XML export on this wiki is broken, quitting.'
sys.exit()
return header
@ -399,12 +401,7 @@ def getXMLHeader(config={}, session=None):
def getXMLFileDesc(config={}, title='', session=None):
    """ Get XML for image description page.

    Joins the chunks yielded by the getXMLPage() generator for *title*
    into a single string, with config['curonly'] forced to 1 so only the
    most recent revision of the description page is exported.

    NOTE(review): this mutates the caller-supplied (or shared default)
    config dict by setting 'curonly'; pass a copy if the previous value
    matters. Kept as-is to preserve the existing interface.
    """
    config['curonly'] = 1  # tricky to get only the most recent desc
    # getXMLPage() is a generator (yields the page XML in chunks to avoid
    # MemoryError on huge histories); collapse it back to one string here.
    return("".join([x for x in getXMLPage(
        config=config, title=title, verbose=False, session=session)]))
def getUserAgent():
@ -510,10 +507,24 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
params['templates'] = 1
xml = getXMLPageCore(params=params, config=config, session=session)
if not xml:
raise PageMissingError
else:
# strip these sha1s sums which keep showing up in the export and
# which are invalid for the XML schema (they only apply to
# revisions)
xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
yield xml.split("</page>")[0]
# if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
# else, warning about Special:Export truncating large page histories
r_timestamp = r'<timestamp>([^<]+)</timestamp>'
numberofedits = 0
numberofedits += len(re.findall(r_timestamp, xml))
# search for timestamps in xml to avoid analysing empty pages like
# Special:Allpages and the random one
if not config['curonly'] and re.search(r_timestamp, xml):
@ -546,26 +557,27 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
"""
# offset is OK in this wiki, merge with the previous chunk
# of this page history and continue
xml = xml.split(
'</page>')[0] + ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
xml2 = xml2.split("</page>")[0]
yield ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
xml = xml2
numberofedits += len(re.findall(r_timestamp, xml))
else:
params['offset'] = '' # no more edits in this page history
yield "</page>\n"
if verbose:
numberofedits = len(re.findall(r_timestamp, xml))
if (numberofedits == 1):
print ' %s, 1 edit' % (title.encode('utf-8'))
print ' %s, 1 edit' % (title.encode('utf-8'))
else:
print ' %s, %d edits' % (title.encode('utf-8'), numberofedits)
return xml
print ' %s, %d edits' % (title.encode('utf-8'), numberofedits)
def cleanXML(xml=''):
    """ Trim redundant info from a Special:Export XML chunk.

    Drops everything up to and including the first '</siteinfo>\n' (the
    export header) and everything from '</mediawiki>' onward (the export
    footer), leaving only the page content. Each marker is tested
    independently so header-only or footer-only chunks are also trimmed.
    Returns the (possibly unchanged) string.
    """
    # do not touch XML codification, leave AS IS
    if re.search(r'</siteinfo>\n', xml):
        xml = xml.split('</siteinfo>\n')[1]
    if re.search(r'</mediawiki>', xml):
        xml = xml.split('</mediawiki>')[0]
    return xml
@ -627,9 +639,11 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
xml = getXMLPage(config=config, title=title, session=session)
xml = cleanXML(xml=xml)
if not xml:
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
@ -639,7 +653,6 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
xmlfile.write(xml.encode('utf-8'))
c += 1
xmlfile.write(footer)
xmlfile.close()

Loading…
Cancel
Save