Actually allow resuming huge or broken XML dumps

* Log "XML export on this wiki is broken, quitting." to the error
  file, so that grepping the logs reveals which dumps were
  interrupted for this reason.
* Automatically reduce the export size for a page when downloading
  its entire history at once results in a MemoryError.
* Truncate the file with Pythonic methods (.seek() and .truncate())
  while reading it from the end, by making reverse_readline() a
  hybrid generator rather than an actual coroutine (sketched below).
pull/228/head
Federico Leva 9 years ago
parent 9168a66a54
commit d4fd745498
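For orientation, a minimal standalone sketch of the third bullet, independent of dumpgenerator.py: scan the dump backwards in fixed-size chunks and cut the file right after the last complete </page>. The function name, chunk size, and the (ignored) handling of tags straddling a chunk boundary are illustrative assumptions, not code from this commit.

import os

def truncate_incomplete_page(filename, buf_size=8192):
    # Scan backwards from EOF; a '</page>' straddling a chunk
    # boundary is ignored here for brevity.
    with open(filename, 'rb+') as fh:
        fh.seek(0, os.SEEK_END)
        size = fh.tell()
        offset = 0
        while offset < size:
            offset = min(size, offset + buf_size)
            fh.seek(size - offset)
            buffer = fh.read(min(buf_size, offset))
            pos = buffer.rfind(b'</page>')
            if pos != -1:
                # Truncate just after the last complete closing tag.
                fh.seek(size - offset + pos + len(b'</page>'))
                fh.truncate()  # note the call: a bare .truncate is a no-op
                return True
        return False

In the commit itself this logic lives inside reverse_readline(), so a single backward pass can both report lines and truncate.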

@@ -444,6 +444,7 @@ def getXMLHeader(config={}, session=None):
     header = xml.split('</mediawiki>')[0]
     if not re.match("<mediawiki", xml):
         print 'XML export on this wiki is broken, quitting.'
+        logerror(u'XML export on this wiki is broken, quitting.')
         sys.exit()
     return header, config
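The fixed message doubles as a grep target: after a batch of dumps, something like grep -l 'XML export on this wiki is broken' */errors.log lists the wikis that were cut short. A hedged sketch of the append-only logging idea; the real logerror() lives elsewhere in dumpgenerator.py and its signature and log file name may differ.

import datetime

def logerror(text=u'', path='.'):  # hypothetical signature for this sketch
    # Append one timestamped line per failure; keeping the message
    # text constant makes the file trivially searchable.
    with open('%s/errors.log' % path, 'a') as outfile:
        outfile.write('%s: %s\n' % (datetime.datetime.now(), text))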
@@ -615,7 +616,12 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
                     # offset is OK in this wiki, merge with the previous chunk
                     # of this page history and continue
                     xml2 = xml2.split("</page>")[0]
-                    yield ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                    try:
+                        yield ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                    except MemoryError:
+                        print "The page's history exceeds our memory, halving limit."
+                        params['limit'] = params['limit'] / 2
+                        continue
                     xml = xml2
                     numberofedits += len(re.findall(r_timestamp, xml))
                 else:
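A standalone restatement of the recovery strategy above, assuming a hypothetical fetch(offset, limit) stand-in for the export request: on MemoryError, halve the batch size and retry the same window instead of aborting the page.

def fetch_all(fetch, limit=1000):
    # fetch(offset, limit) is a hypothetical stand-in for the API call;
    # it returns a list of revisions, empty once the history is exhausted.
    results = []
    offset = 0
    while True:
        try:
            batch = fetch(offset, limit)
        except MemoryError:
            limit = max(1, limit // 2)  # halve and retry the same window
            continue
        if not batch:
            return results
        results.extend(batch)
        offset += len(batch)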
@@ -652,21 +658,9 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     xmlfile = ''
     lock = True
     if start:
-        print "Removing the last chunk of past XML dump: it is probably incomplete"
-        xmlfile = reverse_readline('%s/%s' % (config['path'], xmlfilename))
-        c = 0
-        for l in xmlfile:
-            c += 1
-            if re.search(r'<title>%s</title>' % (start), l):
-                # Done searching. We try to truncate the file at this point:
-                # everything should be removed from the line before <title>,
-                # that is the last c+1 lines AKA lines from EOF - c to EOF.
-                # TODO: do something for users without GNU ed; replace os.
-                # Try file.seek and file.truncate in the generator again?
-                os.system("(echo '$-%d,$d'; echo wq ) | ed %s/%s" \
-                    % (c, config['path'], xmlfilename) )
-                print "Last %d lines removed." % (c+1)
-                break
+        print "Removing the last chunk of past XML dump: it is probably incomplete."
+        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
+            pass
    else:
        # requested complete xml dump
        lock = False
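The for/pass loop drains the generator purely for its truncation side effect. An equivalent consume-only idiom, for reference ('path' is illustrative):

import collections
# Exhaust an iterator while storing nothing.
collections.deque(reverse_readline(path, truncate=True), maxlen=0)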
@@ -728,11 +722,11 @@ def readTitles(config={}, start=None):
         else:
             yield line.strip()
 
-def reverse_readline(filename, buf_size=8192):
+def reverse_readline(filename, buf_size=8192, truncate=False):
     """a generator that returns the lines of a file in reverse order"""
     # Original code by srohde, abdus_salam: cc by-sa 3.0
     # http://stackoverflow.com/a/23646049/718903
-    with open(filename) as fh:
+    with open(filename, 'r+') as fh:
         segment = None
         offset = 0
         fh.seek(0, os.SEEK_END)
@@ -753,10 +747,22 @@ def reverse_readline(filename, buf_size=8192):
                 if buffer[-1] is not '\n':
                     lines[-1] += segment
                 else:
-                    yield segment
+                    if truncate and '</page>' in segment:
+                        pages = buffer.split('</page>')
+                        fh.seek(-offset+buf_size-len(pages[-1]), os.SEEK_END)
+                        fh.truncate()
+                        raise StopIteration
+                    else:
+                        yield segment
             segment = lines[0]
             for index in range(len(lines) - 1, 0, -1):
-                yield lines[index]
+                if truncate and '</page>' in segment:
+                    pages = buffer.split('</page>')
+                    fh.seek(-offset-len(pages[-1]), os.SEEK_END)
+                    fh.truncate()
+                    raise StopIteration
+                else:
+                    yield lines[index]
         yield segment
 
 def saveImageNames(config={}, images=[], session=None):
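The seek() calls above compute the cut point relative to EOF: the buffer was read starting at (EOF - offset), so the byte just past the last </page> is reached by subtracting the length of the leftover tail, pages[-1]. A tiny self-contained demo of the end-relative seek-then-truncate primitive (the file name and contents are made up):

import os

with open('demo.xml', 'wb') as fh:
    fh.write(b'<page>keep me</page><page>half-writ')

with open('demo.xml', 'rb+') as fh:
    fh.seek(0, os.SEEK_END)
    size = fh.tell()
    tail = len(b'<page>half-writ')  # bytes after the last complete </page>
    fh.seek(size - tail)            # equivalently fh.seek(-tail, os.SEEK_END)
    fh.truncate()                   # file now ends with '</page>'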
