@ -257,19 +257,19 @@ def getPageTitlesAPI(config={}, session=None):
# Hack for old versions of MediaWiki API where result is dict
# Hack for old versions of MediaWiki API where result is dict
if isinstance ( allpages , dict ) :
if isinstance ( allpages , dict ) :
allpages = allpages . values ( )
allpages = allpages . values ( )
titles + = [ page [ ' title ' ]
for page in allpages :
for page in allpages ]
yield page [ ' title ' ]
c + = len ( allpages )
if len ( titles ) != len ( set ( titles ) ) :
if len ( titles ) != len ( set ( titles ) ) :
# probably we are in a loop, server returning dupe titles, stop
# probably we are in a loop, server returning dupe titles, stop
# it
# it
print ' Probably a loop, finishing '
print ' Probably a loop, finishing '
titles = list ( set ( titles ) )
titles = list ( set ( titles ) )
apfrom = ' '
apfrom = ' '
c + = len ( allpages )
delay ( config = config , session = session )
delay ( config = config , session = session )
print ' %d titles retrieved in the namespace %d ' % ( c , namespace )
print ' %d titles retrieved in the namespace %d ' % ( c , namespace )
return titles
def getPageTitlesScraper ( config = { } , session = None ) :
def getPageTitlesScraper ( config = { } , session = None ) :
""" """
""" """
@ -368,13 +368,21 @@ def getPageTitles(config={}, session=None):
elif ' index ' in config and config [ ' index ' ] :
elif ' index ' in config and config [ ' index ' ] :
titles = getPageTitlesScraper ( config = config , session = session )
titles = getPageTitlesScraper ( config = config , session = session )
# removing dupes (e.g. in CZ appears Widget:AddThis two times (main
titlesfilename = ' %s - %s -titles.txt ' % (
# namespace and widget namespace))
domain2prefix ( config = config ) , config [ ' date ' ] )
titles = sorted ( set ( titles ) )
titlesfile = open ( ' %s / %s ' % ( config [ ' path ' ] , titlesfilename ) , ' a ' )
c = 0
print ' %d page titles loaded ' % ( len ( titles ) )
for title in titles :
return titles
titlesfile . write ( title . encode ( ' utf-8 ' ) + " \n " )
c + = 1
# TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
# main namespace and widget namespace.
# We can use sort -u in UNIX, but is it worth it?
titlesfile . write ( u ' --END-- \n ' )
titlesfile . close ( )
print ' Titles saved at... ' , titlesfilename
print ' %d page titles loaded ' % ( c )
def getImageNames ( config = { } , session = None ) :
def getImageNames ( config = { } , session = None ) :
""" Get list of image names """
""" Get list of image names """
@ -610,9 +618,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if verbose :
if verbose :
if ( numberofedits == 1 ) :
if ( numberofedits == 1 ) :
print ' %s , 1 edit ' % ( title . encode ( ' utf-8 ' ) )
print ' %s , 1 edit ' % ( title . encode ( ' utf-8 ' ) . strip ( ) )
else :
else :
print ' %s , %d edits ' % ( title . encode ( ' utf-8 ' ) , numberofedits )
print ' %s , %d edits ' % ( title . encode ( ' utf-8 ' ) . strip ( ) , numberofedits )
def cleanXML ( xml = ' ' ) :
def cleanXML ( xml = ' ' ) :
@ -625,8 +633,9 @@ def cleanXML(xml=''):
return xml
return xml
def generateXMLDump ( config = { } , titles = [ ] , start = ' ' , session = None ) :
def generateXMLDump ( config = { } , titles = [ ] , start = None , session = None ) :
""" Generates a XML dump for a list of titles """
""" Generates a XML dump for a list of titles """
# TODO: titles is now unused.
print ' Retrieving the XML for every page from " %s " ' % ( start and start or ' start ' )
print ' Retrieving the XML for every page from " %s " ' % ( start and start or ' start ' )
header , config = getXMLHeader ( config = config , session = session )
header , config = getXMLHeader ( config = config , session = session )
@ -637,32 +646,21 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile = ' '
xmlfile = ' '
lock = True
lock = True
if start :
if start :
# remove the last chunk of xml dump (it is probably incomplete)
print " Removing the last chunk of past XML dump: it is probably incomplete "
xmlfile = open ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) , ' r ' )
xmlfile = reverse_readline ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) )
xmlfile2 = open ( ' %s / %s 2 ' % ( config [ ' path ' ] , xmlfilename ) , ' w ' )
prev = ' '
c = 0
c = 0
for l in xmlfile :
for l in xmlfile :
# removing <page>\n until end of file
# lock to avoid write an empty line at the begining of file
if c != 0 :
if not re . search ( r ' <title> %s </title> ' % ( start ) , l ) :
xmlfile2 . write ( prev )
else :
break
c + = 1
c + = 1
prev = l
if re . search ( r ' <title> %s </title> ' % ( start ) , l ) :
xmlfile . close ( )
# Done searching. We try to truncate the file at this point:
xmlfile2 . close ( )
# everything should be removed from the line before <title>,
# subst xml with xml2
# that is the last c+1 lines AKA lines from EOF - c to EOF.
# remove previous xml dump
# TODO: do something for users without GNU ed; replace os.
os . remove ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) )
# Try file.seek and file.truncate in the generator again?
# move correctly truncated dump to its real name
os . system ( " (echo ' $- %d ,$d ' ; echo wq ) | ed %s / %s " \
os . rename (
% ( c , config [ ' path ' ] , xmlfilename ) )
' %s / %s 2 ' %
print " Last %d lines removed. " % ( c + 1 )
( config [ ' path ' ] , xmlfilename ) , ' %s / %s ' %
break
( config [ ' path ' ] , xmlfilename )
)
else :
else :
# requested complete xml dump
# requested complete xml dump
lock = False
lock = False
@ -672,7 +670,7 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile = open ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) , ' a ' )
xmlfile = open ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) , ' a ' )
c = 1
c = 1
for title in titles :
for title in readTitles( config , start ) :
if not title . strip ( ) :
if not title . strip ( ) :
continue
continue
if title == start : # start downloading from start, included
if title == start : # start downloading from start, included
@ -701,19 +699,59 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile . close ( )
xmlfile . close ( )
print ' XML dump saved at... ' , xmlfilename
print ' XML dump saved at... ' , xmlfilename
def readTitles(config={}, start=None):
    """ Read title list from a file, from the title "start" """
    # Titles were saved one per line, terminated by a --END-- marker.
    titlesfilename = '%s-%s-titles.txt' % (
        domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')

    # When a start title is given, skip everything before it (the start
    # title itself is included in the output).
    skipping = bool(start)
    with titlesfile as stream:
        for raw in stream:
            title = raw.strip()
            if title == '--END--':
                # End-of-list sentinel written by the title dumper.
                break
            if skipping:
                if title != start:
                    continue
                skipping = False
            yield title
def reverse_readline(filename, buf_size=8192):
    """A generator that returns the lines of a file in reverse order.

    The file is read in chunks of *buf_size* bytes starting from the end.
    Lines are yielded without their trailing newline; a file ending in a
    newline therefore yields an empty line first (callers rely on this and
    skip it). An empty file yields nothing.
    """
    # Original code by srohde, abdus_salam: cc by-sa 3.0
    # http://stackoverflow.com/a/23646049/718903
    # Binary mode: end-relative seeks are only portable on binary streams,
    # and under Python 2 (this script's target) bytes is str, so the
    # yielded values are unchanged.
    with open(filename, 'rb') as fh:
        segment = None
        offset = 0
        fh.seek(0, os.SEEK_END)
        total_size = remaining_size = fh.tell()
        while remaining_size > 0:
            offset = min(total_size, offset + buf_size)
            fh.seek(-offset, os.SEEK_END)
            chunk = fh.read(min(remaining_size, buf_size))
            remaining_size -= buf_size
            lines = chunk.split(b'\n')
            # The first line of the chunk is probably not a complete line,
            # so save it and glue it onto the last line of the next
            # (earlier) chunk we read.
            if segment is not None:
                # If this chunk ends exactly on a line boundary, the saved
                # segment is already a whole line: yield it on its own
                # instead of concatenating.
                # (Bugfix: the original compared with `is not`, an identity
                # test that only worked via CPython string interning.)
                if chunk[-1:] != b'\n':
                    lines[-1] += segment
                else:
                    yield segment
            segment = lines[0]
            for index in range(len(lines) - 1, 0, -1):
                yield lines[index]
        # Bugfix: guard the final yield so an empty file yields nothing
        # instead of a spurious None.
        if segment is not None:
            yield segment
def saveImageNames ( config = { } , images = [ ] , session = None ) :
def saveImageNames ( config = { } , images = [ ] , session = None ) :
""" Save image list in a file, including filename, url and uploader """
""" Save image list in a file, including filename, url and uploader """
@ -1525,12 +1563,11 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
def createNewDump ( config = { } , other = { } ) :
def createNewDump ( config = { } , other = { } ) :
titles = [ ]
images = [ ]
images = [ ]
print ' Trying generating a new dump into a new directory... '
print ' Trying generating a new dump into a new directory... '
if config [ ' xml ' ] :
if config [ ' xml ' ] :
titles + = getPageTitles( config = config , session = other [ ' session ' ] )
getPageTitles( config = config , session = other [ ' session ' ] )
saveTitles( config = config , titles = titles )
titles= readTitles ( config )
generateXMLDump ( config = config , titles = titles , session = other [ ' session ' ] )
generateXMLDump ( config = config , titles = titles , session = other [ ' session ' ] )
checkXMLIntegrity (
checkXMLIntegrity (
config = config ,
config = config ,
@ -1549,23 +1586,21 @@ def createNewDump(config={}, other={}):
def resumePreviousDump ( config = { } , other = { } ) :
def resumePreviousDump ( config = { } , other = { } ) :
titles = [ ]
images = [ ]
images = [ ]
print ' Resuming previous dump process... '
print ' Resuming previous dump process... '
if config [ ' xml ' ] :
if config [ ' xml ' ] :
# load titles
titles = readTitles ( config )
lasttitle = ' '
try :
try :
f = open ( ' %s / %s - %s -titles.txt ' % ( config [ ' path ' ] , domain2prefix (
lasttitles = reverse_readline ( ' %s / %s - %s -titles.txt ' %
config = config , session = other [ ' session ' ] ) , config [ ' date ' ] ) , ' r ' )
( config [ ' path ' ] ,
raw = unicode ( f . read ( ) , ' utf-8 ' )
domain2prefix ( config = config , session = other [ ' session ' ] ) ,
titles = raw . split ( ' \n ' )
config [ ' date ' ] )
lasttitle = titles [ - 1 ]
)
if not lasttitle : # empty line at EOF ?
lasttitle = lasttitles . next ( )
lasttitle = titles [ - 2 ]
if lasttitle == ' ' :
f . close ( )
lasttitle = lasttitles . next ( )
except :
except :
pass # probably file does not exists
pass # probably file does not exists
if lasttitle == ' --END-- ' :
if lasttitle == ' --END-- ' :
# titles list is complete
# titles list is complete
print ' Title list was completed in the previous session '
print ' Title list was completed in the previous session '
@ -1573,13 +1608,13 @@ def resumePreviousDump(config={}, other={}):
print ' Title list is incomplete. Reloading... '
print ' Title list is incomplete. Reloading... '
# do not resume, reload, to avoid inconsistences, deleted pages or
# do not resume, reload, to avoid inconsistences, deleted pages or
# so
# so
titles = getPageTitles( config = config , session = other [ ' session ' ] )
getPageTitles( config = config , session = other [ ' session ' ] )
saveTitles ( config = config , titles = titles )
# checking xml dump
# checking xml dump
xmliscomplete = False
xmliscomplete = False
lastxmltitle = ' '
lastxmltitle = None
try :
try :
f = open (
f = reverse_readline (
' %s / %s - %s - %s .xml ' %
' %s / %s - %s - %s .xml ' %
( config [ ' path ' ] ,
( config [ ' path ' ] ,
domain2prefix (
domain2prefix (
@ -1587,27 +1622,26 @@ def resumePreviousDump(config={}, other={}):
session = other [ ' session ' ] ) ,
session = other [ ' session ' ] ) ,
config [ ' date ' ] ,
config [ ' date ' ] ,
config [ ' curonly ' ] and ' current ' or ' history ' ) ,
config [ ' curonly ' ] and ' current ' or ' history ' ) ,
' r ' )
)
for l in f :
for l in f :
if re. findall ( ' </mediawiki> ' , l ) :
if l == ' </mediawiki> ' :
# xml dump is complete
# xml dump is complete
xmliscomplete = True
xmliscomplete = True
break
break
# weird if found more than 1, but maybe
xmltitle s = re . findall ( r ' <title>([^<]+)</title> ' , l )
xmltitle = re . search ( r ' <title>([^<]+)</title> ' , l )
if xmltitle s :
if xmltitle :
lastxmltitle = undoHTMLEntities ( text = xmltitle s[ - 1 ] )
lastxmltitle = undoHTMLEntities ( text = xmltitle . group ( 1 ) )
f . close ( )
break
except :
except :
pass # probably file doesnot exists
pass # probably file does not exists
# removing --END-- before getXMLs
while titles and titles [ - 1 ] in [ ' ' , ' --END-- ' ] :
titles = titles [ : - 1 ]
if xmliscomplete :
if xmliscomplete :
print ' XML dump was completed in the previous session '
print ' XML dump was completed in the previous session '
elif lastxmltitle :
elif lastxmltitle :
# resuming...
# resuming...
print ' Resuming XML dump from " %s " ' % ( lastxmltitle )
print ' Resuming XML dump from " %s " ' % ( lastxmltitle )
titles = readTitles ( config , start = lastxmltitle )
generateXMLDump (
generateXMLDump (
config = config ,
config = config ,
titles = titles ,
titles = titles ,
@ -1616,6 +1650,7 @@ def resumePreviousDump(config={}, other={}):
else :
else :
# corrupt? only has XML header?
# corrupt? only has XML header?
print ' XML is corrupt? Regenerating... '
print ' XML is corrupt? Regenerating... '
titles = readTitles ( config )
generateXMLDump (
generateXMLDump (
config = config , titles = titles , session = other [ ' session ' ] )
config = config , titles = titles , session = other [ ' session ' ] )