Resume and list titles without keeping everything in memory

Approach suggested by @makoshark, finally found the time to start
implementing it.
* Do not produce and save the titles list all at once. Instead, use
  the scraper and API as generators and save titles on the go. Also,
  try to start the generator from the appropriate title.
  For now the title sorting is not implemented. Pages will be in the
  order given by namespace ID, then page name.
* When resuming, read both the title list and the XML file from the
  end rather than the beginning. If the correct terminator is
  present, only one line needs to be read.
* In both cases, use a generator instead of a huge list in memory.
* Also truncate the resumed XML without writing it from scratch.
  For now using GNU ed: very compact, though shelling out is ugly.
  I gave up on using file.seek and file.truncate to avoid reading the
  whole file from the beginning or complicating reverse_readline()
  with more offset calculations.

This should avoid MemoryError in most cases.

Tested by running a dump over a MediaWiki 1.24 wiki with 11 pages: a
complete dump, and a resumed dump from a dump interrupted with Ctrl-C.
pull/228/head
Federico Leva 9 years ago
parent bdc7c9bf06
commit 14ce5f2c1b

@ -257,19 +257,19 @@ def getPageTitlesAPI(config={}, session=None):
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
titles += [page['title']
for page in allpages]
for page in allpages:
yield page['title']
c += len(allpages)
if len(titles) != len(set(titles)):
# probably we are in a loop, server returning dupe titles, stop
# it
print 'Probably a loop, finishing'
titles = list(set(titles))
apfrom = ''
c += len(allpages)
delay(config=config, session=session)
print ' %d titles retrieved in the namespace %d' % (c, namespace)
return titles
def getPageTitlesScraper(config={}, session=None):
""" """
@ -368,13 +368,21 @@ def getPageTitles(config={}, session=None):
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
# removing dupes (e.g. in CZ appears Widget:AddThis two times (main
# namespace and widget namespace))
titles = sorted(set(titles))
print '%d page titles loaded' % (len(titles))
return titles
titlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'a')
c = 0
for title in titles:
titlesfile.write(title.encode('utf-8') + "\n")
c += 1
# TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
# main namespace and widget namespace.
# We can use sort -u in UNIX, but is it worth it?
titlesfile.write(u'--END--\n')
titlesfile.close()
print 'Titles saved at...', titlesfilename
print '%d page titles loaded' % (c)
def getImageNames(config={}, session=None):
""" Get list of image names """
@ -610,9 +618,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if verbose:
if (numberofedits == 1):
print ' %s, 1 edit' % (title.encode('utf-8'))
print ' %s, 1 edit' % (title.encode('utf-8').strip())
else:
print ' %s, %d edits' % (title.encode('utf-8'), numberofedits)
print ' %s, %d edits' % (title.encode('utf-8').strip(), numberofedits)
def cleanXML(xml=''):
@ -625,8 +633,9 @@ def cleanXML(xml=''):
return xml
def generateXMLDump(config={}, titles=[], start='', session=None):
def generateXMLDump(config={}, titles=[], start=None, session=None):
""" Generates a XML dump for a list of titles """
# TODO: titles is now unused.
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header, config = getXMLHeader(config=config, session=session)
@ -637,32 +646,21 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile = ''
lock = True
if start:
# remove the last chunk of xml dump (it is probably incomplete)
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
prev = ''
print "Removing the last chunk of past XML dump: it is probably incomplete"
xmlfile = reverse_readline('%s/%s' % (config['path'], xmlfilename))
c = 0
for l in xmlfile:
# removing <page>\n until end of file
# lock to avoid write an empty line at the begining of file
if c != 0:
if not re.search(r'<title>%s</title>' % (start), l):
xmlfile2.write(prev)
else:
break
c += 1
prev = l
xmlfile.close()
xmlfile2.close()
# subst xml with xml2
# remove previous xml dump
os.remove('%s/%s' % (config['path'], xmlfilename))
# move correctly truncated dump to its real name
os.rename(
'%s/%s2' %
(config['path'], xmlfilename), '%s/%s' %
(config['path'], xmlfilename)
)
if re.search(r'<title>%s</title>' % (start), l):
# Done searching. We try to truncate the file at this point:
# everything should be removed from the line before <title>,
# that is the last c+1 lines AKA lines from EOF - c to EOF.
# TODO: do something for users without GNU ed; replace os.
# Try file.seek and file.truncate in the generator again?
os.system("(echo '$-%d,$d'; echo wq ) | ed %s/%s" \
% (c, config['path'], xmlfilename) )
print "Last %d lines removed." % (c+1)
break
else:
# requested complete xml dump
lock = False
@ -672,7 +670,7 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in titles:
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
@ -701,19 +699,59 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def saveTitles(config={}, titles=[]):
""" Save title list in a file """
def readTitles(config={}, start=None):
    """ Read title list from a file, from the title "start"

    Yields one stripped title per line, lazily, so the whole list is
    never held in memory.  If "start" is given, everything before it is
    skipped and "start" itself is the first title yielded.  Iteration
    stops at the '--END--' terminator (which is not yielded).
    """
    titlesfilename = '%s-%s-titles.txt' % (
        domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')

    # When resuming, skip lines until the requested start title is seen.
    seeking = False
    if start:
        seeking = True

    with titlesfile as f:
        for line in f:
            title = line.strip()
            if title == '--END--':
                # Terminator written by getPageTitles: list is complete.
                break
            if seeking:
                if title != start:
                    continue
                # Found the resume point; yield it and stop skipping.
                seeking = False
            yield title
def reverse_readline(filename, buf_size=8192):
"""a generator that returns the lines of a file in reverse order"""
# Original code by srohde, abdus_salam: cc by-sa 3.0
# http://stackoverflow.com/a/23646049/718903
with open(filename) as fh:
segment = None
offset = 0
fh.seek(0, os.SEEK_END)
total_size = remaining_size = fh.tell()
while remaining_size > 0:
offset = min(total_size, offset + buf_size)
fh.seek(-offset, os.SEEK_END)
buffer = fh.read(min(remaining_size, buf_size))
remaining_size -= buf_size
lines = buffer.split('\n')
# the first line of the buffer is probably not a complete line so
# we'll save it and append it to the last line of the next buffer
# we read
if segment is not None:
# if the previous chunk starts right from the beginning of line
# do not concat the segment to the last line of new chunk
# instead, yield the segment first
if buffer[-1] is not '\n':
lines[-1] += segment
else:
yield segment
segment = lines[0]
for index in range(len(lines) - 1, 0, -1):
yield lines[index]
yield segment
def saveImageNames(config={}, images=[], session=None):
""" Save image list in a file, including filename, url and uploader """
@ -1525,12 +1563,11 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
def createNewDump(config={}, other={}):
titles = []
images = []
print 'Trying generating a new dump into a new directory...'
if config['xml']:
titles += getPageTitles(config=config, session=other['session'])
saveTitles(config=config, titles=titles)
getPageTitles(config=config, session=other['session'])
titles=readTitles(config)
generateXMLDump(config=config, titles=titles, session=other['session'])
checkXMLIntegrity(
config=config,
@ -1549,23 +1586,21 @@ def createNewDump(config={}, other={}):
def resumePreviousDump(config={}, other={}):
titles = []
images = []
print 'Resuming previous dump process...'
if config['xml']:
# load titles
lasttitle = ''
titles=readTitles(config)
try:
f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(
config=config, session=other['session']), config['date']), 'r')
raw = unicode(f.read(), 'utf-8')
titles = raw.split('\n')
lasttitle = titles[-1]
if not lasttitle: # empty line at EOF ?
lasttitle = titles[-2]
f.close()
lasttitles = reverse_readline('%s/%s-%s-titles.txt' %
( config['path'],
domain2prefix( config=config, session=other['session'] ),
config['date'])
)
lasttitle=lasttitles.next()
if lasttitle == '':
lasttitle=lasttitles.next()
except:
pass # probably file doesnot exists
pass # probably file does not exists
if lasttitle == '--END--':
# titles list is complete
print 'Title list was completed in the previous session'
@ -1573,13 +1608,13 @@ def resumePreviousDump(config={}, other={}):
print 'Title list is incomplete. Reloading...'
# do not resume, reload, to avoid inconsistences, deleted pages or
# so
titles = getPageTitles(config=config, session=other['session'])
saveTitles(config=config, titles=titles)
getPageTitles(config=config, session=other['session'])
# checking xml dump
xmliscomplete = False
lastxmltitle = ''
lastxmltitle = None
try:
f = open(
f = reverse_readline(
'%s/%s-%s-%s.xml' %
(config['path'],
domain2prefix(
@ -1587,27 +1622,26 @@ def resumePreviousDump(config={}, other={}):
session=other['session']),
config['date'],
config['curonly'] and 'current' or 'history'),
'r')
)
for l in f:
if re.findall('</mediawiki>', l):
if l == '</mediawiki>':
# xml dump is complete
xmliscomplete = True
break
# weird if found more than 1, but maybe
xmltitles = re.findall(r'<title>([^<]+)</title>', l)
if xmltitles:
lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
f.close()
xmltitle = re.search(r'<title>([^<]+)</title>', l)
if xmltitle:
lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
break
except:
pass # probably file doesnot exists
# removing --END-- before getXMLs
while titles and titles[-1] in ['', '--END--']:
titles = titles[:-1]
pass # probably file does not exists
if xmliscomplete:
print 'XML dump was completed in the previous session'
elif lastxmltitle:
# resuming...
print 'Resuming XML dump from "%s"' % (lastxmltitle)
titles = readTitles(config, start=lastxmltitle)
generateXMLDump(
config=config,
titles=titles,
@ -1616,6 +1650,7 @@ def resumePreviousDump(config={}, other={}):
else:
# corrupt? only has XML header?
print 'XML is corrupt? Regenerating...'
titles = readTitles(config)
generateXMLDump(
config=config, titles=titles, session=other['session'])

Loading…
Cancel
Save