changed getXMLPage() into a generator

The program tended to run out of memory when processing very large pages (i.e.,
pages with extremely large numbers of revisions or pages with large numbers of
very large revisions). This commit mitigates the problem by changing
getXMLPage() into a generator, which allows us to write pages to disk
incrementally, after each request to the API.
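
To illustrate the pattern (a minimal, hypothetical sketch with stubbed-out
fetch logic, not code from dumpgenerator.py):

    def get_page_chunks(title):
        # Stand-in for repeated Special:Export requests: yield each
        # chunk of page history as it arrives instead of concatenating
        # everything into one huge string before returning.
        for i in range(3):
            yield '<revision>%s rev %d</revision>\n' % (title, i)

    with open('dump.xml', 'w') as out:
        for chunk in get_page_chunks('Main_Page'):
            out.write(chunk)  # written immediately, nothing accumulates

Because each chunk is written as soon as it is yielded, peak memory use no
longer grows with the size of a page's history.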

This required changes to the getXMLPage() function itself as well as to the
other parts of the code that call it.

Additionally, the text returned by the function used to be checked in several
ways after it was called. Preserving those checks required a few changes:
keeping a running tally of revisions instead of counting them post hoc, and
moving error checking into a PageMissingError exception rather than an if
statement that inspected the final result.
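
As a rough sketch of that new control flow (hypothetical helper and input;
only PageMissingError matches the real code):

    class PageMissingError(Exception):
        pass

    def stream_page(chunks):
        # Tally revisions as each chunk is yielded, rather than counting
        # them in the fully assembled result, and signal a missing page
        # by raising instead of returning an empty string.
        numberofedits = 0
        for chunk in chunks:
            if not chunk:
                raise PageMissingError
            numberofedits += chunk.count('<timestamp>')
            yield chunk
        print('%d edits' % numberofedits)

    try:
        for chunk in stream_page(['<timestamp>a</timestamp>',
                                  '<timestamp>b</timestamp>']):
            pass
    except PageMissingError:
        print('page missing; log the error and skip the page')
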
branch pull/216/head
author Benjamin Mako Hill, 9 years ago
parent a1921f0919
commit 145b2eaaf4

@@ -45,6 +45,8 @@ import urllib
 
 __VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org
 
+class PageMissingError(Exception):
+    pass
 
 def getVersion():
     return(__VERSION__)
@@ -387,10 +389,10 @@ def getXMLHeader(config={}, session=None):
     # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
     # xmlns:x....
     randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
-    xml = getXMLPage(
-        config=config, title=randomtitle, verbose=False, session=session)
-    header = xml.split('</mediawiki>')[0]
-    if not xml:
+    try:
+        xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+        header = xml.split('</mediawiki>')[0]
+    except PageMissingError:
         print 'XML export on this wiki is broken, quitting.'
         sys.exit()
     return header
@@ -399,12 +401,7 @@ def getXMLHeader(config={}, session=None):
 
 def getXMLFileDesc(config={}, title='', session=None):
     """ Get XML for image description page """
     config['curonly'] = 1  # tricky to get only the most recent desc
-    return getXMLPage(
-        config=config,
-        title=title,
-        verbose=False,
-        session=session
-    )
+    return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
 
 def getUserAgent():
@@ -510,10 +507,18 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
         params['templates'] = 1
 
     xml = getXMLPageCore(params=params, config=config, session=session)
+    if not xml:
+        raise PageMissingError
+
+    yield xml.split("</page>")[0]
 
     # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
     # else, warning about Special:Export truncating large page histories
+
     r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+
+    numberofedits = 0
+    numberofedits += len(re.findall(r_timestamp, xml))
     # search for timestamps in xml to avoid analysing empty pages like
     # Special:Allpages and the random one
     if not config['curonly'] and re.search(r_timestamp, xml):
@@ -546,26 +551,27 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
                 """
                 # offset is OK in this wiki, merge with the previous chunk
                 # of this page history and continue
-                xml = xml.split(
-                    '</page>')[0] + '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                xml2 = xml2.split("</page>")[0]
+                yield '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                xml = xml2
+                numberofedits += len(re.findall(r_timestamp, xml))
             else:
                 params['offset'] = ''  # no more edits in this page history
+
+    yield "</page>\n"
+
     if verbose:
-        numberofedits = len(re.findall(r_timestamp, xml))
         if (numberofedits == 1):
-            print '    %s, 1 edit' % (title.encode('utf-8'))
+            print '    %s, 1 edit' % (title.encode('utf-8'))
         else:
-            print '    %s, %d edits' % (title.encode('utf-8'), numberofedits)
-    return xml
+            print '    %s, %d edits' % (title.encode('utf-8'), numberofedits)
 
 def cleanXML(xml=''):
     """ Trim redundant info """
     # do not touch XML codification, leave AS IS
-    if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml):
+    if re.search(r'</siteinfo>\n', xml):
         xml = xml.split('</siteinfo>\n')[1]
+    if re.search(r'</mediawiki>', xml):
         xml = xml.split('</mediawiki>')[0]
     return xml
@@ -627,9 +633,11 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
             delay(config=config, session=session)
         if c % 10 == 0:
             print 'Downloaded %d pages' % (c)
-        xml = getXMLPage(config=config, title=title, session=session)
-        xml = cleanXML(xml=xml)
-        if not xml:
+        try:
+            for xml in getXMLPage(config=config, title=title, session=session):
+                xml = cleanXML(xml=xml)
+                xmlfile.write(xml.encode('utf-8'))
+        except PageMissingError:
             logerror(
                 config=config,
                 text=u'The page "%s" was missing in the wiki (probably deleted)' %
@@ -639,7 +647,6 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
         # an empty string due to a deleted page (logged in errors log) or
         # an empty string due to an error while retrieving the page from server
         # (logged in errors log)
-        xmlfile.write(xml.encode('utf-8'))
         c += 1
     xmlfile.write(footer)
     xmlfile.close()
