From 145b2eaaf40fb4b422de4d0f32817c2e87ee1de9 Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill <mako@atdot.cc>
Date: Fri, 6 Feb 2015 17:19:24 -0800
Subject: [PATCH] changed getXMLPage() into a generator

The program tended to run out of memory when processing very large pages
(i.e., pages with extremely large numbers of revisions or pages with
large numbers of very large revisions). This mitigates the problem by
changing getXMLPage() into a generator which allows us to write pages
after each request to the API.

This required changes to the getXMLPage() function and also changes to
other parts of the code that called it.

Additionally, when the function was called, its text was checked in
several ways. This required a few changes, including a running tally of
revisions instead of a post hoc check, and it required error checking
being moved into an Exception rather than just an if statement that
looked at the final result.
---
 dumpgenerator.py | 51 +++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 8a55e0f..1c99c24 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -45,6 +45,8 @@ import urllib
 
 __VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org
 
+class PageMissingError(Exception):
+    pass
 
 def getVersion():
     return(__VERSION__)
@@ -387,10 +389,10 @@ def getXMLHeader(config={}, session=None):
     # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
    # xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="0.3"
     randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
-    xml = getXMLPage(config=config, title=randomtitle, verbose=False,
-                     session=session)
-    header = xml.split('</mediawiki>')[0]
-    if not xml:
+    try:
+        xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+        header = xml.split('</mediawiki>')[0]
+    except PageMissingError:
         print 'XML export on this wiki is broken, quitting.'
         sys.exit()
     return header
 
@@ -399,12 +401,7 @@ def getXMLFileDesc(config={}, title='', session=None):
     """ Get XML for image description page """
     config['curonly'] = 1  # tricky to get only the most recent desc
-    return getXMLPage(
-        config=config,
-        title=title,
-        verbose=False,
-        session=session
-    )
+    return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
 
 
 def getUserAgent():
@@ -510,10 +507,18 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
         params['templates'] = 1
 
     xml = getXMLPageCore(params=params, config=config, session=session)
+    if not xml:
+        raise PageMissingError
+
+    yield xml.split("</page>")[0]
 
     # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
     # else, warning about Special:Export truncating large page histories
     r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+
+    numberofedits = 0
+    numberofedits += len(re.findall(r_timestamp, xml))
+
     # search for timestamps in xml to avoid analysing empty pages like
     # Special:Allpages and the random one
     if not config['curonly'] and re.search(r_timestamp, xml):
@@ -546,26 +551,27 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
                     """
                 # offset is OK in this wiki, merge with the previous chunk
                 # of this page history and continue
-                xml = xml.split(
-                    '</page>')[0] + '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                xml2 = xml2.split("</page>")[0]
+                yield '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                xml = xml2
+                numberofedits += len(re.findall(r_timestamp, xml))
             else:
                 params['offset'] = ''  # no more edits in this page history
+    yield "</page>\n"
 
     if verbose:
-        numberofedits = len(re.findall(r_timestamp, xml))
         if (numberofedits == 1):
-            print '    %s, 1 edit' % (title.encode('utf-8'))
+            print '    %s, 1 edit' % (title.encode('utf-8'))
         else:
-            print '    %s, %d edits' % (title.encode('utf-8'), numberofedits)
-
-    return xml
+            print '    %s, %d edits' % (title.encode('utf-8'), numberofedits)
 
 
 def cleanXML(xml=''):
     """ Trim
redundant info """
     # do not touch XML codification, leave AS IS
-    if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml):
+    if re.search(r'</siteinfo>\n', xml):
         xml = xml.split('</siteinfo>\n')[1]
+    if re.search(r'</mediawiki>', xml):
         xml = xml.split('</mediawiki>')[0]
     return xml
@@ -627,9 +633,11 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
             delay(config=config, session=session)
         if c % 10 == 0:
             print 'Downloaded %d pages' % (c)
-        xml = getXMLPage(config=config, title=title, session=session)
-        xml = cleanXML(xml=xml)
-        if not xml:
+        try:
+            for xml in getXMLPage(config=config, title=title, session=session):
+                xml = cleanXML(xml=xml)
+                xmlfile.write(xml.encode('utf-8'))
+        except PageMissingError:
             logerror(
                 config=config,
                 text=u'The page "%s" was missing in the wiki (probably deleted)' %
@@ -639,7 +647,6 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
         # an empty string due to a deleted page (logged in errors log) or
         # an empty string due to an error while retrieving the page from server
         # (logged in errors log)
-        xmlfile.write(xml.encode('utf-8'))
         c += 1
     xmlfile.write(footer)
     xmlfile.close()