From 145b2eaaf40fb4b422de4d0f32817c2e87ee1de9 Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill <mako@atdot.cc>
Date: Fri, 6 Feb 2015 17:19:24 -0800
Subject: [PATCH] changed getXMLPage() into a generator

The program tended to run out of memory when processing very large pages
(i.e., pages with extremely large numbers of revisions or pages with
large numbers of very large revisions). This mitigates the problem by
changing getXMLPage() into a generator which allows us to write pages
after each request to the API.

This required changes to the getXMLPage() function and also changes to
other parts of the code that called it.

Additionally, when the function was called, its text was checked in
several ways. This required a few changes, including a running tally of
revisions instead of a post hoc check, and it required error checking
being moved into an Exception rather than just an if statement that
looked at the final result.
---
 dumpgenerator.py | 51 +++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 8a55e0f..1c99c24 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -45,6 +45,8 @@ import urllib
 
 __VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org
 
+class PageMissingError(Exception):
+    pass
 
 def getVersion():
     return(__VERSION__)
@@ -387,10 +389,10 @@ def getXMLHeader(config={}, session=None):
     # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
    # xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="0.3"
     randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
-    xml = getXMLPage(config=config, title=randomtitle, verbose=False,
-                     session=session)
-    header = xml.split('</mediawiki>')[0]
-    if not xml:
+    try:
+        xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+        header = xml.split('</mediawiki>')[0]
+    except PageMissingError:
         print 'XML export on this wiki is broken, quitting.'
         sys.exit()
     return header
 
@@ -399,12 +401,7 @@ def getXMLFileDesc(config={}, title='', session=None):
     """ Get XML for image description page """
     config['curonly'] = 1  # tricky to get only the most recent desc
-    return getXMLPage(
-        config=config,
-        title=title,
-        verbose=False,
-        session=session
-    )
+    return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
 
 
 def getUserAgent():
@@ -510,10 +507,18 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
         params['templates'] = 1
 
     xml = getXMLPageCore(params=params, config=config, session=session)
+    if not xml:
+        raise PageMissingError
+
+    yield xml.split("</page>")[0]
 
     # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
     # else, warning about Special:Export truncating large page histories
     r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+
+    numberofedits = 0
+    numberofedits += len(re.findall(r_timestamp, xml))
+
     # search for timestamps in xml to avoid analysing empty pages like
     # Special:Allpages and the random one
     if not config['curonly'] and re.search(r_timestamp, xml):
@@ -546,26 +551,27 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
                     """
                 # offset is OK in this wiki, merge with the previous chunk
                 # of this page history and continue
-                xml = xml.split(
-                    '</page>')[0] + '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                xml2 = xml2.split("</page>")[0]
+                yield '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                xml = xml2
+                numberofedits += len(re.findall(r_timestamp, xml))
             else:
                 params['offset'] = ''  # no more edits in this page history
+    yield "</page>\n"
 
     if verbose:
-        numberofedits = len(re.findall(r_timestamp, xml))
         if (numberofedits == 1):
-            print '    %s, 1 edit' % (title.encode('utf-8'))
+            print '    %s, 1 edit' % (title.encode('utf-8'))
         else:
-            print '    %s, %d edits' % (title.encode('utf-8'), numberofedits)
-
-    return xml
+            print '    %s, %d edits' % (title.encode('utf-8'), numberofedits)
 
 
 def cleanXML(xml=''):
     """ Trim
redundant info """
     # do not touch XML codification, leave AS IS
-    if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml):
+    if re.search(r'</siteinfo>\n', xml):
         xml = xml.split('</siteinfo>\n')[1]
+    if re.search(r'</mediawiki>', xml):
         xml = xml.split('</mediawiki>')[0]
     return xml
@@ -627,9 +633,11 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
             delay(config=config, session=session)
         if c % 10 == 0:
             print 'Downloaded %d pages' % (c)
-        xml = getXMLPage(config=config, title=title, session=session)
-        xml = cleanXML(xml=xml)
-        if not xml:
+        try:
+            for xml in getXMLPage(config=config, title=title, session=session):
+                xml = cleanXML(xml=xml)
+                xmlfile.write(xml.encode('utf-8'))
+        except PageMissingError:
             logerror(
                 config=config,
                 text=u'The page "%s" was missing in the wiki (probably deleted)' %
@@ -639,7 +647,6 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
         # an empty string due to a deleted page (logged in errors log) or
         # an empty string due to an error while retrieving the page from server
         # (logged in errors log)
-        xmlfile.write(xml.encode('utf-8'))
         c += 1
     xmlfile.write(footer)
     xmlfile.close()