Fix regex OOM issue on large pages

Branch: pull/264/head
Author: Daniel Oaks, 9 years ago
Parent: 679dee9901
Commit: 61c9bc1380

@@ -573,19 +573,31 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
     if 'templates' in config and config['templates']:
         params['templates'] = 1
-    xml = getXMLPageCore(params=params, config=config, session=session)
-    if xml == "":
-        raise ExportAbortedError(config['index'])
-    if not "</page>" in xml:
-        raise PageMissingError(params['title'], xml)
-    else:
-        # strip these sha1 sums which keep showing up in the export and
-        # which are invalid for the XML schema (they only apply to
-        # revisions)
-        xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
-        xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+    while True:
+        try:
+            xml = getXMLPageCore(params=params, config=config, session=session)
+            if xml == "":
+                raise ExportAbortedError(config['index'])
+            if "</page>" not in xml:
+                raise PageMissingError(params['title'], xml)
+            else:
+                # do the split before the regexes: .split raises a
+                # MemoryError when it runs out of memory, while the regexes
+                # just kill the process outright; this lets us download
+                # larger pages
+                xml = xml.split("</page>")[0]
+                # strip these sha1 sums which keep showing up in the export
+                # and which are invalid for the XML schema (they only apply
+                # to revisions)
+                xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
+                xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+            break
+        except MemoryError:
+            print "The page's history exceeds our memory, halving limit."
+            params['limit'] = params['limit'] / 2
+            continue
-    yield xml.split("</page>")[0]
+    yield xml
     # if complete history, check if this page history has > limit edits; if so, retrieve all using offset if available
     # else, warn that Special:Export truncates large page histories
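
Read outside the diff, the new control flow is a retry loop that halves the requested revision limit whenever post-processing runs out of memory. Below is a minimal, self-contained sketch of that pattern; `fetch` stands in for getXMLPageCore (any callable returning the raw Special:Export XML for `params`), and `fetch_history` and `min_limit` are hypothetical names added for illustration, not part of the repository. The `//` floor division keeps the limit an integer under both Python 2 and 3.

import re

def fetch_history(fetch, params, min_limit=1):
    # Retry with a halved revision limit until the response fits in memory.
    while True:
        try:
            xml = fetch(params)
            # Do the split before the regexes: str.split raises a catchable
            # MemoryError when memory runs out, while a huge regex
            # substitution can take the whole process down.
            xml = xml.split("</page>")[0]
            # Strip <sha1> elements, which only apply to revisions and are
            # invalid for the XML schema at this level.
            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
            return xml
        except MemoryError:
            if params['limit'] <= min_limit:
                raise  # cannot shrink the request any further; give up
            print("The page's history exceeds our memory, halving limit.")
            params['limit'] = params['limit'] // 2

One design note: halving converges quickly (a limit of 1000 reaches 1 in ten retries), and the min_limit guard in the sketch stops the loop from spinning at a limit of 0 if even a single revision is too large; the committed code above has no such lower bound and relies on a smaller fetch eventually succeeding.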
