Actually allow resuming huge or broken XML dumps

* Log "XML export on this wiki is broken, quitting." to the error
  file, so that grepping the logs reveals which dumps were
  interrupted for this reason.
* Automatically reduce the export size for a page when downloading
  its entire history at once results in a MemoryError.
* Truncate the file with Pythonic methods (.seek() and .truncate())
  while reading it from the end, by making reverse_readline() a
  hybrid generator rather than an actual coroutine (sketched below).
pull/228/head
Federico Leva 9 years ago
parent 9168a66a54
commit d4fd745498
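For orientation, a minimal standalone sketch of the third bullet, independent of dumpgenerator.py: scan the dump backwards in fixed-size chunks and cut the file right after the last complete </page>. The function name, chunk size, and the (ignored) handling of tags straddling a chunk boundary are illustrative assumptions, not code from this commit.

import os

def truncate_incomplete_page(filename, buf_size=8192):
    # Scan backwards from EOF; a '</page>' straddling a chunk
    # boundary is ignored here for brevity.
    with open(filename, 'rb+') as fh:
        fh.seek(0, os.SEEK_END)
        size = fh.tell()
        offset = 0
        while offset < size:
            offset = min(size, offset + buf_size)
            fh.seek(size - offset)
            buffer = fh.read(min(buf_size, offset))
            pos = buffer.rfind(b'</page>')
            if pos != -1:
                # Truncate just after the last complete closing tag.
                fh.seek(size - offset + pos + len(b'</page>'))
                fh.truncate()  # note the call: a bare .truncate is a no-op
                return True
        return False

In the commit itself this logic lives inside reverse_readline(), so a single backward pass can both report lines and truncate.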

@@ -444,6 +444,7 @@ def getXMLHeader(config={}, session=None):
     header = xml.split('</mediawiki>')[0]
     if not re.match("<mediawiki", xml):
         print 'XML export on this wiki is broken, quitting.'
+        logerror(u'XML export on this wiki is broken, quitting.')
         sys.exit()
     return header, config
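The fixed message doubles as a grep target: after a batch of dumps, something like grep -l 'XML export on this wiki is broken' */errors.log lists the wikis that were cut short. A hedged sketch of the append-only logging idea; the real logerror() lives elsewhere in dumpgenerator.py and its signature and log file name may differ.

import datetime

def logerror(text=u'', path='.'):  # hypothetical signature for this sketch
    # Append one timestamped line per failure; keeping the message
    # text constant makes the file trivially searchable.
    with open('%s/errors.log' % path, 'a') as outfile:
        outfile.write('%s: %s\n' % (datetime.datetime.now(), text))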
@@ -615,7 +616,12 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
                     # offset is OK in this wiki, merge with the previous chunk
                     # of this page history and continue
                     xml2 = xml2.split("</page>")[0]
-                    yield ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                    try:
+                        yield ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                    except MemoryError:
+                        print "The page's history exceeds our memory, halving limit."
+                        params['limit'] = params['limit'] / 2
+                        continue
                     xml = xml2
                     numberofedits += len(re.findall(r_timestamp, xml))
                 else:
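A standalone restatement of the recovery strategy above, assuming a hypothetical fetch(offset, limit) stand-in for the export request: on MemoryError, halve the batch size and retry the same window instead of aborting the page.

def fetch_all(fetch, limit=1000):
    # fetch(offset, limit) is a hypothetical stand-in for the API call;
    # it returns a list of revisions, empty once the history is exhausted.
    results = []
    offset = 0
    while True:
        try:
            batch = fetch(offset, limit)
        except MemoryError:
            limit = max(1, limit // 2)  # halve and retry the same window
            continue
        if not batch:
            return results
        results.extend(batch)
        offset += len(batch)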
@@ -652,21 +658,9 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     xmlfile = ''
     lock = True
     if start:
-        print "Removing the last chunk of past XML dump: it is probably incomplete"
-        xmlfile = reverse_readline('%s/%s' % (config['path'], xmlfilename))
-        c = 0
-        for l in xmlfile:
-            c += 1
-            if re.search(r'<title>%s</title>' % (start), l):
-                # Done searching. We try to truncate the file at this point:
-                # everything should be removed from the line before <title>,
-                # that is the last c+1 lines AKA lines from EOF - c to EOF.
-                # TODO: do something for users without GNU ed; replace os.
-                # Try file.seek and file.truncate in the generator again?
-                os.system("(echo '$-%d,$d'; echo wq ) | ed %s/%s" \
-                    % (c, config['path'], xmlfilename) )
-                print "Last %d lines removed." % (c+1)
-                break
+        print "Removing the last chunk of past XML dump: it is probably incomplete."
+        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
+            pass
    else:
        # requested complete xml dump
        lock = False
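The for/pass loop drains the generator purely for its truncation side effect. An equivalent consume-only idiom, for reference ('path' is illustrative):

import collections
# Exhaust an iterator while storing nothing.
collections.deque(reverse_readline(path, truncate=True), maxlen=0)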
@@ -728,11 +722,11 @@ def readTitles(config={}, start=None):
         else:
             yield line.strip()
 
-def reverse_readline(filename, buf_size=8192):
+def reverse_readline(filename, buf_size=8192, truncate=False):
     """a generator that returns the lines of a file in reverse order"""
     # Original code by srohde, abdus_salam: cc by-sa 3.0
     # http://stackoverflow.com/a/23646049/718903
-    with open(filename) as fh:
+    with open(filename, 'r+') as fh:
         segment = None
         offset = 0
         fh.seek(0, os.SEEK_END)
@@ -753,10 +747,22 @@ def reverse_readline(filename, buf_size=8192):
                 if buffer[-1] is not '\n':
                     lines[-1] += segment
                 else:
-                    yield segment
+                    if truncate and '</page>' in segment:
+                        pages = buffer.split('</page>')
+                        fh.seek(-offset+buf_size-len(pages[-1]), os.SEEK_END)
+                        fh.truncate()
+                        raise StopIteration
+                    else:
+                        yield segment
             segment = lines[0]
             for index in range(len(lines) - 1, 0, -1):
-                yield lines[index]
+                if truncate and '</page>' in segment:
+                    pages = buffer.split('</page>')
+                    fh.seek(-offset-len(pages[-1]), os.SEEK_END)
+                    fh.truncate()
+                    raise StopIteration
+                else:
+                    yield lines[index]
         yield segment
 
 def saveImageNames(config={}, images=[], session=None):
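The seek() calls above compute the cut point relative to EOF: the buffer was read starting at (EOF - offset), so the byte just past the last </page> is reached by subtracting the length of the leftover tail, pages[-1]. A tiny self-contained demo of the end-relative seek-then-truncate primitive (the file name and contents are made up):

import os

with open('demo.xml', 'wb') as fh:
    fh.write(b'<page>keep me</page><page>half-writ')

with open('demo.xml', 'rb+') as fh:
    fh.seek(0, os.SEEK_END)
    size = fh.tell()
    tail = len(b'<page>half-writ')  # bytes after the last complete </page>
    fh.seek(size - tail)            # equivalently fh.seek(-tail, os.SEEK_END)
    fh.truncate()                   # file now ends with '</page>'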
