Resume and list titles without keeping everything in memory

Approach suggested by @makoshark, finally found the time to start
implementing it.
* Do not produce and save the titles list all at once. Instead, use
  the scraper and API as generators and save titles on the go. Also,
  try to start the generator from the appropriate title.
  For now the title sorting is not implemented. Pages will be in the
  order given by namespace ID, then page name.
* When resuming, read both the title list and the XML file from the
  end rather than the beginning. If the correct terminator is
  present, only one line needs to be read.
* In both cases, use a generator instead of a huge list in memory.
* Also truncate the resumed XML without writing it from scratch.
  For now using GNU ed: very compact, though shelling out is ugly.
  I gave up on using file.seek and file.truncate to avoid reading the
  whole file from the beginning or complicating reverse_readline()
  with more offset calculations.

This should avoid MemoryError in most cases.

Tested by running a dump over a MediaWiki 1.24 wiki with 11 pages: a
complete dump, and a resumed dump from a dump interrupted with Ctrl-C.
pull/228/head
Federico Leva 9 years ago
parent bdc7c9bf06
commit 14ce5f2c1b

@ -257,19 +257,19 @@ def getPageTitlesAPI(config={}, session=None):
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
titles += [page['title']
for page in allpages]
for page in allpages:
yield page['title']
c += len(allpages)
if len(titles) != len(set(titles)):
# probably we are in a loop, server returning dupe titles, stop
# it
print 'Probably a loop, finishing'
titles = list(set(titles))
apfrom = ''
c += len(allpages)
delay(config=config, session=session)
print ' %d titles retrieved in the namespace %d' % (c, namespace)
return titles
def getPageTitlesScraper(config={}, session=None):
""" """
@ -368,13 +368,21 @@ def getPageTitles(config={}, session=None):
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
# removing dupes (e.g. in CZ appears Widget:AddThis two times (main
# namespace and widget namespace))
titles = sorted(set(titles))
print '%d page titles loaded' % (len(titles))
return titles
titlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'a')
c = 0
for title in titles:
titlesfile.write(title.encode('utf-8') + "\n")
c += 1
# TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
# main namespace and widget namespace.
# We can use sort -u in UNIX, but is it worth it?
titlesfile.write(u'--END--\n')
titlesfile.close()
print 'Titles saved at...', titlesfilename
print '%d page titles loaded' % (c)
def getImageNames(config={}, session=None):
""" Get list of image names """
@ -610,9 +618,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if verbose:
if (numberofedits == 1):
print ' %s, 1 edit' % (title.encode('utf-8'))
print ' %s, 1 edit' % (title.encode('utf-8').strip())
else:
print ' %s, %d edits' % (title.encode('utf-8'), numberofedits)
print ' %s, %d edits' % (title.encode('utf-8').strip(), numberofedits)
def cleanXML(xml=''):
@ -625,8 +633,9 @@ def cleanXML(xml=''):
return xml
def generateXMLDump(config={}, titles=[], start='', session=None):
def generateXMLDump(config={}, titles=[], start=None, session=None):
""" Generates a XML dump for a list of titles """
# TODO: titles is now unused.
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header, config = getXMLHeader(config=config, session=session)
@ -637,32 +646,21 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile = ''
lock = True
if start:
# remove the last chunk of xml dump (it is probably incomplete)
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
prev = ''
print "Removing the last chunk of past XML dump: it is probably incomplete"
xmlfile = reverse_readline('%s/%s' % (config['path'], xmlfilename))
c = 0
for l in xmlfile:
# removing <page>\n until end of file
# lock to avoid write an empty line at the begining of file
if c != 0:
if not re.search(r'<title>%s</title>' % (start), l):
xmlfile2.write(prev)
else:
break
c += 1
prev = l
xmlfile.close()
xmlfile2.close()
# subst xml with xml2
# remove previous xml dump
os.remove('%s/%s' % (config['path'], xmlfilename))
# move correctly truncated dump to its real name
os.rename(
'%s/%s2' %
(config['path'], xmlfilename), '%s/%s' %
(config['path'], xmlfilename)
)
if re.search(r'<title>%s</title>' % (start), l):
# Done searching. We try to truncate the file at this point:
# everything should be removed from the line before <title>,
# that is the last c+1 lines AKA lines from EOF - c to EOF.
# TODO: do something for users without GNU ed; replace os.
# Try file.seek and file.truncate in the generator again?
os.system("(echo '$-%d,$d'; echo wq ) | ed %s/%s" \
% (c, config['path'], xmlfilename) )
print "Last %d lines removed." % (c+1)
break
else:
# requested complete xml dump
lock = False
@ -672,7 +670,7 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in titles:
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
@ -701,19 +699,59 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def saveTitles(config={}, titles=[]):
""" Save title list in a file """
def readTitles(config={}, start=None):
    """ Read title list from a file, from the title "start"

    Yields one stripped title per line, lazily, so the whole list is
    never held in memory.  If "start" is given, everything before it is
    skipped and "start" itself is the first title yielded.  Iteration
    stops at the '--END--' terminator (which is not yielded).
    """
    titlesfilename = '%s-%s-titles.txt' % (
        domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')

    # When resuming, skip lines until the requested start title is seen.
    seeking = False
    if start:
        seeking = True

    with titlesfile as f:
        for line in f:
            title = line.strip()
            if title == '--END--':
                # Terminator written by getPageTitles: list is complete.
                break
            if seeking:
                if title != start:
                    continue
                # Found the resume point; yield it and stop skipping.
                seeking = False
            yield title
def reverse_readline(filename, buf_size=8192):
"""a generator that returns the lines of a file in reverse order"""
# Original code by srohde, abdus_salam: cc by-sa 3.0
# http://stackoverflow.com/a/23646049/718903
with open(filename) as fh:
segment = None
offset = 0
fh.seek(0, os.SEEK_END)
total_size = remaining_size = fh.tell()
while remaining_size > 0:
offset = min(total_size, offset + buf_size)
fh.seek(-offset, os.SEEK_END)
buffer = fh.read(min(remaining_size, buf_size))
remaining_size -= buf_size
lines = buffer.split('\n')
# the first line of the buffer is probably not a complete line so
# we'll save it and append it to the last line of the next buffer
# we read
if segment is not None:
# if the previous chunk starts right from the beginning of line
# do not concat the segment to the last line of new chunk
# instead, yield the segment first
if buffer[-1] is not '\n':
lines[-1] += segment
else:
yield segment
segment = lines[0]
for index in range(len(lines) - 1, 0, -1):
yield lines[index]
yield segment
def saveImageNames(config={}, images=[], session=None):
""" Save image list in a file, including filename, url and uploader """
@ -1525,12 +1563,11 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
def createNewDump(config={}, other={}):
titles = []
images = []
print 'Trying generating a new dump into a new directory...'
if config['xml']:
titles += getPageTitles(config=config, session=other['session'])
saveTitles(config=config, titles=titles)
getPageTitles(config=config, session=other['session'])
titles=readTitles(config)
generateXMLDump(config=config, titles=titles, session=other['session'])
checkXMLIntegrity(
config=config,
@ -1549,23 +1586,21 @@ def createNewDump(config={}, other={}):
def resumePreviousDump(config={}, other={}):
titles = []
images = []
print 'Resuming previous dump process...'
if config['xml']:
# load titles
lasttitle = ''
titles=readTitles(config)
try:
f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(
config=config, session=other['session']), config['date']), 'r')
raw = unicode(f.read(), 'utf-8')
titles = raw.split('\n')
lasttitle = titles[-1]
if not lasttitle: # empty line at EOF ?
lasttitle = titles[-2]
f.close()
lasttitles = reverse_readline('%s/%s-%s-titles.txt' %
( config['path'],
domain2prefix( config=config, session=other['session'] ),
config['date'])
)
lasttitle=lasttitles.next()
if lasttitle == '':
lasttitle=lasttitles.next()
except:
pass # probably file doesnot exists
pass # probably file does not exists
if lasttitle == '--END--':
# titles list is complete
print 'Title list was completed in the previous session'
@ -1573,13 +1608,13 @@ def resumePreviousDump(config={}, other={}):
print 'Title list is incomplete. Reloading...'
# do not resume, reload, to avoid inconsistences, deleted pages or
# so
titles = getPageTitles(config=config, session=other['session'])
saveTitles(config=config, titles=titles)
getPageTitles(config=config, session=other['session'])
# checking xml dump
xmliscomplete = False
lastxmltitle = ''
lastxmltitle = None
try:
f = open(
f = reverse_readline(
'%s/%s-%s-%s.xml' %
(config['path'],
domain2prefix(
@ -1587,27 +1622,26 @@ def resumePreviousDump(config={}, other={}):
session=other['session']),
config['date'],
config['curonly'] and 'current' or 'history'),
'r')
)
for l in f:
if re.findall('</mediawiki>', l):
if l == '</mediawiki>':
# xml dump is complete
xmliscomplete = True
break
# weird if found more than 1, but maybe
xmltitles = re.findall(r'<title>([^<]+)</title>', l)
if xmltitles:
lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
f.close()
xmltitle = re.search(r'<title>([^<]+)</title>', l)
if xmltitle:
lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
break
except:
pass # probably file doesnot exists
# removing --END-- before getXMLs
while titles and titles[-1] in ['', '--END--']:
titles = titles[:-1]
pass # probably file does not exists
if xmliscomplete:
print 'XML dump was completed in the previous session'
elif lastxmltitle:
# resuming...
print 'Resuming XML dump from "%s"' % (lastxmltitle)
titles = readTitles(config, start=lastxmltitle)
generateXMLDump(
config=config,
titles=titles,
@ -1616,6 +1650,7 @@ def resumePreviousDump(config={}, other={}):
else:
# corrupt? only has XML header?
print 'XML is corrupt? Regenerating...'
titles = readTitles(config)
generateXMLDump(
config=config, titles=titles, session=other['session'])

Loading…
Cancel
Save