diff --git a/dumpgenerator.py b/dumpgenerator.py
index f0e141c..4268eb5 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -257,19 +257,19 @@ def getPageTitlesAPI(config={}, session=None):
             # Hack for old versions of MediaWiki API where result is dict
             if isinstance(allpages, dict):
                 allpages = allpages.values()
-            titles += [page['title']
-                       for page in allpages]
+            for page in allpages:
+                yield page['title']
+            c += len(allpages)
+
             if len(titles) != len(set(titles)):
                 # probably we are in a loop, server returning dupe titles, stop
                 # it
                 print 'Probably a loop, finishing'
                 titles = list(set(titles))
                 apfrom = ''
-            c += len(allpages)
+
             delay(config=config, session=session)
         print '    %d titles retrieved in the namespace %d' % (c, namespace)
-    return titles
-
 
 def getPageTitlesScraper(config={}, session=None):
     """ """
@@ -368,13 +368,21 @@ def getPageTitles(config={}, session=None):
     elif 'index' in config and config['index']:
         titles = getPageTitlesScraper(config=config, session=session)
 
-    # removing dupes (e.g. in CZ appears Widget:AddThis two times (main
-    # namespace and widget namespace))
-    titles = sorted(set(titles))
-
-    print '%d page titles loaded' % (len(titles))
-    return titles
+    titlesfilename = '%s-%s-titles.txt' % (
+        domain2prefix(config=config), config['date'])
+    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'a')
+    c = 0
+    for title in titles:
+        titlesfile.write(title.encode('utf-8') + "\n")
+        c += 1
+    # TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
+    # main namespace and widget namespace.
+    # We can use sort -u in UNIX, but is it worth it?
+    titlesfile.write(u'--END--\n')
+    titlesfile.close()
+    print 'Titles saved at...', titlesfilename
 
+    print '%d page titles loaded' % (c)
 
 def getImageNames(config={}, session=None):
     """ Get list of image names """
@@ -610,9 +618,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
 
     if verbose:
         if (numberofedits == 1):
-            print '    %s, 1 edit' % (title.encode('utf-8'))
+            print '    %s, 1 edit' % (title.encode('utf-8').strip())
         else:
-            print '    %s, %d edits' % (title.encode('utf-8'), numberofedits)
+            print '    %s, %d edits' % (title.encode('utf-8').strip(), numberofedits)
 
 
 def cleanXML(xml=''):
@@ -625,8 +633,9 @@ def cleanXML(xml=''):
     return xml
 
 
-def generateXMLDump(config={}, titles=[], start='', session=None):
+def generateXMLDump(config={}, titles=[], start=None, session=None):
     """ Generates a XML dump for a list of titles """
+    # TODO: titles is now unused.
 
     print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
     header, config = getXMLHeader(config=config, session=session)
@@ -637,32 +646,21 @@
     xmlfile = ''
     lock = True
     if start:
-        # remove the last chunk of xml dump (it is probably incomplete)
-        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
-        xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
-        prev = ''
+        print "Removing the last chunk of past XML dump: it is probably incomplete"
+        xmlfile = reverse_readline('%s/%s' % (config['path'], xmlfilename))
         c = 0
         for l in xmlfile:
-            # removing <page>\n until end of file
-            # lock to avoid write an empty line at the begining of file
-            if c != 0:
-                if not re.search(r'<title>%s</title>' % (start), l):
-                    xmlfile2.write(prev)
-                else:
-                    break
             c += 1
-            prev = l
-        xmlfile.close()
-        xmlfile2.close()
-        # subst xml with xml2
-        # remove previous xml dump
-        os.remove('%s/%s' % (config['path'], xmlfilename))
-        # move correctly truncated dump to its real name
-        os.rename(
-            '%s/%s2' %
-            (config['path'], xmlfilename), '%s/%s' %
-            (config['path'], xmlfilename)
-        )
+            if re.search(r'%s' % (start), l):
+                # Done searching. We try to truncate the file at this point:
+                # everything should be removed from the line before <title>,
+                # that is the last c+1 lines AKA lines from EOF - c to EOF.
+                # TODO: do something for users without GNU ed; replace os.
+                # Try file.seek and file.truncate in the generator again?
+                os.system("(echo '$-%d,$d'; echo wq ) | ed %s/%s" \
+                    % (c, config['path'], xmlfilename) )
+                print "Last %d lines removed." % (c+1)
+                break
     else:
         # requested complete xml dump
         lock = False
@@ -672,7 +670,7 @@
 
     xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
     c = 1
-    for title in titles:
+    for title in readTitles(config, start):
         if not title.strip():
             continue
         if title == start:  # start downloading from start, included
@@ -701,19 +699,59 @@
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename
 
-
-def saveTitles(config={}, titles=[]):
-    """ Save title list in a file """
+def readTitles(config={}, start=None):
+    """ Read title list from a file, from the title "start" """
 
     titlesfilename = '%s-%s-titles.txt' % (
        domain2prefix(config=config), config['date'])
-    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
-    output = u"%s\n--END--" % ('\n'.join(titles))
-    titlesfile.write(output.encode('utf-8'))
-    titlesfile.close()
+    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')
 
-    print 'Titles saved at...', titlesfilename
+    seeking = False
+    if start:
+        seeking = True
+    with titlesfile as f:
+        for line in f:
+            if line.strip() == '--END--':
+                break
+            elif seeking and line.strip() != start:
+                continue
+            elif seeking and line.strip() == start:
+                seeking = False
+                yield line.strip()
+            else:
+                yield line.strip()
+
+def reverse_readline(filename, buf_size=8192):
+    """a generator that returns the lines of a file in reverse order"""
+    # Original code by srohde, abdus_salam: cc by-sa 3.0
+    # http://stackoverflow.com/a/23646049/718903
+    with open(filename) as fh:
+        segment = None
+        offset = 0
+        fh.seek(0, os.SEEK_END)
+        total_size = remaining_size = fh.tell()
+        while remaining_size > 0:
+            offset = min(total_size, offset + buf_size)
+            fh.seek(-offset, os.SEEK_END)
+            buffer = fh.read(min(remaining_size, buf_size))
+            remaining_size -= buf_size
+            lines = buffer.split('\n')
+            # the first line of the buffer is probably not a complete line so
+            # we'll save it and append it to the last line of the next buffer
+            # we read
+            if segment is not None:
+                # if the previous chunk starts right from the beginning of line
+                # do not concat the segment to the last line of new chunk
+                # instead, yield the segment first
+                if buffer[-1] is not '\n':
+                    lines[-1] += segment
+                else:
+                    yield segment
+            segment = lines[0]
+            for index in range(len(lines) - 1, 0, -1):
+                yield lines[index]
+        yield segment
 
 
 def saveImageNames(config={}, images=[], session=None):
     """ Save image list in a file, including filename, url and uploader """
@@ -1525,12 +1563,11 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
 
 
 def createNewDump(config={}, other={}):
-    titles = []
     images = []
     print 'Trying generating a new dump into a new directory...'
     if config['xml']:
-        titles += getPageTitles(config=config, session=other['session'])
-        saveTitles(config=config, titles=titles)
+        getPageTitles(config=config, session=other['session'])
+        titles=readTitles(config)
         generateXMLDump(config=config, titles=titles, session=other['session'])
         checkXMLIntegrity(
             config=config,
@@ -1549,23 +1586,21 @@
 
 
 def resumePreviousDump(config={}, other={}):
-    titles = []
     images = []
     print 'Resuming previous dump process...'
     if config['xml']:
-        # load titles
-        lasttitle = ''
+        titles=readTitles(config)
         try:
-            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(
-                config=config, session=other['session']), config['date']), 'r')
-            raw = unicode(f.read(), 'utf-8')
-            titles = raw.split('\n')
-            lasttitle = titles[-1]
-            if not lasttitle:  # empty line at EOF ?
-                lasttitle = titles[-2]
-            f.close()
+            lasttitles = reverse_readline('%s/%s-%s-titles.txt' %
+                ( config['path'],
+                domain2prefix( config=config, session=other['session'] ),
+                config['date'])
+                )
+            lasttitle=lasttitles.next()
+            if lasttitle == '':
+                lasttitle=lasttitles.next()
         except:
-            pass  # probably file doesnot exists
+            pass  # probably file does not exists
         if lasttitle == '--END--':
             # titles list is complete
             print 'Title list was completed in the previous session'
@@ -1573,13 +1608,13 @@
             print 'Title list is incomplete. Reloading...'
             # do not resume, reload, to avoid inconsistences, deleted pages or
             # so
-            titles = getPageTitles(config=config, session=other['session'])
-            saveTitles(config=config, titles=titles)
+            getPageTitles(config=config, session=other['session'])
+
     # checking xml dump
     xmliscomplete = False
-    lastxmltitle = ''
+    lastxmltitle = None
     try:
-        f = open(
+        f = reverse_readline(
             '%s/%s-%s-%s.xml' %
             (config['path'],
              domain2prefix(
@@ -1587,27 +1622,26 @@
                 session=other['session']),
             config['date'],
             config['curonly'] and 'current' or 'history'),
-            'r')
+            )
         for l in f:
-            if re.findall('</mediawiki>', l):
+            if l == '</mediawiki>':
                 # xml dump is complete
                 xmliscomplete = True
                 break
-            # weird if found more than 1, but maybe
-            xmltitles = re.findall(r'<title>([^<]+)', l)
-            if xmltitles:
-                lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
-        f.close()
+
+            xmltitle = re.search(r'<title>([^<]+)</title>', l)
+            if xmltitle:
+                lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
+                break
     except:
-        pass  # probably file doesnot exists
-    # removing --END-- before getXMLs
-    while titles and titles[-1] in ['', '--END--']:
-        titles = titles[:-1]
+        pass  # probably file does not exists
+
    if xmliscomplete:
         print 'XML dump was completed in the previous session'
     elif lastxmltitle:
         # resuming...
         print 'Resuming XML dump from "%s"' % (lastxmltitle)
+        titles = readTitles(config, start=lastxmltitle)
         generateXMLDump(
             config=config,
             titles=titles,
@@ -1616,6 +1650,7 @@
     else:
         # corrupt? only has XML header?
         print 'XML is corrupt? Regenerating...'
+        titles = readTitles(config)
         generateXMLDump(
             config=config, titles=titles, session=other['session'])
 
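For reference, the seek-and-yield pattern used by the new readTitles generator can be exercised in isolation. Below is a minimal, standalone sketch; the helper name read_titles, the example file name and the demo titles are illustrative only and are not part of the patch (the real function takes the config dict and builds the titles filename itself):

    import io
    import os

    def read_titles(path, start=None):
        # Yield titles from an '--END--' terminated list, optionally resuming at `start`.
        seeking = start is not None
        with io.open(path, encoding='utf-8') as f:
            for line in f:
                title = line.strip()
                if title == '--END--':
                    break
                if seeking:
                    if title != start:
                        continue
                    seeking = False
                yield title

    if __name__ == '__main__':
        # Tiny demo: save four titles, then resume from the third one.
        with io.open('example-titles.txt', 'w', encoding='utf-8') as f:
            f.write(u'Main Page\nHelp:Contents\nWidget:AddThis\nTalk:Main Page\n--END--\n')
        print(list(read_titles('example-titles.txt', start=u'Widget:AddThis')))
        # expected: the last two titles, Widget:AddThis and Talk:Main Page
        os.remove('example-titles.txt')

The patch composes the same idea for resuming: resumePreviousDump uses reverse_readline to find the last saved title (or the last <title> in the existing XML) and passes it as start, so neither the title list nor the dump is ever loaded into memory whole.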