diff --git a/dumpgenerator.py b/dumpgenerator.py
index 23f3b79..df1b00c 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -19,6 +19,10 @@
 # To learn more, read the documentation:
 # https://github.com/WikiTeam/wikiteam/wiki
 
+try:
+    from kitchen.text.converters import getwriter
+except ImportError:
+    print "Please install the kitchen module."
 import cookielib
 import cPickle
 import datetime
@@ -42,6 +46,8 @@
 except ImportError:
     sys.exit(1)
 import time
 import urllib
+UTF8Writer = getwriter('utf8')
+sys.stdout = UTF8Writer(sys.stdout)
 
 __VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org
@@ -257,19 +263,19 @@ def getPageTitlesAPI(config={}, session=None):
             # Hack for old versions of MediaWiki API where result is dict
             if isinstance(allpages, dict):
                 allpages = allpages.values()
-            titles += [page['title']
-                       for page in allpages]
+            # Keep a batch-local list so the duplicate check below still
+            # works now that titles are yielded instead of accumulated.
+            titles = [page['title'] for page in allpages]
+            c += len(allpages)
             if len(titles) != len(set(titles)):
                 # probably we are in a loop, server returning dupe titles, stop
                 # it
                 print 'Probably a loop, finishing'
                 titles = list(set(titles))
                 apfrom = ''
-            c += len(allpages)
+            for title in titles:
+                yield title
+
             delay(config=config, session=session)
         print '    %d titles retrieved in the namespace %d' % (c, namespace)
-    return titles
-
 
 def getPageTitlesScraper(config={}, session=None):
     """ """
@@ -368,13 +374,21 @@ def getPageTitles(config={}, session=None):
     elif 'index' in config and config['index']:
         titles = getPageTitlesScraper(config=config, session=session)
 
-    # removing dupes (e.g. in CZ appears Widget:AddThis two times (main
-    # namespace and widget namespace))
-    titles = sorted(set(titles))
-
-    print '%d page titles loaded' % (len(titles))
-    return titles
+    titlesfilename = '%s-%s-titles.txt' % (
+        domain2prefix(config=config), config['date'])
+    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'a')
+    c = 0
+    for title in titles:
+        titlesfile.write(title.encode('utf-8') + "\n")
+        c += 1
+    # TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
+    # main namespace and widget namespace.
+    # We can use sort -u in UNIX, but is it worth it?
+    titlesfile.write(u'--END--\n')
+    titlesfile.close()
+    print 'Titles saved at...', titlesfilename
+    print '%d page titles loaded' % (c)
 
 
 def getImageNames(config={}, session=None):
     """ Get list of image names """
@@ -430,6 +444,7 @@ def getXMLHeader(config={}, session=None):
     """ """
     header = xml.split('</mediawiki>')[0]
     if not re.match(r"\s*<mediawiki", xml):
@@ -516,7 +531,12 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
                 xml2 = xml2.split("</page>")[0]
-                yield '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                try:
+                    yield '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                except MemoryError:
+                    print "The page's history exceeds our memory, halving limit."
+                    params['limit'] = params['limit'] / 2
+                    continue
                 xml = xml2
                 numberofedits += len(re.findall(r_timestamp, xml))
             else:
@@ -610,9 +630,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
 
     if verbose:
         if (numberofedits == 1):
-            print '    %s, 1 edit' % (title.encode('utf-8'))
+            print '    %s, 1 edit' % (title.strip())
         else:
-            print '    %s, %d edits' % (title.encode('utf-8'), numberofedits)
+            print '    %s, %d edits' % (title.strip(), numberofedits)
 
 
 def cleanXML(xml=''):
@@ -625,8 +645,9 @@ def cleanXML(xml=''):
     return xml
 
 
-def generateXMLDump(config={}, titles=[], start='', session=None):
+def generateXMLDump(config={}, titles=[], start=None, session=None):
     """ Generates a XML dump for a list of titles """
+    # TODO: titles is now unused.
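+    # The titles parameter is kept so existing callers keep working, but the
+    # loop below streams titles from the on-disk titles file via readTitles()
+    # instead, so the whole list never has to be held in memory at once.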
     print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
     header, config = getXMLHeader(config=config, session=session)
@@ -637,32 +658,9 @@
     xmlfile = ''
     lock = True
     if start:
-        # remove the last chunk of xml dump (it is probably incomplete)
-        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
-        xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
-        prev = ''
-        c = 0
-        for l in xmlfile:
-            # removing <page>\n until end of file
-            # lock to avoid write an empty line at the begining of file
-            if c != 0:
-                if not re.search(r'<title>%s</title>' % (start), l):
-                    xmlfile2.write(prev)
-                else:
-                    break
-            c += 1
-            prev = l
-        xmlfile.close()
-        xmlfile2.close()
-        # subst xml with xml2
-        # remove previous xml dump
-        os.remove('%s/%s' % (config['path'], xmlfilename))
-        # move correctly truncated dump to its real name
-        os.rename(
-            '%s/%s2' %
-            (config['path'], xmlfilename), '%s/%s' %
-            (config['path'], xmlfilename)
-        )
+        print "Removing the last chunk of past XML dump: it is probably incomplete."
+        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
+            pass
     else:
         # requested complete xml dump
         lock = False
@@ -672,7 +670,7 @@
 
     xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
     c = 1
-    for title in titles:
+    for title in readTitles(config, start):
         if not title.strip():
             continue
         if title == start:  # start downloading from start, included
@@ -690,7 +688,7 @@
             logerror(
                 config=config,
                 text=u'The page "%s" was missing in the wiki (probably deleted)' %
-                (title)
+                (title.decode('utf-8'))
             )
         # here, XML is a correct chunk or
         # an empty string due to a deleted page (logged in errors log) or
@@ -701,19 +699,71 @@
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename
 
-
-def saveTitles(config={}, titles=[]):
-    """ Save title list in a file """
+def readTitles(config={}, start=None):
+    """ Read title list from a file, from the title "start" """
     titlesfilename = '%s-%s-titles.txt' % (
         domain2prefix(config=config), config['date'])
-    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
-    output = u"%s\n--END--" % ('\n'.join(titles))
-    titlesfile.write(output.encode('utf-8'))
-    titlesfile.close()
+    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')
 
-    print 'Titles saved at...', titlesfilename
+    seeking = False
+    if start:
+        seeking = True
+
+    with titlesfile as f:
+        for line in f:
+            if line.strip() == '--END--':
+                break
+            elif seeking and line.strip() != start:
+                continue
+            elif seeking and line.strip() == start:
+                seeking = False
+                yield line.strip()
+            else:
+                yield line.strip()
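+
+# Usage sketch (illustrative): when resuming, readTitles() replays the saved
+# title first and then everything after it, stopping at the --END-- marker:
+#
+#   for title in readTitles(config, start='Some saved title'):
+#       print title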
+def reverse_readline(filename, buf_size=8192, truncate=False):
+    """a generator that returns the lines of a file in reverse order"""
+    # Original code by srohde, abdus_salam: cc by-sa 3.0
+    # http://stackoverflow.com/a/23646049/718903
+    with open(filename, 'r+') as fh:
+        segment = None
+        offset = 0
+        fh.seek(0, os.SEEK_END)
+        total_size = remaining_size = fh.tell()
+        while remaining_size > 0:
+            offset = min(total_size, offset + buf_size)
+            fh.seek(-offset, os.SEEK_END)
+            buffer = fh.read(min(remaining_size, buf_size))
+            remaining_size -= buf_size
+            lines = buffer.split('\n')
+            # the first line of the buffer is probably not a complete line,
+            # so save it and append it to the last line of the next buffer
+            # we read
+            if segment is not None:
+                # if the previous chunk ended exactly at a line boundary, do
+                # not concat the segment to the last line of the new chunk;
+                # instead, yield the segment first
+                if buffer[-1] != '\n':
+                    lines[-1] += segment
+                else:
+                    if truncate and '</page>' in segment:
+                        pages = buffer.split('</page>')
+                        fh.seek(-offset + buf_size - len(pages[-1]), os.SEEK_END)
+                        fh.truncate()
+                        raise StopIteration
+                    else:
+                        yield segment
+            segment = lines[0]
+            for index in range(len(lines) - 1, 0, -1):
+                if truncate and '</page>' in lines[index]:
+                    pages = buffer.split('</page>')
+                    fh.seek(-offset - len(pages[-1]), os.SEEK_END)
+                    fh.truncate()
+                    raise StopIteration
+                else:
+                    yield lines[index]
+        yield segment
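+
+# Note: with truncate=True the caller consumes this generator purely for its
+# side effect: as soon as it reaches the last complete </page> it truncates
+# the file right after that tag, so an interrupted XML dump resumes from a
+# clean element boundary. Illustrative use (xmlpath stands for the dump path):
+#
+#   for line in reverse_readline(xmlpath, truncate=True):
+#       pass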
 
 
 def saveImageNames(config={}, images=[], session=None):
     """ Save image list in a file, including filename, url and uploader """
 
@@ -1033,7 +1083,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
                 xmlfiledesc = ''
                 logerror(
                     config=config,
-                    text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)
+                    text=u'The page "%s" was missing in the wiki (probably deleted)' % (title.decode('utf-8'))
                 )
 
             f = open('%s/%s.desc' % (imagepath, filename2), 'w')
@@ -1539,12 +1589,11 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
 
 
 def createNewDump(config={}, other={}):
-    titles = []
     images = []
     print 'Trying generating a new dump into a new directory...'
     if config['xml']:
-        titles += getPageTitles(config=config, session=other['session'])
-        saveTitles(config=config, titles=titles)
+        getPageTitles(config=config, session=other['session'])
+        titles = readTitles(config)
         generateXMLDump(config=config, titles=titles, session=other['session'])
         checkXMLIntegrity(
             config=config,
@@ -1563,23 +1612,21 @@ def resumePreviousDump(config={}, other={}):
-    titles = []
     images = []
     print 'Resuming previous dump process...'
     if config['xml']:
-        # load titles
-        lasttitle = ''
+        titles = readTitles(config)
+        lasttitle = ''  # default, in case the titles file is missing
         try:
-            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(
-                config=config, session=other['session']), config['date']), 'r')
-            raw = unicode(f.read(), 'utf-8')
-            titles = raw.split('\n')
-            lasttitle = titles[-1]
-            if not lasttitle:  # empty line at EOF ?
-                lasttitle = titles[-2]
-            f.close()
+            lasttitles = reverse_readline('%s/%s-%s-titles.txt' %
+                (config['path'],
+                 domain2prefix(config=config, session=other['session']),
+                 config['date'])
+            )
+            lasttitle = lasttitles.next()
+            if lasttitle == '':
+                lasttitle = lasttitles.next()
         except:
-            pass  # probably file doesnot exists
+            pass  # probably the file does not exist
         if lasttitle == '--END--':
             # titles list is complete
             print 'Title list was completed in the previous session'
         else:
             print 'Title list is incomplete. Reloading...'
             # do not resume, reload, to avoid inconsistences, deleted pages or
             # so
-            titles = getPageTitles(config=config, session=other['session'])
-            saveTitles(config=config, titles=titles)
+            getPageTitles(config=config, session=other['session'])
+
         # checking xml dump
         xmliscomplete = False
-        lastxmltitle = ''
+        lastxmltitle = None
         try:
-            f = open(
+            f = reverse_readline(
                 '%s/%s-%s-%s.xml' %
                 (config['path'],
                  domain2prefix(
@@ -1601,27 +1648,26 @@
                     config=config,
                     session=other['session']),
                 config['date'],
                 config['curonly'] and 'current' or 'history'),
-                'r')
+            )
             for l in f:
-                if re.findall('</mediawiki>', l):
+                if l == '</mediawiki>':
                     # xml dump is complete
                     xmliscomplete = True
                     break
-                # weird if found more than 1, but maybe
-                xmltitles = re.findall(r'<title>([^<]+)</title>', l)
-                if xmltitles:
-                    lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
-            f.close()
+
+                xmltitle = re.search(r'<title>([^<]+)</title>', l)
+                if xmltitle:
+                    lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
+                    break
         except:
-            pass  # probably file doesnot exists
-        # removing --END-- before getXMLs
-        while titles and titles[-1] in ['', '--END--']:
-            titles = titles[:-1]
+            pass  # probably the file does not exist
+
         if xmliscomplete:
             print 'XML dump was completed in the previous session'
         elif lastxmltitle:
             # resuming...
             print 'Resuming XML dump from "%s"' % (lastxmltitle)
+            titles = readTitles(config, start=lastxmltitle)
             generateXMLDump(
                 config=config,
                 titles=titles,
@@ -1630,6 +1676,7 @@
         else:
             # corrupt? only has XML header?
             print 'XML is corrupt? Regenerating...'
+            titles = readTitles(config)
             generateXMLDump(
                 config=config, titles=titles, session=other['session'])
 
diff --git a/requirements.txt b/requirements.txt
index c45cbc5..14a3769 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 argparse>=1.2.1
 requests>=2.3.0
 internetarchive
+kitchen
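+# kitchen provides the UTF-8 stdout writer set up in dumpgenerator.py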