diff --git a/dumpgenerator.py b/dumpgenerator.py
index 5720076..14f8289 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -754,22 +754,31 @@ def main():
                 titles = getPageTitles(config=config)
                 saveTitles(config=config, titles=titles)
             #checking xml dump
-            xml = ''
+            xmliscomplete = False
+            lastxmltitle = ''
             try:
                 f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
-                xml = f.read()
+                for l in f:
+                    if re.findall('</mediawiki>', l):
+                        #xml dump is complete
+                        xmliscomplete = True
+                        break
+                    xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if found more than 1, but maybe
+                    if xmltitles:
+                        lastxmltitle = xmltitles[-1]
                 f.close()
             except:
                 pass #probably file doesnot exists
-            if re.findall('</mediawiki>', xml):
-                #xml dump is complete
+            if xmliscomplete:
                 print 'XML dump was completed in the previous session'
-            else:
-                xmltitles = re.findall(r'<title>([^<]+)</title>', xml)
-                lastxmltitle = ''
-                if xmltitles:
-                    lastxmltitle = xmltitles[-1]
+            elif lastxmltitle:
+                #resuming...
+                print 'Resuming XML dump from "%s"' % (lastxmltitle)
                 generateXMLDump(config=config, titles=titles, start=lastxmltitle)
+            else:
+                #corrupt? only has XML header?
+                print 'XML is corrupt? Regenerating...'
+                generateXMLDump(config=config, titles=titles)
 
         if config['images']:
             #load images
@@ -838,6 +847,7 @@ def main():
         saveLogs(config=config)
 
     #save index.php as html, for license details at the bootom of the page
+    print 'Downloading index.php (Main Page)'
     f = urllib.urlopen(config['index'])
     raw = f.read()
     raw = removeIP(raw=raw)
@@ -846,6 +856,7 @@ def main():
     f.close()
 
     #save special:Version as html, for extensions details
+    print 'Downloading Special:Version with extensions and other related info'
     f = urllib.urlopen('%s?title=Special:Version' % (config['index']))
     raw = f.read()
     raw = removeIP(raw=raw)