xml file reading performance

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@81 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
pull/117/head
emijrp 13 years ago
parent 045420cfac
commit ff42a64776

@ -754,22 +754,31 @@ def main():
titles = getPageTitles(config=config)
saveTitles(config=config, titles=titles)
#checking xml dump
xml = ''
xmliscomplete = False
lastxmltitle = ''
try:
f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
xml = f.read()
for l in f:
if re.findall('</mediawiki>', l):
#xml dump is complete
xmliscomplete = True
break
xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if found more than 1, but maybe
if xmltitles:
lastxmltitle = xmltitles[-1]
f.close()
except:
pass #probably file does not exist
if re.findall('</mediawiki>', xml):
#xml dump is complete
if xmliscomplete:
print 'XML dump was completed in the previous session'
else:
xmltitles = re.findall(r'<title>([^<]+)</title>', xml)
lastxmltitle = ''
if xmltitles:
lastxmltitle = xmltitles[-1]
elif lastxmltitle:
#resuming...
print 'Resuming XML dump from "%s"' % (lastxmltitle)
generateXMLDump(config=config, titles=titles, start=lastxmltitle)
else:
#corrupt? only has XML header?
print 'XML is corrupt? Regenerating...'
generateXMLDump(config=config, titles=titles)
if config['images']:
#load images
@ -838,6 +847,7 @@ def main():
saveLogs(config=config)
#save index.php as html, for license details at the bottom of the page
print 'Downloading index.php (Main Page)'
f = urllib.urlopen(config['index'])
raw = f.read()
raw = removeIP(raw=raw)
@ -846,6 +856,7 @@ def main():
f.close()
#save special:Version as html, for extensions details
print 'Downloading Special:Version with extensions and other related info'
f = urllib.urlopen('%s?title=Special:Version' % (config['index']))
raw = f.read()
raw = removeIP(raw=raw)

Loading…
Cancel
Save