Merge pull request #217 from makoshark/master

fix bug with exception handling
pull/218/head
PiRSquared17 9 years ago
commit d248b3f3e8

@@ -46,7 +46,11 @@ import urllib
 __VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org
 
 class PageMissingError(Exception):
-    pass
+    def __init__(self, title, xml):
+        self.title = title
+        self.xml = xml
+    def __str__(self):
+        return "page '%s' not found" % self.title
 
 def getVersion():
     return(__VERSION__)
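
For context, a minimal sketch (not part of the commit) of what the richer exception buys callers: the page title and whatever XML was fetched travel with the error, so the except branch can keep working with them. fetch_page() below is a hypothetical stand-in for getXMLPage().

# Sketch only -- fetch_page() is a made-up stand-in, not repo code.
class PageMissingError(Exception):
    def __init__(self, title, xml):
        self.title = title
        self.xml = xml
    def __str__(self):
        return "page '%s' not found" % self.title

def fetch_page(title):
    # Pretend the wiki returned <siteinfo> but no <page> element.
    raise PageMissingError(title, '<mediawiki><siteinfo></siteinfo></mediawiki>')

try:
    xml = fetch_page('Main_Page')
except PageMissingError as pme:
    xml = pme.xml   # the partial export is still usable
    header = xml.split('</mediawiki>')[0]
    assert str(pme) == "page 'Main_Page' not found"
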
@@ -391,8 +395,11 @@ def getXMLHeader(config={}, session=None):
     randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
     try:
         xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
         header = xml.split('</mediawiki>')[0]
-    except PageMissingError:
+    except PageMissingError as pme:
+        xml = pme.xml
+        header = xml.split('</mediawiki>')[0]
     if not re.match("<mediawiki", xml):
         print 'XML export on this wiki is broken, quitting.'
         sys.exit()
     return header
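
Both branches now end with a usable header; an illustrative sketch of the extraction and the sanity check, using made-up XML:

# Illustration only (made-up XML, not repo code): the header is everything
# before the closing </mediawiki> tag, and the check only needs the opening tag.
import re

sample = '<mediawiki xml:lang="en"><siteinfo><sitename>Demo</sitename></siteinfo></mediawiki>'
header = sample.split('</mediawiki>')[0]
assert header.startswith('<mediawiki')
assert re.match("<mediawiki", sample)   # passes, so the export is considered usable
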
@@ -507,8 +514,8 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
         params['templates'] = 1
 
     xml = getXMLPageCore(params=params, config=config, session=session)
-    if not xml:
-        raise PageMissingError
+    if not "</page>" in xml:
+        raise PageMissingError(params['title'], xml)
     else:
         # strip these sha1s sums which keep showing up in the export and
         # which are invalid for the XML schema (they only apply to
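
The comment above is cut off by the hunk boundary. The per-revision <sha1> elements it refers to are typically dropped with substitutions along these lines (a hedged sketch, not necessarily the repo's exact code):

# Hedged sketch: remove <sha1> elements that the export XML schema rejects here.
import re

xml = '<page><title>T</title><sha1>0123456789abcdef0123456789abcdef01234567</sha1></page>'
xml = re.sub(r'<sha1>\w+</sha1>', '', xml)
xml = re.sub(r'<sha1\s*/>', '', xml)
print(xml)   # <page><title>T</title></page>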

@@ -42,40 +42,41 @@ f = open('wikia.com', 'r')
 wikia = f.read().strip().split('\n')
 f.close()
-print len(wikia), 'wikis in Wikia'
+print >>sys.stderr, len(wikia), 'wikis in Wikia'
 start = '!'
 if len(sys.argv) > 1:
     start = sys.argv[1]
 for wiki in wikia:
     wiki = wiki.lower()
     prefix = wiki.split('http://')[1]
     if prefix < start:
         continue
-    print wiki
-    path = '%s/%s/%s' % (prefix[0], prefix[0:2], prefix)
+    print >>sys.stderr, "Starting:", wiki
     f = urllib.urlopen('%s/wiki/Special:Statistics' % (wiki))
     html = f.read()
     #print html
     f.close()
-    m = re.compile(r'(?i)<a href="(?P<urldump>http://[^<>]+pages_(?P<dump>current|full)\.xml\.gz)">(?P<hour>\d\d:\d\d), (?P<month>[a-z]+) (?P<day>\d+), (?P<year>\d+)</a>').finditer(html)
-    for i in m:
+    m = re.compile(r'(?i)<a href="(?P<urldump>http://[^<>]+pages_(?P<dump>current|full)\.xml\.(?P<compression>gz|7z|bz2))">(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<time>\d\d:\d\d:\d\d)')
+    for i in m.finditer(html):
         urldump = i.group("urldump")
         dump = i.group("dump")
+        date = "%s-%s-%s" % (i.group("year"), i.group("month"), i.group("day"))
+        compression = i.group("compression")
-        print 'Downloading', wiki
-        if not os.path.exists(path):
-            os.makedirs(path)
+        print >>sys.stderr, 'Downloading', wiki, dump.lower()
         f = urllib.urlopen('%s/index.json' % ('/'.join(urldump.split('/')[:-1])))
         json = f.read()
         f.close()
         #{"name":"pages_full.xml.gz","timestamp":1273755409,"mwtimestamp":"20100513125649"}
         #{"name":"pages_current.xml.gz","timestamp":1270731925,"mwtimestamp":"20100408130525"}
         date = re.findall(r'{"name":"pages_%s.xml.gz","timestamp":\d+,"mwtimestamp":"(\d{8})\d{6}"}' % (dump.lower()), json)[0]
         print urldump, dump, date #, hour, month, day, year
         #-q, turn off verbose
-        os.system('wget -q -c "%s" -O %s/%s-%s-pages-meta-%s.gz' % (urldump, path, prefix, date, dump.lower() == 'current' and 'current' or 'history'))
+        os.system('wget -q -c "%s" -O %s-%s-pages-meta-%s.%s' % (urldump, prefix, date, dump.lower() == 'current' and 'current' or 'history', compression))
+    if not m.search(html):
+        print >>sys.stderr, 'Failed to download:', wiki
+        print >>sys.stderr, wiki
 fail_file.close()
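
To see what the updated pattern extracts, a quick sketch against an invented Special:Statistics snippet (URL and timestamp are made up for illustration):

# Sketch with invented sample data: exercise the new dump-link regex and the
# fields the download loop relies on.
import re

html = '<a href="http://images.example.com/demo_pages_full.xml.7z">2014-05-13 12:56:49</a>'
m = re.compile(r'(?i)<a href="(?P<urldump>http://[^<>]+pages_(?P<dump>current|full)\.xml\.(?P<compression>gz|7z|bz2))">(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<time>\d\d:\d\d:\d\d)')
for i in m.finditer(html):
    date = "%s-%s-%s" % (i.group("year"), i.group("month"), i.group("day"))
    print(i.group("urldump"))   # http://images.example.com/demo_pages_full.xml.7z
    print("%s %s %s" % (i.group("dump"), i.group("compression"), date))   # full 7z 2014-05-13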
