From f4ec129bff28bd6c8cafefd592061e1ab646645f Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill
Date: Tue, 10 Feb 2015 14:17:42 -0800
Subject: [PATCH] updated wikiadownloader.py to work with new dumps

Bitrot seems to have gotten the best of this script and it sounds like it
hasn't been used. This at least gets it to work by:

- find both .gz and the .7z dumps
- parse the new date format on html
- find dumps in the correct place
- move all chatter to stderr instead of stdout
---
 wikiadownloader.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/wikiadownloader.py b/wikiadownloader.py
index f719abd..a8b451f 100644
--- a/wikiadownloader.py
+++ b/wikiadownloader.py
@@ -42,40 +42,41 @@ f = open('wikia.com', 'r')
 wikia = f.read().strip().split('\n')
 f.close()
-print len(wikia), 'wikis in Wikia'
+print >>sys.stderr, len(wikia), 'wikis in Wikia'
 start = '!'
 if len(sys.argv) > 1:
     start = sys.argv[1]
 for wiki in wikia:
+    wiki = wiki.lower()
     prefix = wiki.split('http://')[1]
     if prefix < start:
         continue
-    print wiki
-    path = '%s/%s/%s' % (prefix[0], prefix[0:2], prefix)
+    print >>sys.stderr, "Starting:", wiki
     f = urllib.urlopen('%s/wiki/Special:Statistics' % (wiki))
     html = f.read()
-    #print html
     f.close()
-    m = re.compile(r'(?i)(?P\d\d:\d\d), (?P[a-z]+) (?P\d+), (?P\d+)').finditer(html)
-    for i in m:
+    m = re.compile(r'(?i)(?P\d{4})-(?P\d{2})-(?P\d{2}) (?P
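
Editor's note: the last hunk above is truncated mid-regex, and the (?P<...>
group names in both the old and new patterns were lost when the page was
scraped, so the diff cannot show the full date-parsing change. The sketch
below illustrates the kind of parsing the commit message describes: a
named-group regex for the new YYYY-MM-DD date format and a pattern that
accepts both .gz and .7z dump links. The group names, the dump-link pattern,
and the sample HTML are assumptions for illustration, not the patch's actual
code.

    import re

    # Illustrative only: the patch's real group names were stripped during
    # extraction, so 'year', 'month', 'day', and 'time' are assumed names.
    DATE_RE = re.compile(
        r'(?i)(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<time>\d{2}:\d{2})')

    # The commit message says the script should find both .gz and .7z dumps;
    # this hypothetical pattern matches either extension on a pages_*.xml link.
    DUMP_RE = re.compile(r'(?i)href="(?P<url>[^"]+pages_[^"]*\.xml\.(?:gz|7z))"')

    sample = '<a href="http://images.wikia.com/foo/pages_full.xml.7z">2015-02-10 14:17</a>'
    print(DATE_RE.search(sample).groupdict())
    # {'year': '2015', 'month': '02', 'day': '10', 'time': '14:17'}
    print(DUMP_RE.search(sample).group('url'))
    # http://images.wikia.com/foo/pages_full.xml.7z

Named groups keep the later date arithmetic readable (i.group('year') instead
of positional indexes), which is presumably why the script used them for the
old "HH:MM, Month DD, YYYY" format as well.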