From f4ec129bff28bd6c8cafefd592061e1ab646645f Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill
Date: Tue, 10 Feb 2015 14:17:42 -0800
Subject: [PATCH] updated wikiadownloader.py to work with new dumps

Bitrot seems to have gotten the best of this script and it sounds like it
hasn't been used. This at least gets it to work by:

- find both .gz and the .7z dumps
- parse the new date format on html
- find dumps in the correct place
- move all chatter to stderr instead of stdout
---
 wikiadownloader.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/wikiadownloader.py b/wikiadownloader.py
index f719abd..a8b451f 100644
--- a/wikiadownloader.py
+++ b/wikiadownloader.py
@@ -42,40 +42,41 @@ f = open('wikia.com', 'r')
 wikia = f.read().strip().split('\n')
 f.close()
-print len(wikia), 'wikis in Wikia'
+print >>sys.stderr, len(wikia), 'wikis in Wikia'
 start = '!'
 if len(sys.argv) > 1:
     start = sys.argv[1]
 for wiki in wikia:
+    wiki = wiki.lower()
     prefix = wiki.split('http://')[1]
     if prefix < start:
         continue
-    print wiki
-    path = '%s/%s/%s' % (prefix[0], prefix[0:2], prefix)
+    print >>sys.stderr, "Starting:", wiki
     f = urllib.urlopen('%s/wiki/Special:Statistics' % (wiki))
     html = f.read()
-    #print html
     f.close()
-    m = re.compile(r'(?i)(?P\d\d:\d\d), (?P[a-z]+) (?P\d+), (?P\d+)').finditer(html)
-    for i in m:
+    m = re.compile(r'(?i)(?P\d{4})-(?P\d{2})-(?P\d{2}) (?P
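
Editor's note: the last hunk above is truncated mid-regex, and the (?P<...>
group names in both the old and new patterns were lost when the page was
scraped, so the diff cannot show the full date-parsing change. The sketch
below illustrates the kind of parsing the commit message describes: a
named-group regex for the new YYYY-MM-DD date format and a pattern that
accepts both .gz and .7z dump links. The group names, the dump-link pattern,
and the sample HTML are assumptions for illustration, not the patch's actual
code.

    import re

    # Illustrative only: the patch's real group names were stripped during
    # extraction, so 'year', 'month', 'day', and 'time' are assumed names.
    DATE_RE = re.compile(
        r'(?i)(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<time>\d{2}:\d{2})')

    # The commit message says the script should find both .gz and .7z dumps;
    # this hypothetical pattern matches either extension on a pages_*.xml link.
    DUMP_RE = re.compile(r'(?i)href="(?P<url>[^"]+pages_[^"]*\.xml\.(?:gz|7z))"')

    sample = '<a href="http://images.wikia.com/foo/pages_full.xml.7z">2015-02-10 14:17</a>'
    print(DATE_RE.search(sample).groupdict())
    # {'year': '2015', 'month': '02', 'day': '10', 'time': '14:17'}
    print(DUMP_RE.search(sample).group('url'))
    # http://images.wikia.com/foo/pages_full.xml.7z

Named groups keep the later date arithmetic readable (i.group('year') instead
of positional indexes), which is presumably why the script used them for the
old "HH:MM, Month DD, YYYY" format as well.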