#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Copyright (C) 2011 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Download all dumps available in the Special:Statistics page of each
wiki in a list of Wikia subdomains.

Instructions:
    It requires a list of wikia wikis; there is one in the repository
    (the "listofwikis" directory — the file is called wikia.com and it
    contains +200k wikis).

    Run it:     python wikiadownloader.py
    Resume:     python wikiadownloader.py wikitostartfrom
    where "wikitostartfrom" is the last downloaded wiki in the previous
    session.
"""

# NOTE(review): datetime/os/urllib look unused below, but the tail of this
# file is truncated in the copy under review — they are kept because the
# missing download code presumably needs them.
import datetime
import os
import re
import sys
import urllib

# urlopen moved in Python 3; resolve it once so the rest of the script is
# version-agnostic (the original used Python-2-only "urllib.urlopen" and
# the "print >>sys.stderr" statement, which fail on Python 3).
try:
    from urllib import urlopen          # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3


def log(*parts):
    """Write one space-joined progress line to stderr (stdout stays clean)."""
    sys.stderr.write(' '.join(str(p) for p in parts) + '\n')


def load_wiki_list(path='wikia.com'):
    """Return the list of wiki URLs read from *path*, one URL per line."""
    with open(path, 'r') as handle:
        return handle.read().strip().split('\n')


def main():
    wikia = load_wiki_list()
    log(len(wikia), 'wikis in Wikia')

    # Resume point: '!' sorts before every alphanumeric prefix, so the
    # default skips nothing.
    start = sys.argv[1] if len(sys.argv) > 1 else '!'

    for wiki in wikia:
        wiki = wiki.lower()
        if 'http://' not in wiki:
            # Malformed list line (no scheme): skip it instead of crashing
            # with IndexError on the [1] below.
            log('Skipping malformed line:', wiki)
            continue
        prefix = wiki.split('http://')[1]
        if prefix < start:
            continue  # before the resume point
        log('Starting:', wiki)
        f = urlopen('%s/wiki/Special:Statistics' % (wiki,))
        html = f.read()
        f.close()
        if isinstance(html, bytes):
            # Python 3 returns bytes; decode so the regex below gets text.
            html = html.decode('utf-8', 'replace')

        # NOTE(review): the copy under review is truncated mid-statement
        # here, and the regex's "(?P<name>" group syntax was mangled by the
        # same angle-bracket stripping that ate the GPL URL above.  The
        # group names below are a minimal reconstruction of the visible
        # "YYYY-MM-DD hh:mm" date pattern — confirm against the upstream
        # WikiTeam wikiadownloader.py.  The code that actually matched the
        # dump links and downloaded them is missing from this copy.
        m = re.compile(
            r'(?i)(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) '
            r'(?P<time>\d{2}:\d{2})'
        ).finditer(html)


if __name__ == '__main__':
    main()