#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2011 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# using a list of wikia subdomains, it downloads all dumps available in Special:Statistics pages
# you can use the list available at the "listofwikis" directory, the file is called wikia.com and it contains +200k wikis
import datetime
import os
import re
import sys
import urllib
"""
instructions:
it requires a list of wikia wikis
there is one in the repository (listofwikis directory)
run it: python wikiadownloader.py
if you want to resume: python wikiadownloader.py wikitostartfrom
where wikitostartfrom is the last downloaded wiki in the previous session
"""
# Load the list of wikis to process from the file 'wikia.com' in the current
# working directory, one entry per line.
# The `with` block closes the file even if reading fails. Surrounding
# whitespace (including '\r' from CRLF files) is stripped and blank lines are
# dropped, so an empty file yields an empty list instead of [''] and stray
# empty lines cannot reach the download loop.
with open('wikia.com', 'r') as f:
    wikia = [line.strip() for line in f if line.strip()]
# Report how many wikis were loaded (stderr keeps stdout free for other output).
print >>sys.stderr, len(wikia), 'wikis in Wikia'
# Resume point: the loop below skips every wiki whose host prefix sorts
# before `start`. The optional first command-line argument supplies it;
# without one, '!' is used, which sorts before any letter or digit and so
# skips no ordinary hostname.
# The argument is lowercased because the loop lowercases each wiki before
# comparing; a mixed-case argument would otherwise sort before every
# lowercase prefix and nothing would be skipped.
start = sys.argv[1].lower() if len(sys.argv) > 1 else '!'
for wiki in wikia:
wiki = wiki.lower()
prefix = wiki.split('http://')[1]
if prefix < start:
continue
print >>sys.stderr, "Starting:", wiki
f = urllib.urlopen('%s/wiki/Special:Statistics' % (wiki))
html = f.read()
f.close()
m = re.compile(r'(?i)(?P\d{4})-(?P\d{2})-(?P\d{2}) (?P