Add wikia.py, to list Wikia wikis we'll dump ourselves

2024-11-16 21:27:46 +00:00 · 2014-12-17 22:49:10 +01:00 · 2014-12-17 22:49:10 +01:00 · 8bd3373960
commit 8bd3373960
parent 38e778faad
1 changed files with 72 additions and 0 deletions
--- a/listsofwikis/mediawiki/wikia.py
+++ b/listsofwikis/mediawiki/wikia.py
@ -0,0 +1,72 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
 # wikia.py: list Wikia wikis that have not been archived
 # Downloads Wikia's dumps and lists wikis which have none.
 # TODO: check date
 #
 # Copyright (C) 2014 WikiTeam developers
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 import subprocess
 import re
 from wikitools import wiki, api
 def getlist(wikia, wkfrom=1, wkto=1000):
    """Query the `wkdomains` list module of a Wikia API for one range.

    wikia  -- wikitools Wiki object the request is sent through.
    wkfrom -- value sent as the `wkfrom` request parameter.
    wkto   -- value sent as the `wkto` request parameter.

    Returns whatever the response holds under ['query']['wkdomains'].
    """
    # 'wkactive' is always sent as '1'; presumably this restricts the
    # result to active wikis -- confirm against the API module.
    query = {
        'action': 'query',
        'list': 'wkdomains',
        'wkactive': '1',
        'wkfrom': wkfrom,
        'wkto': wkto,
    }
    response = api.APIRequest(wikia, query).query()
    return response['query']['wkdomains']
 def getall():
    """Fetch the complete `wkdomains` listing from community.wikia.com.

    Requests consecutive ranges of `limit` entries through getlist() and
    merges every non-empty batch into one dict, which is returned.  The
    scan ends at the first range for which getlist() returns an empty
    result.
    """
    wikia = wiki.Wiki('http://community.wikia.com/api.php')
    offset = 0
    limit = 1000
    domains = {}
    while True:
        # The upper bound has to advance together with the lower one;
        # sending the bare batch size as `wkto` would request
        # 1000..1000, 2000..1000, ... after the first batch.
        batch = getlist(wikia, offset, offset + limit)
        if not batch:
            # NOTE(review): a whole range without any listed wiki also
            # ends the scan here -- confirm that such gaps cannot occur.
            break
        # In-place merge; later batches win on duplicate keys, exactly as
        # rebuilding the dict from the concatenated item lists would.
        domains.update(batch)
        offset += limit
    return domains
 def _fetch(url):
    """Download *url* with wget; raises CalledProcessError on failure.

    wget is run with robots.txt handling off (-e robots=off), without
    overwriting files that already exist (-nc), and with its output
    appended to wikia.log (-a).
    """
    subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', url])

 def main():
    """Download Wikia's own dumps and print the wikis that have none.

    For every entry returned by getall(), the dump URLs on
    s3.amazonaws.com/wikia_xml_dumps are derived from the entry's
    'domain' value and fetched with wget.  The names whose full-history
    dump could not be downloaded are collected and printed at the end.
    """
    domains = getall()
    undumped = []
    for info in domains.values():
        # Derive the dump name: drop the '.wikia.com' suffix, then delete
        # every '-', '_' and '.' character.
        dbname = re.sub('[-_.]', '', info['domain'].replace('.wikia.com', ''))
        # The name goes into a URL, not a regex, so it is deliberately not
        # passed through re.escape (which could insert backslashes).
        if not dbname:
            # Nothing left to build a URL from.
            continue
        # Dumps are sharded by the first letter and the first two letters;
        # slicing keeps one-character names from raising IndexError.
        base = 'http://s3.amazonaws.com/wikia_xml_dumps/' + dbname[0] + '/' \
            + dbname[:2] + '/' + dbname
        full = base + '_pages_full.xml.gz'
        current = base + '_pages_current.xml.gz'
        images = base + '_images.tar'
        try:
            # To only list instead of downloading, probe `full` with
            # `curl -I` here and disable the second try block below.
            _fetch(full)
        except subprocess.CalledProcessError:
            # append, not +=: the latter would add the name letter by letter.
            undumped.append(dbname)
        try:
            _fetch(current)
            _fetch(images)
        except subprocess.CalledProcessError:
            # Best effort: a missing current-revision or image dump does
            # not affect the undumped list.
            pass
    # Single-argument form prints identically under Python 2's print statement.
    print(undumped)
 # Run the scan only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()