From 8bd33739608e1a9dbc0c710870d1f77b54660e24 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Wed, 17 Dec 2014 22:49:10 +0100 Subject: [PATCH] Add wikia.py, to list Wikia wikis we'll dump ourselves --- listsofwikis/mediawiki/wikia.py | 72 +++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 listsofwikis/mediawiki/wikia.py diff --git a/listsofwikis/mediawiki/wikia.py b/listsofwikis/mediawiki/wikia.py new file mode 100644 index 0000000..7ab2b03 --- /dev/null +++ b/listsofwikis/mediawiki/wikia.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- + +# wikia.py List of not archived Wikia wikis +# Downloads Wikia's dumps and lists wikis which have none. +# TODO: check date +# +# Copyright (C) 2014 WikiTeam developers +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>.
import subprocess
import re
from wikitools import wiki, api


def getlist(wikia, wkfrom = 1, wkto = 1000):
    """Return one window of the active-domain list from the Wikia API.

    Sends an ``action=query`` / ``list=wkdomains`` request with
    ``wkactive=1`` and the given window bounds.

    wikia  -- wikitools ``Wiki`` object the request is sent to.
    wkfrom -- value passed as the ``wkfrom`` API parameter.
    wkto   -- value passed as the ``wkto`` API parameter.

    Returns the ``['query']['wkdomains']`` member of the API response.
    """
    params = {
        'action': 'query',
        'list': 'wkdomains',
        'wkactive': '1',
        'wkfrom': wkfrom,
        'wkto': wkto,
    }
    request = api.APIRequest(wikia, params)
    return request.query()['query']['wkdomains']


def getall():
    """Return every active domain, fetched in consecutive windows of 1000.

    Windows are requested from community.wikia.com until one comes back
    empty; the per-window results are merged into a single dict.
    """
    wikia = wiki.Wiki('http://community.wikia.com/api.php')
    offset = 0
    limit = 1000
    domains = {}
    while True:
        # The upper bound has to move together with the lower one: with a
        # constant wkto every request after the first asks for an inverted
        # window (e.g. 1000..1000, 2000..1000) and the scan stops early.
        batch = getlist(wikia, offset, offset + limit)
        # NOTE(review): an empty window ends the scan; if the ID space can
        # contain a gap of `limit` inactive IDs this truncates the list.
        if not batch:
            break
        domains.update(batch)
        offset += limit
    return domains


def _dbname(domain):
    """Derive the dump name from a domain: drop '.wikia.com' and any -, _ or ."""
    # No regex escaping is applied: the name is pasted into a URL, not a
    # pattern.
    return re.sub(r'[-_.]', '', domain.replace('.wikia.com', ''))


def _dump_base(dbname):
    """Return the S3 URL prefix <bucket>/<c1>/<c1c2>/<dbname> for a dump name."""
    first = dbname[0]
    # One-character names have no second letter. Presumably the bucket pads
    # that path component with '_' (n/n_/n...) -- TODO confirm. If the guess
    # is wrong the download simply fails and the wiki is reported as undumped.
    second = dbname[1] if len(dbname) > 1 else '_'
    return ('http://s3.amazonaws.com/wikia_xml_dumps/' + first + '/'
            + first + second + '/' + dbname)


def _fetch(url):
    """Download *url* with wget; raise CalledProcessError on non-zero exit."""
    # -nc: do not re-download existing files; -a: append output to wikia.log.
    subprocess.check_call(
        ['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', url])


def main():
    """Download Wikia's own dumps and print the wikis that have no full dump.

    For each active domain the full-history dump is attempted; names whose
    download fails are collected and printed as a list at the end. The
    current-revisions dump and the image tarball are fetched best-effort.
    """
    undumped = []
    for entry in getall().values():
        dbname = _dbname(entry['domain'])
        if not dbname:
            # Nothing left after stripping: no URL can be built for it.
            continue
        base = _dump_base(dbname)

        try:
            _fetch(base + '_pages_full.xml.gz')
            # To only list, use this instead and comment out the next try:
            # subprocess.check_call(['curl', '-I', base + '_pages_full.xml.gz'])
        except subprocess.CalledProcessError:
            # append, not +=: += would add the name one character at a time.
            undumped.append(dbname)

        try:
            _fetch(base + '_pages_current.xml.gz')
            _fetch(base + '_images.tar')
        except subprocess.CalledProcessError:
            # Best effort: a failure here is ignored, and a failed
            # "current" download also skips the image tarball.
            pass
    print(undumped)


if __name__ == '__main__':
    main()