diff --git a/batchdownload/uploader.py b/batchdownload/uploader.py
deleted file mode 100644
index 4ae3e07..0000000
--- a/batchdownload/uploader.py
+++ /dev/null
@@ -1,301 +0,0 @@
-#!/usr/bin/env python2
-# -*- coding: utf-8 -*-
-
-# Copyright (C) 2011-2016 WikiTeam
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-import getopt
-import os
-import re
-import subprocess
-import sys
-import time
-import urllib
-import urllib2
-import urlparse
-import StringIO
-from xml.sax.saxutils import quoteattr
-from internetarchive import get_item
-
-import dumpgenerator
-
-# Configuration goes here
-# You need a file named keys.txt with access and secret keys, in two different lines
-accesskey = open('keys.txt', 'r').readlines()[0].strip()
-secretkey = open('keys.txt', 'r').readlines()[1].strip()
-# Use --admin if you are a wikiteam collection admin, or specify another collection:
-collection = 'opensource'
-
-# Nothing to change below
-convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
-listfile = sys.argv[1]
-uploadeddumps = []
-try:
- uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
-except:
- pass
-print '%d dumps uploaded previously' % (len(uploadeddumps))
-
-def getParameters(params=[]):
- if not params:
- params = sys.argv[2:]
- config = {
- 'prune-directories': False,
- 'prune-wikidump': False,
- 'collection': collection,
- 'update': False,
- }
- #console params
- try:
- opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin", "update"])
- except getopt.GetoptError, err:
- # print help information and exit:
- print str(err) # will print something like "option -a not recognized"
- usage()
- sys.exit(2)
- for o, a in opts:
- if o in ("-h","--help"):
- usage()
- sys.exit()
- elif o in ("--prune-directories"):
- config['prune-directories'] = True
- elif o in ("--prune-wikidump"):
- config['prune-wikidump'] = True
- elif o in ("--admin"):
- config['collection'] = "wikiteam"
- elif o in ("--update"):
- config['update'] = True
- return config
-
-def usage():
- """ """
- print """uploader.py
-
-This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
-The list must be a text file with the wiki's api.php URLs, one per line.
-Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
-as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
-You need a file named keys.txt with access and secret keys, in two different lines
-You also need dumpgenerator.py in the same directory as this script.
-
-Use --help to print this help."""
-
-def log(wiki, dump, msg):
- f = open('uploader-%s.log' % (listfile), 'a')
- f.write('\n%s;%s;%s' % (wiki, dump, msg))
- f.close()
-
-def upload(wikis, config={}):
- headers = {'User-Agent': dumpgenerator.getUserAgent()}
-
- for wiki in wikis:
- print "#"*73
- print "# Uploading", wiki
- print "#"*73
- wiki = wiki.lower()
- prefix = dumpgenerator.domain2prefix(config={'api': wiki})
-
- wikiname = prefix.split('-')[0]
- dumps = []
- for dirname, dirnames, filenames in os.walk('.'):
- if dirname == '.':
- for f in filenames:
- if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
- dumps.append(f)
- break
-
- c = 0
- for dump in dumps:
- wikidate = dump.split('-')[1]
- item = get_item('wiki-' + wikiname)
- if dump in uploadeddumps:
- if config['prune-directories']:
- rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
- # With -f the deletion might have happened before and we won't know
- if not os.system(rmline):
- print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
- if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
- # Simplistic quick&dirty check for the presence of this file in the item
- stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
- dumphash = re.sub(' +.+\n?', '', stdout)
-
- if dumphash in map(lambda x: x['md5'], item.files):
- log(wiki, dump, 'verified')
- rmline='rm -rf %s' % dump
- if not os.system(rmline):
- print 'DELETED ' + dump
- print '%s was uploaded before, skipping...' % (dump)
- continue
- else:
- print 'ERROR: The online item misses ' + dump
- log(wiki, dump, 'missing')
- # We'll exit this if and go upload the dump
- else:
- print '%s was uploaded before, skipping...' % (dump)
- continue
-
- time.sleep(0.1)
- wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
- print wiki, wikiname, wikidate, dump
-
- # Does the item exist already?
- ismissingitem = not item.exists
-
- # Logo path
- logourl = ''
-
- if ismissingitem or config['update']:
- #get metadata from api.php
- #first sitename and base url
- params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
- data = urllib.urlencode(params)
- req = urllib2.Request(url=wiki, data=data, headers=headers)
- xml = ''
- try:
- f = urllib2.urlopen(req)
- xml = f.read()
- f.close()
- except:
- pass
-
- sitename = ''
- baseurl = ''
- lang = ''
- try:
- sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
- except:
- pass
- try:
- baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
- except:
- pass
- try:
- lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
- except:
- pass
-
- if not sitename:
- sitename = wikiname
- if not baseurl:
- baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
- if lang:
- lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()
-
- #now copyright info from API
- params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
- data = urllib.urlencode(params)
- req = urllib2.Request(url=wiki, data=data, headers=headers)
- xml = ''
- try:
- f = urllib2.urlopen(req)
- xml = f.read()
- f.close()
- except:
- pass
-
- rightsinfourl = ''
- rightsinfotext = ''
- try:
- rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
- rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
- except:
- pass
-
- raw = ''
- try:
- f = urllib.urlopen(baseurl)
- raw = f.read()
- f.close()
- except:
- pass
-
- #or copyright info from #footer in mainpage
- if baseurl and not rightsinfourl and not rightsinfotext:
- rightsinfotext = ''
- rightsinfourl = ''
- try:
-                    rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
- except:
- pass
- try:
-                    rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
- except:
- pass
- if rightsinfotext and not rightsinfourl:
- rightsinfourl = baseurl + '#footer'
- try:
-                    logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
- except:
- pass
- print logourl
-
- #retrieve some info from the wiki
- wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
-            wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)# "ECGpedia,: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with WikiTeam tools."
- wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
- if not rightsinfourl and not rightsinfotext:
- wikikeys.append('unknowncopyright')
-
- wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
- wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
- wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
- else:
- print 'Item already exists.'
- lang = 'foo'
- wikititle = 'foo'
- wikidesc = 'foo'
- wikikeys = 'foo'
- wikilicenseurl = 'foo'
- wikirights = 'foo'
- wikiurl = 'foo'
-
- if c == 0:
- # Item metadata
- md = {
- 'mediatype': 'web',
- 'collection': config['collection'],
- 'title': wikititle,
- 'description': wikidesc,
- 'language': lang,
- 'last-updated-date': wikidate_text,
- 'subject': '; '.join(wikikeys), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
- 'licenseurl': wikilicenseurl and urlparse.urljoin(wiki, wikilicenseurl),
- 'rights': wikirights,
- 'originalurl': wikiurl,
- }
-
- #Upload files and update metadata
- try:
- item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
- item.modify_metadata(md) # update
- print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
- if logourl:
- logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
- logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
- logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
- item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
- uploadeddumps.append(dump)
- log(wiki, dump, 'ok')
- except:
- print wiki, dump, 'error when uploading?'
-
- c += 1
-
-def main(params=[]):
- config = getParameters(params=params)
- wikis = open(listfile, 'r').read().strip().splitlines()
- upload(wikis, config)
-
-if __name__ == "__main__":
- main()