|
|
|
@ -16,6 +16,7 @@
|
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
import getopt
|
|
|
|
|
import argparse
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import subprocess
|
|
|
|
@ -30,89 +31,41 @@ from internetarchive import get_item
|
|
|
|
|
|
|
|
|
|
import dumpgenerator
|
|
|
|
|
|
|
|
|
|
# Configuration goes here
# You need a file named keys.txt with access and secret keys, in two different lines
# Read the file once and close it deterministically instead of leaking two
# open handles (one per key).
with open('keys.txt', 'r') as _keysfile:
    _keys = _keysfile.readlines()
accesskey = _keys[0].strip()
secretkey = _keys[1].strip()
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = 'opensource'

# Nothing to change below
# Maps the language codes handled by this script to their English names.
convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
# The first command-line argument is the file listing the wikis to upload.
listfile = sys.argv[1]
# Dump file names recorded in a previous run's log: the second ';'-separated
# field of every log line that has at least two fields.
uploadeddumps = []
try:
    with open('uploader-%s.log' % (listfile), 'r') as _logfile:
        uploadeddumps = [l.split(';')[1] for l in _logfile.read().strip().splitlines() if len(l.split(';'))>1]
except (IOError, OSError):
    # No readable log yet (e.g. first run for this list): nothing was uploaded.
    pass
print('%d dumps uploaded previously' % (len(uploadeddumps)))
|
|
|
|
|
|
|
|
|
|
def getParameters(params=None):
    """Parse the uploader's command-line options into a config dict.

    params: list of option strings to parse. When empty or omitted,
        sys.argv[2:] is used.

    Returns a dict with the keys 'prune-directories', 'prune-wikidump',
    'collection' and 'update'. 'collection' starts as the module-level
    `collection` value and becomes "wikiteam" when --admin is given.

    Prints the usage text and exits when help is requested; prints the
    getopt error and the usage text and exits with status 2 when an option
    is not recognized.
    """
    if not params:
        params = sys.argv[2:]
    config = {
        'prune-directories': False,
        'prune-wikidump': False,
        'collection': collection,
        'update': False,
    }
    #console params
    try:
        # "h" is declared as a short option so that -h really reaches the
        # help branch below; the long "h" is kept so --h keeps being accepted.
        opts, _ = getopt.getopt(params, "h", ["h", "help", "prune-directories", "prune-wikidump", "admin", "update"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err)) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    for o, a in opts:
        if o in ("-h", "--h", "--help"):
            usage()
            sys.exit()
        # Exact comparisons: `o in ("--x")` would be a substring test on a
        # plain string, not membership in a one-element tuple.
        elif o == "--prune-directories":
            config['prune-directories'] = True
        elif o == "--prune-wikidump":
            config['prune-wikidump'] = True
        elif o == "--admin":
            config['collection'] = "wikiteam"
        elif o == "--update":
            config['update'] = True
    return config
|
|
|
|
|
|
|
|
|
|
def usage():
    """Print the command-line help text of uploader.py to standard output."""
    # A single parenthesised string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("""uploader.py

This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.

Use --help to print this help.""")
|
|
|
|
|
|
|
|
|
|
def log(wiki, dump, msg, config=None):
    """Append one ';'-separated record to the uploader log of the wiki list.

    wiki: the wiki identifier to record (first field).
    dump: the dump file name to record (second field).
    msg: the status text to record (third field).
    config: optional object whose `listfile` attribute names the wiki list;
        the record goes to 'uploader-<listfile>.log' in the current
        directory. When it is omitted or has no `listfile`, the module-level
        `listfile` is used, so three-argument callers keep working.
    """
    logname = getattr(config, 'listfile', None)
    if logname is None:
        # Callers that do not pass the parsed options rely on the global.
        logname = listfile
    # The context manager closes the file even if the write fails.
    with open('uploader-%s.log' % (logname), 'a') as f:
        f.write('\n%s;%s;%s' % (wiki, dump, msg))
|
|
|
|
|
|
|
|
|
|
def upload(wikis, config={}):
|
|
|
|
|
def upload(wikis, config={}, uploadeddumps=[]):
|
|
|
|
|
headers = {'User-Agent': dumpgenerator.getUserAgent()}
|
|
|
|
|
dumpdir = config.wikidump_dir
|
|
|
|
|
|
|
|
|
|
filelist = os.listdir(dumpdir)
|
|
|
|
|
for wiki in wikis:
|
|
|
|
|
print "#"*73
|
|
|
|
|
print "# Uploading", wiki
|
|
|
|
|
print "#"*73
|
|
|
|
|
wiki = wiki.lower()
|
|
|
|
|
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
|
|
|
|
|
configtemp = config
|
|
|
|
|
try:
|
|
|
|
|
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
|
|
|
|
|
except KeyError:
|
|
|
|
|
print "ERROR: could not produce the prefix for %s" % wiki
|
|
|
|
|
config = configtemp
|
|
|
|
|
|
|
|
|
|
wikiname = prefix.split('-')[0]
|
|
|
|
|
dumps = []
|
|
|
|
|
for dirname, dirnames, filenames in os.walk('.'):
|
|
|
|
|
if dirname == '.':
|
|
|
|
|
for f in filenames:
|
|
|
|
|
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
|
|
|
|
|
dumps.append(f)
|
|
|
|
|
for f in filelist:
|
|
|
|
|
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
|
|
|
|
|
print "%s found" % f
|
|
|
|
|
dumps.append(f)
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
c = 0
|
|
|
|
@ -120,30 +73,33 @@ def upload(wikis, config={}):
|
|
|
|
|
wikidate = dump.split('-')[1]
|
|
|
|
|
item = get_item('wiki-' + wikiname)
|
|
|
|
|
if dump in uploadeddumps:
|
|
|
|
|
if config['prune-directories']:
|
|
|
|
|
if config.prune_directories:
|
|
|
|
|
rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
|
|
|
|
|
# With -f the deletion might have happened before and we won't know
|
|
|
|
|
if not os.system(rmline):
|
|
|
|
|
print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
|
|
|
|
|
if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
|
|
|
|
|
if config.prune_wikidump and dump.endswith('wikidump.7z'):
|
|
|
|
|
# Simplistic quick&dirty check for the presence of this file in the item
|
|
|
|
|
stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
|
|
|
|
|
print "Checking content in previously uploaded files"
|
|
|
|
|
stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
|
|
|
|
|
dumphash = re.sub(' +.+\n?', '', stdout)
|
|
|
|
|
|
|
|
|
|
if dumphash in map(lambda x: x['md5'], item.files):
|
|
|
|
|
log(wiki, dump, 'verified')
|
|
|
|
|
rmline='rm -rf %s' % dump
|
|
|
|
|
log(wiki, dump, 'verified', config)
|
|
|
|
|
rmline='rm -rf %s' % dumpdir + '/' + dump
|
|
|
|
|
if not os.system(rmline):
|
|
|
|
|
print 'DELETED ' + dump
|
|
|
|
|
print 'DELETED ' + dumpdir + '/' + dump
|
|
|
|
|
print '%s was uploaded before, skipping...' % (dump)
|
|
|
|
|
continue
|
|
|
|
|
else:
|
|
|
|
|
print 'ERROR: The online item misses ' + dump
|
|
|
|
|
log(wiki, dump, 'missing')
|
|
|
|
|
log(wiki, dump, 'missing', config)
|
|
|
|
|
# We'll exit this if and go upload the dump
|
|
|
|
|
else:
|
|
|
|
|
print '%s was uploaded before, skipping...' % (dump)
|
|
|
|
|
continue
|
|
|
|
|
else:
|
|
|
|
|
print '%s was not uploaded before' % dump
|
|
|
|
|
|
|
|
|
|
time.sleep(0.1)
|
|
|
|
|
wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
|
|
|
|
@ -155,7 +111,7 @@ def upload(wikis, config={}):
|
|
|
|
|
# Logo path
|
|
|
|
|
logourl = ''
|
|
|
|
|
|
|
|
|
|
if ismissingitem or config['update']:
|
|
|
|
|
if ismissingitem or config.update:
|
|
|
|
|
#get metadata from api.php
|
|
|
|
|
#first sitename and base url
|
|
|
|
|
params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
|
|
|
|
@ -163,7 +119,7 @@ def upload(wikis, config={}):
|
|
|
|
|
req = urllib2.Request(url=wiki, data=data, headers=headers)
|
|
|
|
|
xml = ''
|
|
|
|
|
try:
|
|
|
|
|
f = urllib2.urlopen(req)
|
|
|
|
|
f = urllib2.urlopen(req, timeout=10)
|
|
|
|
|
xml = f.read()
|
|
|
|
|
f.close()
|
|
|
|
|
except:
|
|
|
|
@ -198,7 +154,7 @@ def upload(wikis, config={}):
|
|
|
|
|
req = urllib2.Request(url=wiki, data=data, headers=headers)
|
|
|
|
|
xml = ''
|
|
|
|
|
try:
|
|
|
|
|
f = urllib2.urlopen(req)
|
|
|
|
|
f = urllib2.urlopen(req, timeout=10)
|
|
|
|
|
xml = f.read()
|
|
|
|
|
f.close()
|
|
|
|
|
except:
|
|
|
|
@ -214,7 +170,7 @@ def upload(wikis, config={}):
|
|
|
|
|
|
|
|
|
|
raw = ''
|
|
|
|
|
try:
|
|
|
|
|
f = urllib.urlopen(baseurl)
|
|
|
|
|
f = urllib.urlopen(baseurl, timeout=10)
|
|
|
|
|
raw = f.read()
|
|
|
|
|
f.close()
|
|
|
|
|
except:
|
|
|
|
@ -238,7 +194,6 @@ def upload(wikis, config={}):
|
|
|
|
|
logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
|
|
|
|
|
except:
|
|
|
|
|
pass
|
|
|
|
|
print logourl
|
|
|
|
|
|
|
|
|
|
#retrieve some info from the wiki
|
|
|
|
|
wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
|
|
|
|
@ -264,7 +219,7 @@ def upload(wikis, config={}):
|
|
|
|
|
# Item metadata
|
|
|
|
|
md = {
|
|
|
|
|
'mediatype': 'web',
|
|
|
|
|
'collection': config['collection'],
|
|
|
|
|
'collection': config.collection,
|
|
|
|
|
'title': wikititle,
|
|
|
|
|
'description': wikidesc,
|
|
|
|
|
'language': lang,
|
|
|
|
@ -277,25 +232,52 @@ def upload(wikis, config={}):
|
|
|
|
|
|
|
|
|
|
#Upload files and update metadata
|
|
|
|
|
try:
|
|
|
|
|
item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
|
|
|
|
|
item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
|
|
|
|
|
item.modify_metadata(md) # update
|
|
|
|
|
print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
|
|
|
|
|
uploadeddumps.append(dump)
|
|
|
|
|
log(wiki, dump, 'ok', config)
|
|
|
|
|
if logourl:
|
|
|
|
|
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
|
|
|
|
|
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl), timeout=10).read())
|
|
|
|
|
logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
|
|
|
|
|
logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
|
|
|
|
|
item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
|
|
|
|
|
uploadeddumps.append(dump)
|
|
|
|
|
log(wiki, dump, 'ok')
|
|
|
|
|
except:
|
|
|
|
|
print wiki, dump, 'error when uploading?'
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print wiki, dump, 'Error when uploading?'
|
|
|
|
|
print e.message
|
|
|
|
|
|
|
|
|
|
c += 1
|
|
|
|
|
|
|
|
|
|
def main(params=None):
    """Parse the command line and upload the dumps of every listed wiki.

    params: optional list of command-line arguments (the list file plus any
        options) to parse instead of the process arguments. When empty or
        omitted, argparse reads sys.argv[1:].

    Reads the dump names already recorded in 'uploader-<listfile>.log',
    prints how many there are, reads the wiki list and hands everything to
    upload().
    """
    # The help text must be passed as `description`; the first positional
    # parameter of ArgumentParser is `prog`. The raw formatter keeps the
    # line breaks of the text.
    parser = argparse.ArgumentParser(
        description="""uploader.py

This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.

Use --help to print this help.""",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-pd', '--prune_directories', action='store_true')
    parser.add_argument('-pw', '--prune_wikidump', action='store_true')
    parser.add_argument('-a', '--admin', action='store_true')
    # No default here so that an explicit --collection can be told apart
    # from the fallback resolved below.
    parser.add_argument('-c', '--collection', default=None)
    parser.add_argument('-wd', '--wikidump_dir', default='.')
    parser.add_argument('-u', '--update', action='store_true')
    parser.add_argument('listfile')
    config = parser.parse_args(params if params else None)
    if config.collection is None:
        # --admin selects the wikiteam collection; otherwise 'opensource'.
        config.collection = 'wikiteam' if config.admin else 'opensource'

    listfile = config.listfile
    # Dump file names from a previous run: the second ';'-separated field of
    # every log line that has at least two fields.
    uploadeddumps = []
    try:
        with open('uploader-%s.log' % (listfile), 'r') as logfile:
            uploadeddumps = [l.split(';')[1] for l in logfile.read().strip().splitlines() if len(l.split(';'))>1]
    except (IOError, OSError):
        # No readable log yet (e.g. first run for this list).
        pass
    print('%d dumps uploaded previously' % (len(uploadeddumps)))
    with open(listfile, 'r') as wikilist:
        wikis = wikilist.read().strip().splitlines()

    upload(wikis, config, uploadeddumps)
|
|
|
|
|
|
|
|
|
|
# Run the uploader only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|
|
|
|