diff --git a/uploader.py b/uploader.py index f7427e3..61b9b8a 100644 --- a/uploader.py +++ b/uploader.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# Copyright (C) 2011-2012 WikiTeam +# Copyright (C) 2011-2014 WikiTeam # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or @@ -15,28 +15,8 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -# uploader.py -# This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org. -# The list must be a text file with the wiki's api.php URLs, one per line. -# Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format -# as produced by launcher.py (explained in https://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump ). -# Adjust your configuration; see below "Configuration goes here". -# You also need dumpgenerator.py in the same directory as this script. - -# Developing scratchpad -# Keys: http://archive.org/account/s3.php -# Documentation: http://archive.org/help/abouts3.txt -# https://github.com/kngenie/ias3upload -# http://en.ecgpedia.org/api.php?action=query&meta=siteinfo&siprop=general|rightsinfo&format=xml -# -# TODO: bug - upload may (partly) fail if two (small) files are sent to s3 without pause http://p.defau.lt/?puN_G_zKXbv1lz9TfSliPg http://archive.org/details/wiki-editionorg_w or something http://p.defau.lt/?udwrG7YQn4RK_1whl1XWRw http://archive.org/details/wiki-jutedreamhosterscom_lineageii_bestiary -# TODO: bug - translate relative copyright URLs as in http://archive.org/details/wiki-wikipovrayorg now linking http://archive.org/content/POV-Wiki:Copyrights -# TODO: minor bug - skip sites requiring authentication without asking user input (e.g. ilab.usc.edu) -# TODO: minor bug - don't overwrite existing files with same filename in the same identifier -# TODO: trivial bug - check for duplicates with originalurl http://archive.org/details/wiki-enecgpediaorg http://archive.org/details/wiki-en.ecgpedia.org -# TODO: enhancement - download wiki logo and upload as *-logo.png, should suffice to get the thumb used -# TODO: enhancement - fix escapement? http://archive.org/details/wiki-encitizendiumorg - +import getopt +import hashlib import os import re import subprocess @@ -52,8 +32,8 @@ import dumpgenerator # You need a file named keys.txt with access and secret keys, in two different lines accesskey = open('keys.txt', 'r').readlines()[0].strip() secretkey = open('keys.txt', 'r').readlines()[1].strip() -collection = 'wikiteam' # Replace with "opensource" if you are not an admin of the collection -# end configuration +# Use --admin if you are a wikiteam collection admin, or specify another collection: +collection = 'opensource' # Nothing to change below convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'} @@ -65,12 +45,51 @@ except: pass print '%d dumps uploaded previously' % (len(uploadeddumps)) +def getParameters(params=[]): + if not params: + params = sys.argv[2:] + config = { + 'prune-directories': False, + 'prune-wikidump': False, + 'collection': collection + } + #console params + try: + opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin"]) + except getopt.GetoptError, err: + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + sys.exit(2) + for o, a in opts: + if o in ("-h","--help"): + usage() + sys.exit() + elif o in ("--prune-directories"): + config['prune-directories'] = True + elif o in ("--prune-wikidump"): + config['prune-wikidump'] = True + elif o in ("--admin"): + config['collection'] = "wikiteam" + return config + +def usage(): + """ """ + print """uploader.py +This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org. +The list must be a text file with the wiki's api.php URLs, one per line. +Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format +as produced by launcher.py (explained in https://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump ). +You need a file named keys.txt with access and secret keys, in two different lines +You also need dumpgenerator.py in the same directory as this script. +Use --help to print this help.""" + def log(wiki, dump, msg): f = open('uploader-%s.log' % (listfile), 'a') f.write('\n%s;%s;%s' % (wiki, dump, msg)) f.close() -def upload(wikis): +def upload(wikis, config={}): for wiki in wikis: print "#"*73 print "# Uploading", wiki @@ -89,12 +108,32 @@ def upload(wikis): c = 0 for dump in dumps: + wikidate = dump.split('-')[1] if dump in uploadeddumps: + if config['prune-directories']: + rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate) + # With -f the deletion might have happened before and we won't know + if not os.system(rmline): + print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate) + if config['prune-wikidump'] and dump.endswith('wikidump.7z'): + # Simplistic quick&dirty to check for the presence of this file in the item + stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() + dumphash = re.sub(' +.+\n?', '', stdout) + + headers = {'User-Agent': dumpgenerator.getUserAgent()} + itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers) + if re.search(dumphash, urllib2.urlopen(itemdata).read()): + log(wiki, dump, 'verified') + rmline='rm -rf %s' % dump + if not os.system(rmline): + print 'DELETED ' + dump + else: + print 'ERROR: The online item misses ' + dump + log(wiki, dump, 'missing') print '%s was uploaded before, skipping...' % (dump) continue time.sleep(0.1) - wikidate = dump.split('-')[1] wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8] print wiki, wikiname, wikidate, dump @@ -198,6 +237,7 @@ def upload(wikis): wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found. wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php else: + print 'Item already exists.' lang = 'foo' wikititle = 'foo' wikidesc = 'foo' @@ -215,7 +255,7 @@ def upload(wikis): ] if c == 0: curl += ['--header', "'x-archive-meta-mediatype:web'", - '--header', "'x-archive-meta-collection:%s'" % (collection), + '--header', "'x-archive-meta-collection:%s'" % (config['collection']), '--header', quoteattr('x-archive-meta-title:' + wikititle), '--header', "'x-archive-meta-description:%s'" % wikidesc.replace("'", r"\'"), '--header', quoteattr('x-archive-meta-language:' + lang), @@ -250,9 +290,11 @@ def upload(wikis): os.system(curlmetaline) c += 1 -def main(): +def main(params=[]): + + config = getParameters(params=params) wikis = open(listfile, 'r').read().strip().splitlines() - upload(wikis) + upload(wikis, config) if __name__ == "__main__": main()