Revamp uploader.py for wider usage: move issues to tracker; add options --help, --prune-directories, --prune-wikidump, --admin

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@940 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
pull/117/head
nemobis 10 years ago
parent b74d6f79ce
commit a3fe96b782

@@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2012 WikiTeam
# Copyright (C) 2011-2014 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@@ -15,28 +15,8 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# uploader.py
# This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
# The list must be a text file with the wiki's api.php URLs, one per line.
# Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
# as produced by launcher.py (explained in https://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump ).
# Adjust your configuration; see below "Configuration goes here".
# You also need dumpgenerator.py in the same directory as this script.
# Developing scratchpad
# Keys: http://archive.org/account/s3.php
# Documentation: http://archive.org/help/abouts3.txt
# https://github.com/kngenie/ias3upload
# http://en.ecgpedia.org/api.php?action=query&meta=siteinfo&siprop=general|rightsinfo&format=xml
#
# TODO: bug - upload may (partly) fail if two (small) files are sent to s3 without pause http://p.defau.lt/?puN_G_zKXbv1lz9TfSliPg http://archive.org/details/wiki-editionorg_w or something http://p.defau.lt/?udwrG7YQn4RK_1whl1XWRw http://archive.org/details/wiki-jutedreamhosterscom_lineageii_bestiary
# TODO: bug - translate relative copyright URLs as in http://archive.org/details/wiki-wikipovrayorg now linking http://archive.org/content/POV-Wiki:Copyrights
# TODO: minor bug - skip sites requiring authentication without asking user input (e.g. ilab.usc.edu)
# TODO: minor bug - don't overwrite existing files with same filename in the same identifier
# TODO: trivial bug - check for duplicates with originalurl http://archive.org/details/wiki-enecgpediaorg http://archive.org/details/wiki-en.ecgpedia.org
# TODO: enhancement - download wiki logo and upload as *-logo.png, should suffice to get the thumb used
# TODO: enhancement - fix escapement? http://archive.org/details/wiki-encitizendiumorg
import getopt
import hashlib
import os
import re
import subprocess
@@ -52,8 +32,8 @@ import dumpgenerator
# You need a file named keys.txt with access and secret keys, in two different lines
accesskey = open('keys.txt', 'r').readlines()[0].strip()
secretkey = open('keys.txt', 'r').readlines()[1].strip()
collection = 'wikiteam' # Replace with "opensource" if you are not an admin of the collection
# end configuration
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = 'opensource'
# Nothing to change below
convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
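For reference, the keys.txt file read above is expected to contain nothing but the archive.org S3 access key on its first line and the secret key on its second line; a placeholder example (not real credentials):

EXAMPLEACCESSKEY
EXAMPLESECRETKEY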
@@ -65,12 +45,51 @@ except:
    pass
print '%d dumps uploaded previously' % (len(uploadeddumps))

def getParameters(params=[]):
    if not params:
        params = sys.argv[2:]
    config = {
        'prune-directories': False,
        'prune-wikidump': False,
        'collection': collection
    }
    #console params
    try:
        opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin"])
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    for o, a in opts:
        if o in ("-h","--help"):
            usage()
            sys.exit()
        elif o in ("--prune-directories"):
            config['prune-directories'] = True
        elif o in ("--prune-wikidump"):
            config['prune-wikidump'] = True
        elif o in ("--admin"):
            config['collection'] = "wikiteam"
    return config
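As a quick illustration (a hypothetical call, not part of the commit), the option handling above yields a config dict like this:

# Hypothetical usage of getParameters(); keys mirror the defaults defined above
config = getParameters(params=['--prune-wikidump', '--admin'])
# config == {'prune-directories': False, 'prune-wikidump': True, 'collection': 'wikiteam'}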

def usage():
    """ """
    print """uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help."""
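To make the help text concrete, a minimal run could look like the following; the list filename is sys.argv[1] and the new options come after it (the list filename and the second URL are placeholders):

# wikis.txt -- one api.php URL per line
http://en.ecgpedia.org/api.php
http://wiki.example.org/w/api.php

python uploader.py wikis.txt --prune-directories --admin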

def log(wiki, dump, msg):
    f = open('uploader-%s.log' % (listfile), 'a')
    f.write('\n%s;%s;%s' % (wiki, dump, msg))
    f.close()

def upload(wikis):
def upload(wikis, config={}):
    for wiki in wikis:
        print "#"*73
        print "# Uploading", wiki
@@ -89,12 +108,32 @@ def upload(wikis):
        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                    # Simplistic quick&dirty check for the presence of this file in the item
                    stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                    dumphash = re.sub(' +.+\n?', '', stdout)
                    headers = {'User-Agent': dumpgenerator.getUserAgent()}
                    itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
                    if re.search(dumphash, urllib2.urlopen(itemdata).read()):
                        log(wiki, dump, 'verified')
                        rmline='rm -rf %s' % dump
                        if not os.system(rmline):
                            print 'DELETED ' + dump
                    else:
                        print 'ERROR: The online item misses ' + dump
                        log(wiki, dump, 'missing')
                print '%s was uploaded before, skipping...' % (dump)
                continue
            time.sleep(0.1)
            wikidate = dump.split('-')[1]
            wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
            print wiki, wikiname, wikidate, dump
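For clarity, the date handling above assumes dump filenames in the <wiki>-<YYYYMMDD>-wikidump.7z / -history.xml.7z form produced by launcher.py; a hypothetical example:

# Hypothetical dump name following the launcher.py naming scheme
dump = 'enecgpediaorg-20140301-wikidump.7z'
wikidate = dump.split('-')[1]                                      # '20140301'
wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]  # '2014-03-01'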
@@ -198,6 +237,7 @@ def upload(wikis):
                wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
@@ -215,7 +255,7 @@ def upload(wikis):
            ]
            if c == 0:
                curl += ['--header', "'x-archive-meta-mediatype:web'",
                    '--header', "'x-archive-meta-collection:%s'" % (collection),
                    '--header', "'x-archive-meta-collection:%s'" % (config['collection']),
                    '--header', quoteattr('x-archive-meta-title:' + wikititle),
                    '--header', "'x-archive-meta-description:%s'" % wikidesc.replace("'", r"\'"),
                    '--header', quoteattr('x-archive-meta-language:' + lang),
@@ -250,9 +290,11 @@ def upload(wikis):
            os.system(curlmetaline)
            c += 1
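For orientation, the curl command assembled here targets the archive.org S3-like API described at http://archive.org/help/abouts3.txt; the first upload of an item ends up looking roughly like the sketch below (identifier, file name and keys are placeholders, and the exact flags the script emits may differ):

curl --location \
    --header 'x-archive-meta-mediatype:web' \
    --header 'x-archive-meta-collection:opensource' \
    --header 'authorization: LOW EXAMPLEACCESSKEY:EXAMPLESECRETKEY' \
    --upload-file enecgpediaorg-20140301-wikidump.7z \
    http://s3.us.archive.org/wiki-enecgpediaorg/enecgpediaorg-20140301-wikidump.7z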

def main():
def main(params=[]):
    config = getParameters(params=params)
    wikis = open(listfile, 'r').read().strip().splitlines()
    upload(wikis)
    upload(wikis, config)

if __name__ == "__main__":
    main()
