@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2011-201 2 WikiTeam
# Copyright (C) 2011-201 4 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -15,28 +15,8 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# uploader.py
# This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
# The list must be a text file with the wiki's api.php URLs, one per line.
# Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
# as produced by launcher.py (explained in https://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump ).
# Adjust your configuration; see below "Configuration goes here".
# You also need dumpgenerator.py in the same directory as this script.
# Developing scratchpad
# Keys: http://archive.org/account/s3.php
# Documentation: http://archive.org/help/abouts3.txt
# https://github.com/kngenie/ias3upload
# http://en.ecgpedia.org/api.php?action=query&meta=siteinfo&siprop=general|rightsinfo&format=xml
#
# TODO: bug - upload may (partly) fail if two (small) files are sent to s3 without pause http://p.defau.lt/?puN_G_zKXbv1lz9TfSliPg http://archive.org/details/wiki-editionorg_w or something http://p.defau.lt/?udwrG7YQn4RK_1whl1XWRw http://archive.org/details/wiki-jutedreamhosterscom_lineageii_bestiary
# TODO: bug - translate relative copyright URLs as in http://archive.org/details/wiki-wikipovrayorg now linking http://archive.org/content/POV-Wiki:Copyrights
# TODO: minor bug - skip sites requiring authentication without asking user input (e.g. ilab.usc.edu)
# TODO: minor bug - don't overwrite existing files with same filename in the same identifier
# TODO: trivial bug - check for duplicates with originalurl http://archive.org/details/wiki-enecgpediaorg http://archive.org/details/wiki-en.ecgpedia.org
# TODO: enhancement - download wiki logo and upload as *-logo.png, should suffice to get the thumb used
# TODO: enhancement - fix escapement? http://archive.org/details/wiki-encitizendiumorg
import getopt
import hashlib
import os
import re
import subprocess
@ -52,8 +32,8 @@ import dumpgenerator
# You need a file named keys.txt with access and secret keys, in two different lines
accesskey = open ( ' keys.txt ' , ' r ' ) . readlines ( ) [ 0 ] . strip ( )
secretkey = open ( ' keys.txt ' , ' r ' ) . readlines ( ) [ 1 ] . strip ( )
collection = ' wikiteam ' # Replace with "opensource" if you are not an admin of the collection
# end configuration
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = ' opensource '
# Nothing to change below
convertlang = { ' ar ' : ' Arabic ' , ' de ' : ' German ' , ' en ' : ' English ' , ' es ' : ' Spanish ' , ' fr ' : ' French ' , ' it ' : ' Italian ' , ' ja ' : ' Japanese ' , ' nl ' : ' Dutch ' , ' pl ' : ' Polish ' , ' pt ' : ' Portuguese ' , ' ru ' : ' Russian ' }
@ -65,12 +45,51 @@ except:
pass
print ' %d dumps uploaded previously ' % ( len ( uploadeddumps ) )
def getParameters ( params = [ ] ) :
if not params :
params = sys . argv [ 2 : ]
config = {
' prune-directories ' : False ,
' prune-wikidump ' : False ,
' collection ' : collection
}
#console params
try :
opts , args = getopt . getopt ( params , " " , [ " h " , " help " , " prune-directories " , " prune-wikidump " , " admin " ] )
except getopt . GetoptError , err :
# print help information and exit:
print str ( err ) # will print something like "option -a not recognized"
usage ( )
sys . exit ( 2 )
for o , a in opts :
if o in ( " -h " , " --help " ) :
usage ( )
sys . exit ( )
elif o in ( " --prune-directories " ) :
config [ ' prune-directories ' ] = True
elif o in ( " --prune-wikidump " ) :
config [ ' prune-wikidump ' ] = True
elif o in ( " --admin " ) :
config [ ' collection ' ] = " wikiteam "
return config
def usage ( ) :
""" """
print """ uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive . org .
The list must be a text file with the wiki ' s api.php URLs, one per line.
Dumps must be in the same directory and follow the - wikidump .7 z / - history . xml .7 z format
as produced by launcher . py ( explained in https : / / code . google . com / p / wikiteam / wiki / NewTutorial #Publishing_the_dump ).
You need a file named keys . txt with access and secret keys , in two different lines
You also need dumpgenerator . py in the same directory as this script .
Use - - help to print this help . """
def log ( wiki , dump , msg ) :
f = open ( ' uploader- %s .log ' % ( listfile ) , ' a ' )
f . write ( ' \n %s ; %s ; %s ' % ( wiki , dump , msg ) )
f . close ( )
def upload ( wikis ) :
def upload ( wikis , config = { } ):
for wiki in wikis :
print " # " * 73
print " # Uploading " , wiki
@ -89,12 +108,32 @@ def upload(wikis):
c = 0
for dump in dumps :
wikidate = dump . split ( ' - ' ) [ 1 ]
if dump in uploadeddumps :
if config [ ' prune-directories ' ] :
rmline = ' rm -rf %s - %s -wikidump/ ' % ( wikiname , wikidate )
# With -f the deletion might have happened before and we won't know
if not os . system ( rmline ) :
print ' DELETED %s - %s -wikidump/ ' % ( wikiname , wikidate )
if config [ ' prune-wikidump ' ] and dump . endswith ( ' wikidump.7z ' ) :
# Simplistic quick&dirty to check for the presence of this file in the item
stdout , stderr = subprocess . Popen ( [ " md5sum " , dump ] , stdout = subprocess . PIPE , stderr = subprocess . PIPE ) . communicate ( )
dumphash = re . sub ( ' +.+ \n ? ' , ' ' , stdout )
headers = { ' User-Agent ' : dumpgenerator . getUserAgent ( ) }
itemdata = urllib2 . Request ( url = ' http://archive.org/metadata/wiki- ' + wikiname , headers = headers )
if re . search ( dumphash , urllib2 . urlopen ( itemdata ) . read ( ) ) :
log ( wiki , dump , ' verified ' )
rmline = ' rm -rf %s ' % dump
if not os . system ( rmline ) :
print ' DELETED ' + dump
else :
print ' ERROR: The online item misses ' + dump
log ( wiki , dump , ' missing ' )
print ' %s was uploaded before, skipping... ' % ( dump )
continue
time . sleep ( 0.1 )
wikidate = dump . split ( ' - ' ) [ 1 ]
wikidate_text = wikidate [ 0 : 4 ] + ' - ' + wikidate [ 4 : 6 ] + ' - ' + wikidate [ 6 : 8 ]
print wiki , wikiname , wikidate , dump
@ -198,6 +237,7 @@ def upload(wikis):
wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
else :
print ' Item already exists. '
lang = ' foo '
wikititle = ' foo '
wikidesc = ' foo '
@ -215,7 +255,7 @@ def upload(wikis):
]
if c == 0 :
curl + = [ ' --header ' , " ' x-archive-meta-mediatype:web ' " ,
' --header ' , " ' x-archive-meta-collection: %s ' " % ( co llection) ,
' --header ' , " ' x-archive-meta-collection: %s ' " % ( co nfig[ ' co llection' ] ) ,
' --header ' , quoteattr ( ' x-archive-meta-title: ' + wikititle ) ,
' --header ' , " ' x-archive-meta-description: %s ' " % wikidesc . replace ( " ' " , r " \ ' " ) ,
' --header ' , quoteattr ( ' x-archive-meta-language: ' + lang ) ,
@ -250,9 +290,11 @@ def upload(wikis):
os . system ( curlmetaline )
c + = 1
def main ( ) :
def main ( params = [ ] ) :
config = getParameters ( params = params )
wikis = open ( listfile , ' r ' ) . read ( ) . strip ( ) . splitlines ( )
upload ( wikis )
upload ( wikis , config )
if __name__ == " __main__ " :
main ( )