#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2014 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import getopt
2012-04-20 20:57:36 +00:00
import os
import re
2012-04-20 20:37:31 +00:00
import subprocess
2012-08-03 11:07:29 +00:00
import sys
2012-08-06 12:12:37 +00:00
import time
2012-04-20 20:57:36 +00:00
import urllib
2012-08-06 12:12:37 +00:00
import urllib2
2014-01-29 21:26:04 +00:00
from xml . sax . saxutils import quoteattr
2012-08-06 12:12:37 +00:00
import dumpgenerator
2012-04-20 20:37:31 +00:00
2012-05-04 13:18:52 +00:00
# Configuration goes here
2012-08-06 12:57:41 +00:00
# You need a file named keys.txt with access and secret keys, in two different lines
2013-11-16 14:29:34 +00:00
accesskey = open ( ' keys.txt ' , ' r ' ) . readlines ( ) [ 0 ] . strip ( )
secretkey = open ( ' keys.txt ' , ' r ' ) . readlines ( ) [ 1 ] . strip ( )
2014-02-02 18:00:07 +00:00
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = ' opensource '
2012-04-20 20:37:31 +00:00
2012-08-06 18:39:27 +00:00
# Nothing to change below
convertlang = { ' ar ' : ' Arabic ' , ' de ' : ' German ' , ' en ' : ' English ' , ' es ' : ' Spanish ' , ' fr ' : ' French ' , ' it ' : ' Italian ' , ' ja ' : ' Japanese ' , ' nl ' : ' Dutch ' , ' pl ' : ' Polish ' , ' pt ' : ' Portuguese ' , ' ru ' : ' Russian ' }
2012-08-06 15:57:11 +00:00
listfile = sys . argv [ 1 ]
uploadeddumps = [ ]
try :
uploadeddumps = [ l . split ( ' ; ' ) [ 1 ] for l in open ( ' uploader- %s .log ' % ( listfile ) , ' r ' ) . read ( ) . strip ( ) . splitlines ( ) ]
except :
pass
print ' %d dumps uploaded previously ' % ( len ( uploadeddumps ) )
2014-02-02 18:00:07 +00:00
def getParameters ( params = [ ] ) :
if not params :
params = sys . argv [ 2 : ]
config = {
' prune-directories ' : False ,
' prune-wikidump ' : False ,
' collection ' : collection
}
#console params
try :
opts , args = getopt . getopt ( params , " " , [ " h " , " help " , " prune-directories " , " prune-wikidump " , " admin " ] )
except getopt . GetoptError , err :
# print help information and exit:
print str ( err ) # will print something like "option -a not recognized"
usage ( )
sys . exit ( 2 )
for o , a in opts :
if o in ( " -h " , " --help " ) :
usage ( )
sys . exit ( )
elif o in ( " --prune-directories " ) :
config [ ' prune-directories ' ] = True
elif o in ( " --prune-wikidump " ) :
config [ ' prune-wikidump ' ] = True
elif o in ( " --admin " ) :
config [ ' collection ' ] = " wikiteam "
return config
def usage():
    """Write the command-line help text for uploader.py to stdout."""
    helptext = """uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help."""
    # Emit the text plus the trailing newline that `print` would have added.
    sys.stdout.write(helptext + '\n')
2012-08-06 13:33:04 +00:00
def log(wiki, dump, msg):
    """Append a "wiki;dump;msg" record to this list's upload log.

    wiki -- api.php URL of the wiki
    dump -- dump filename
    msg  -- status tag ('ok', 'verified', 'missing', ...)
    """
    # 'with' closes the handle even if write() raises (the original leaked it).
    with open('uploader-%s.log' % (listfile), 'a') as f:
        f.write('\n%s;%s;%s' % (wiki, dump, msg))
2014-02-02 18:00:07 +00:00
def upload(wikis, config={}):
    """Upload the local dump files of every wiki in `wikis` to archive.org.

    wikis  -- list of api.php URLs, one per wiki
    config -- dict from getParameters(): 'prune-directories',
              'prune-wikidump' and 'collection' keys

    For each wiki, finds <wikiname>-*-wikidump.7z / -history.xml.7z files in
    the current directory and uploads them to an archive.org item named
    wiki-<wikiname> by shelling out to curl against the S3-like API.
    Side effects: network requests, os.system calls (curl, rm), log() writes,
    appends to the module-level uploadeddumps list.
    """
    for wiki in wikis:
        print "#"*73
        print "# Uploading", wiki
        print "#"*73
        wiki = wiki.lower()
        # Derive the filename prefix dumpgenerator used for this wiki's dumps
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})

        wikiname = prefix.split('-')[0]
        dumps = []
        # Only scan the current directory itself (break after the '.' entry)
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        # c counts uploads for this wiki; full item metadata is only sent
        # with the first file (c == 0)
        c = 0
        for dump in dumps:
            # second dash-separated field of the filename; presumably the
            # YYYYMMDD dump date -- see wikidate_text slicing below
            wikidate = dump.split('-')[1]
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                    # Simplistic quick&dirty check for the presence of this file in the item:
                    # look for the local md5 anywhere in the item's metadata JSON
                    stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                    # md5sum prints "<hash>  <filename>"; strip everything after the hash
                    dumphash = re.sub(' +.+\n?', '', stdout)
                    headers = {'User-Agent': dumpgenerator.getUserAgent()}
                    itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
                    if re.search(dumphash, urllib2.urlopen(itemdata).read()):
                        log(wiki, dump, 'verified')
                        rmline = 'rm -rf %s' % dump
                        if not os.system(rmline):
                            print 'DELETED ' + dump
                        print '%s was uploaded before, skipping...' % (dump)
                        continue
                    else:
                        print 'ERROR: The online item misses ' + dump
                        log(wiki, dump, 'missing')
                        # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue

            time.sleep(0.1)
            # YYYYMMDD -> YYYY-MM-DD for the metadata date field
            wikidate_text = wikidate[0:4] + '-' + wikidate[4:6] + '-' + wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            headers = {'User-Agent': dumpgenerator.getUserAgent()}
            itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
            if urllib2.urlopen(itemdata).read() == '{}':
                ismissingitem = True
            else:
                ismissingitem = False

            # We don't know a way to fix/overwrite metadata if item exists already:
            # just pass bogus data and save some time
            if ismissingitem:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    # best-effort: fall back to the defaults below
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                # Each findall may be empty; IndexError falls through to ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                # Map ISO code to full language name when known to convertlang
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()

                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    raw = ''
                    try:
                        f = urllib.urlopen(baseurl)
                        raw = f.read()
                        f.close()
                    except:
                        pass
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename) # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')
                wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                # Placeholder values: metadata of an existing item is never sent
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            #creates curl command
            curl = ['curl', '--location',
                '--header', "'x-amz-auto-make-bucket:1'", # Creates the item automatically, need to give some time for the item to correctly be created on archive.org, or everything else will fail, showing "bucket not found" error
                '--header', "'x-archive-queue-derive:0'",
                '--header', "'x-archive-size-hint:%d'" % (os.path.getsize(dump)),
                '--header', "'authorization: LOW %s:%s'" % (accesskey, secretkey),
            ]
            if c == 0:
                # First file of this wiki: attach the full item metadata headers
                curl += ['--header', "'x-archive-meta-mediatype:web'",
                    '--header', "'x-archive-meta-collection:%s'" % (config['collection']),
                    '--header', quoteattr('x-archive-meta-title:' + wikititle),
                    '--header', "'x-archive-meta-description:%s'" % wikidesc.replace("'", r"\'"),
                    '--header', quoteattr('x-archive-meta-language:' + lang),
                    '--header', "'x-archive-meta-last-updated-date:%s'" % (wikidate_text),
                    '--header', "'x-archive-meta-subject:%s'" % (';'.join(wikikeys)), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    '--header', quoteattr('x-archive-meta-licenseurl:' + wikilicenseurl),
                    '--header', "'x-archive-meta-rights:%s'" % wikirights.replace("'", r"\'"),
                    '--header', quoteattr('x-archive-meta-originalurl:' + wikiurl),
                ]

            curl += ['--upload-file', "%s" % (dump),
                "http://s3.us.archive.org/wiki-%s/%s" % (wikiname, dump), # It could happen that the identifier is taken by another user; only wikiteam collection admins will be able to upload more files to it, curl will fail immediately and get a permissions error by s3.
                '> /dev/null',
                #FIXME: Must be NUL instead on Windows, how to make compatible?
            ]

            #now also to update the metadata
            #TODO: not needed for the second file in an item
            curlmeta = ['curl --silent',
                '--data-urlencode -target=metadata',
                """--data-urlencode -patch='{"replace":"/last-updated-date", "value":"%s"}'""" % (wikidate_text),
                '--data-urlencode access=' + accesskey,
                '--data-urlencode secret=' + secretkey,
                'http://archive.org/metadata/wiki-' + wikiname,
                '> /dev/null'
            ]

            curlline = ' '.join(curl)
            curlmetaline = ' '.join(curlmeta)
            # os.system returns 0 on success, so `not` means the upload worked
            if not os.system(curlline):
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
                # Item existed already: refresh its last-updated-date via the
                # metadata API, since the S3 headers above were placeholders
                if not ismissingitem:
                    os.system(curlmetaline)
            c += 1
2012-04-20 20:57:36 +00:00
2014-02-02 18:00:07 +00:00
def main(params=[]):
    """Entry point: parse options, read the wiki list, upload all dumps.

    params -- optional CLI argument list forwarded to getParameters().
    """
    config = getParameters(params=params)
    # Close the list file deterministically (it was left to the GC before).
    with open(listfile, 'r') as f:
        wikis = f.read().strip().splitlines()
    upload(wikis, config)

if __name__ == "__main__":
    main()