#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2014 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
2014-02-02 18:00:07 +00:00
import getopt
2012-04-20 20:57:36 +00:00
import os
import re
2012-04-20 20:37:31 +00:00
import subprocess
2012-08-03 11:07:29 +00:00
import sys
2012-08-06 12:12:37 +00:00
import time
2012-04-20 20:57:36 +00:00
import urllib
2012-08-06 12:12:37 +00:00
import urllib2
2015-03-29 21:30:15 +00:00
import urlparse
import StringIO
2014-01-29 21:26:04 +00:00
from xml . sax . saxutils import quoteattr
2015-03-29 18:50:50 +00:00
from internetarchive import get_item
2012-08-06 12:12:37 +00:00
import dumpgenerator
2012-04-20 20:37:31 +00:00
2012-05-04 13:18:52 +00:00
# Configuration goes here
2012-08-06 12:57:41 +00:00
# You need a file named keys.txt with access and secret keys, in two different lines
2013-11-16 14:29:34 +00:00
accesskey = open ( ' keys.txt ' , ' r ' ) . readlines ( ) [ 0 ] . strip ( )
secretkey = open ( ' keys.txt ' , ' r ' ) . readlines ( ) [ 1 ] . strip ( )
2014-02-02 18:00:07 +00:00
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = ' opensource '
2012-04-20 20:37:31 +00:00
2012-08-06 18:39:27 +00:00
# Nothing to change below
convertlang = { ' ar ' : ' Arabic ' , ' de ' : ' German ' , ' en ' : ' English ' , ' es ' : ' Spanish ' , ' fr ' : ' French ' , ' it ' : ' Italian ' , ' ja ' : ' Japanese ' , ' nl ' : ' Dutch ' , ' pl ' : ' Polish ' , ' pt ' : ' Portuguese ' , ' ru ' : ' Russian ' }
2012-08-06 15:57:11 +00:00
listfile = sys . argv [ 1 ]
uploadeddumps = [ ]
try :
2015-04-07 01:06:00 +00:00
uploadeddumps = [ l . split ( ' ; ' ) [ 1 ] for l in open ( ' uploader- %s .log ' % ( listfile ) , ' r ' ) . read ( ) . strip ( ) . splitlines ( ) if len ( l . split ( ' ; ' ) ) > 1 ]
2012-08-06 15:57:11 +00:00
except :
pass
print ' %d dumps uploaded previously ' % ( len ( uploadeddumps ) )
2014-02-02 18:00:07 +00:00
def getParameters ( params = [ ] ) :
if not params :
params = sys . argv [ 2 : ]
config = {
' prune-directories ' : False ,
' prune-wikidump ' : False ,
2015-03-29 21:30:15 +00:00
' collection ' : collection ,
' update ' : False ,
2014-02-02 18:00:07 +00:00
}
#console params
try :
2015-03-29 21:30:15 +00:00
opts , args = getopt . getopt ( params , " " , [ " h " , " help " , " prune-directories " , " prune-wikidump " , " admin " , " update " ] )
2014-02-02 18:00:07 +00:00
except getopt . GetoptError , err :
# print help information and exit:
print str ( err ) # will print something like "option -a not recognized"
usage ( )
sys . exit ( 2 )
for o , a in opts :
if o in ( " -h " , " --help " ) :
usage ( )
sys . exit ( )
elif o in ( " --prune-directories " ) :
config [ ' prune-directories ' ] = True
elif o in ( " --prune-wikidump " ) :
config [ ' prune-wikidump ' ] = True
elif o in ( " --admin " ) :
config [ ' collection ' ] = " wikiteam "
2015-03-29 21:30:15 +00:00
elif o in ( " --update " ) :
config [ ' update ' ] = True
2014-02-02 18:00:07 +00:00
return config
def usage ( ) :
""" """
print """ uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive . org .
The list must be a text file with the wiki ' s api.php URLs, one per line.
Dumps must be in the same directory and follow the - wikidump .7 z / - history . xml .7 z format
as produced by launcher . py ( explained in https : / / code . google . com / p / wikiteam / wiki / NewTutorial #Publishing_the_dump ).
You need a file named keys . txt with access and secret keys , in two different lines
You also need dumpgenerator . py in the same directory as this script .
Use - - help to print this help . """
2012-08-06 13:33:04 +00:00
def log ( wiki , dump , msg ) :
2012-08-06 12:57:41 +00:00
f = open ( ' uploader- %s .log ' % ( listfile ) , ' a ' )
2012-08-06 13:33:04 +00:00
f . write ( ' \n %s ; %s ; %s ' % ( wiki , dump , msg ) )
2012-08-06 12:57:41 +00:00
f . close ( )
2014-02-02 18:00:07 +00:00
def upload ( wikis , config = { } ) :
2015-03-29 18:30:42 +00:00
headers = { ' User-Agent ' : dumpgenerator . getUserAgent ( ) }
2012-08-06 12:12:37 +00:00
for wiki in wikis :
print " # " * 73
print " # Uploading " , wiki
print " # " * 73
wiki = wiki . lower ( )
prefix = dumpgenerator . domain2prefix ( config = { ' api ' : wiki } )
2013-11-15 07:49:06 +00:00
2012-08-06 12:12:37 +00:00
wikiname = prefix . split ( ' - ' ) [ 0 ]
dumps = [ ]
for dirname , dirnames , filenames in os . walk ( ' . ' ) :
if dirname == ' . ' :
for f in filenames :
if f . startswith ( ' %s - ' % ( wikiname ) ) and ( f . endswith ( ' -wikidump.7z ' ) or f . endswith ( ' -history.xml.7z ' ) ) :
dumps . append ( f )
break
2012-04-20 21:32:08 +00:00
c = 0
2012-04-20 21:14:46 +00:00
for dump in dumps :
2014-02-02 18:00:07 +00:00
wikidate = dump . split ( ' - ' ) [ 1 ]
2015-03-29 18:30:42 +00:00
item = get_item ( ' wiki- ' + wikiname )
2012-08-06 15:57:11 +00:00
if dump in uploadeddumps :
2014-02-02 18:00:07 +00:00
if config [ ' prune-directories ' ] :
rmline = ' rm -rf %s - %s -wikidump/ ' % ( wikiname , wikidate )
# With -f the deletion might have happened before and we won't know
if not os . system ( rmline ) :
print ' DELETED %s - %s -wikidump/ ' % ( wikiname , wikidate )
if config [ ' prune-wikidump ' ] and dump . endswith ( ' wikidump.7z ' ) :
2014-02-10 23:05:44 +00:00
# Simplistic quick&dirty check for the presence of this file in the item
2014-02-02 18:00:07 +00:00
stdout , stderr = subprocess . Popen ( [ " md5sum " , dump ] , stdout = subprocess . PIPE , stderr = subprocess . PIPE ) . communicate ( )
dumphash = re . sub ( ' +.+ \n ? ' , ' ' , stdout )
2015-03-29 18:30:42 +00:00
if dumphash in map ( lambda x : x [ ' md5 ' ] , item . files ) :
2014-02-02 18:00:07 +00:00
log ( wiki , dump , ' verified ' )
rmline = ' rm -rf %s ' % dump
if not os . system ( rmline ) :
print ' DELETED ' + dump
2014-02-10 23:05:44 +00:00
print ' %s was uploaded before, skipping... ' % ( dump )
continue
2014-02-02 18:00:07 +00:00
else :
print ' ERROR: The online item misses ' + dump
log ( wiki , dump , ' missing ' )
2014-02-10 23:05:44 +00:00
# We'll exit this if and go upload the dump
else :
print ' %s was uploaded before, skipping... ' % ( dump )
continue
2014-02-02 11:58:49 +00:00
2012-08-06 12:12:37 +00:00
time . sleep ( 0.1 )
2012-08-06 14:05:45 +00:00
wikidate_text = wikidate [ 0 : 4 ] + ' - ' + wikidate [ 4 : 6 ] + ' - ' + wikidate [ 6 : 8 ]
2012-04-20 21:14:46 +00:00
print wiki , wikiname , wikidate , dump
2014-02-02 11:58:49 +00:00
# Does the item exist already?
2015-03-29 18:30:42 +00:00
ismissingitem = not item . exists
2014-02-02 11:58:49 +00:00
2015-03-29 21:30:15 +00:00
# Logo path
logourl = ' '
2014-02-02 11:58:49 +00:00
# We don't know a way to fix/overwrite metadata if item exists already:
# just pass bogus data and save some time
2015-03-29 21:30:15 +00:00
if ismissingitem or config [ ' update ' ] :
2014-02-02 11:58:49 +00:00
#get metadata from api.php
#first sitename and base url
params = { ' action ' : ' query ' , ' meta ' : ' siteinfo ' , ' format ' : ' xml ' }
data = urllib . urlencode ( params )
req = urllib2 . Request ( url = wiki , data = data , headers = headers )
xml = ' '
2012-08-06 15:41:50 +00:00
try :
2014-02-02 11:58:49 +00:00
f = urllib2 . urlopen ( req )
xml = f . read ( )
2012-08-06 15:41:50 +00:00
f . close ( )
except :
pass
2014-02-02 11:58:49 +00:00
sitename = ' '
baseurl = ' '
lang = ' '
try :
sitename = re . findall ( ur " sitename= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
try :
baseurl = re . findall ( ur " base= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
2012-08-06 13:33:04 +00:00
try :
2014-02-02 11:58:49 +00:00
lang = re . findall ( ur " lang= \" ([^ \" ]+) \" " , xml ) [ 0 ]
2012-08-06 13:33:04 +00:00
except :
pass
2014-02-02 11:58:49 +00:00
if not sitename :
sitename = wikiname
if not baseurl :
baseurl = re . sub ( ur " (?im)/api \ .php " , ur " " , wiki )
if lang :
lang = convertlang . has_key ( lang . lower ( ) ) and convertlang [ lang . lower ( ) ] or lang . lower ( )
#now copyright info from API
params = { ' action ' : ' query ' , ' siprop ' : ' general|rightsinfo ' , ' format ' : ' xml ' }
data = urllib . urlencode ( params )
req = urllib2 . Request ( url = wiki , data = data , headers = headers )
xml = ' '
try :
f = urllib2 . urlopen ( req )
xml = f . read ( )
f . close ( )
except :
pass
rightsinfourl = ' '
rightsinfotext = ' '
2012-08-06 13:33:04 +00:00
try :
2014-02-02 11:58:49 +00:00
rightsinfourl = re . findall ( ur " rightsinfo url= \" ([^ \" ]+) \" " , xml ) [ 0 ]
rightsinfotext = re . findall ( ur " text= \" ([^ \" ]+) \" " , xml ) [ 0 ]
2012-08-06 13:33:04 +00:00
except :
pass
2014-02-02 11:58:49 +00:00
2015-03-29 21:30:15 +00:00
raw = ' '
try :
f = urllib . urlopen ( baseurl )
raw = f . read ( )
f . close ( )
except :
pass
2014-02-02 11:58:49 +00:00
#or copyright info from #footer in mainpage
if baseurl and not rightsinfourl and not rightsinfotext :
rightsinfotext = ' '
rightsinfourl = ' '
try :
rightsinfourl = re . findall ( ur " <link rel= \" copyright \" href= \" ([^ \" ]+) \" /> " , raw ) [ 0 ]
except :
pass
try :
rightsinfotext = re . findall ( ur " <li id= \" copyright \" >([^ \n \r ]*?)</li> " , raw ) [ 0 ]
except :
pass
if rightsinfotext and not rightsinfourl :
rightsinfourl = baseurl + ' #footer '
2015-03-29 21:30:15 +00:00
try :
logourl = re . findall ( ur ' p-logo[ " \' ][^>]*> \ s*<a [^>]*background-image: \ s*(?:url \ ()?([^;) " ]+) ' , raw ) [ 0 ]
except :
pass
print logourl
2014-02-02 11:58:49 +00:00
#retrieve some info from the wiki
wikititle = " Wiki - %s " % ( sitename ) # Wiki - ECGpedia
2014-11-27 19:12:27 +00:00
wikidesc = " <a href= \" %s \" > %s </a> dumped with <a href= \" https://github.com/WikiTeam/wikiteam \" rel= \" nofollow \" >WikiTeam</a> tools. " % ( baseurl , sitename ) # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools."
2014-02-02 11:58:49 +00:00
wikikeys = [ ' wiki ' , ' wikiteam ' , ' MediaWiki ' , sitename , wikiname ] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
if not rightsinfourl and not rightsinfotext :
wikikeys . append ( ' unknowncopyright ' )
wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
else :
2014-02-02 18:00:07 +00:00
print ' Item already exists. '
2014-02-02 11:58:49 +00:00
lang = ' foo '
wikititle = ' foo '
wikidesc = ' foo '
wikikeys = ' foo '
wikilicenseurl = ' foo '
wikirights = ' foo '
wikiurl = ' foo '
2012-04-20 21:32:08 +00:00
if c == 0 :
2015-03-29 18:30:42 +00:00
# Item metadata
md = {
' mediatype ' : ' web ' ,
' collection ' : config [ ' collection ' ] ,
' title ' : wikititle ,
' description ' : wikidesc ,
' language ' : lang ,
' last-updated-date ' : wikidate_text ,
' subject ' : ' ; ' . join ( wikikeys ) , # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
2015-04-07 01:06:00 +00:00
' licenseurl ' : wikilicenseurl and urlparse . urljoin ( wiki , wikilicenseurl ) ,
2015-03-29 18:30:42 +00:00
' rights ' : wikirights ,
' originalurl ' : wikiurl ,
}
2014-02-02 11:58:49 +00:00
2014-01-29 15:55:16 +00:00
#now also to update the metadata
#TODO: not needed for the second file in an item
2015-03-29 18:30:42 +00:00
try :
2015-03-29 18:50:50 +00:00
item . upload ( dump , metadata = md , access_key = accesskey , secret_key = secretkey , verbose = True )
2015-03-29 21:30:15 +00:00
if logourl :
logo = StringIO . StringIO ( urllib . urlopen ( urlparse . urljoin ( wiki , logourl ) ) . read ( ) )
logoextension = logourl . split ( ' . ' ) [ - 1 ] if logourl . split ( ' . ' ) else ' unknown '
logo . name = ' wiki- ' + wikiname + ' _logo. ' + logoextension
item . upload ( logo , access_key = accesskey , secret_key = secretkey , verbose = True )
2014-01-29 10:44:52 +00:00
uploadeddumps . append ( dump )
log ( wiki , dump , ' ok ' )
2015-03-29 18:30:42 +00:00
except :
2015-04-07 01:06:00 +00:00
print wiki , dump , ' error when uploading? '
2015-03-29 21:30:15 +00:00
2012-04-20 21:32:08 +00:00
c + = 1
2012-04-20 20:57:36 +00:00
2014-02-02 18:00:07 +00:00
def main ( params = [ ] ) :
config = getParameters ( params = params )
2012-08-06 13:33:04 +00:00
wikis = open ( listfile , ' r ' ) . read ( ) . strip ( ) . splitlines ( )
2014-02-02 18:00:07 +00:00
upload ( wikis , config )
2012-04-20 20:57:36 +00:00
if __name__ == " __main__ " :
main ( )