#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2012 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# uploader.py
# This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
# The list must be a text file with the wiki's api.php URLs, one per line.
# Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
# as produced by launcher.py (explained in https://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump ).
# Adjust your configuration; see below "Configuration goes here".
# You also need dumpgenerator.py in the same directory as this script.
# Developing scratchpad
2012-05-04 13:18:52 +00:00
# Keys: http://archive.org/account/s3.php
# Documentation: http://archive.org/help/abouts3.txt
2013-10-16 17:57:02 +00:00
# https://github.com/kngenie/ias3upload
2012-08-06 13:33:04 +00:00
# http://en.ecgpedia.org/api.php?action=query&meta=siteinfo&siprop=general|rightsinfo&format=xml
2012-08-06 22:24:50 +00:00
#
2012-08-07 06:41:53 +00:00
# TODO: bug - upload may (partly) fail if two (small) files are sent to s3 without pause http://p.defau.lt/?puN_G_zKXbv1lz9TfSliPg http://archive.org/details/wiki-editionorg_w or something http://p.defau.lt/?udwrG7YQn4RK_1whl1XWRw http://archive.org/details/wiki-jutedreamhosterscom_lineageii_bestiary
2012-08-08 17:27:40 +00:00
# TODO: bug - translate relative copyright URLs as in http://archive.org/details/wiki-wikipovrayorg now linking http://archive.org/content/POV-Wiki:Copyrights
2012-08-07 05:47:16 +00:00
# TODO: minor bug - skip sites requiring authentication without asking user input (e.g. ilab.usc.edu)
2012-08-06 22:24:50 +00:00
# TODO: minor bug - don't overwrite existing files with same filename in the same identifier
2012-08-07 05:47:16 +00:00
# TODO: trivial bug - check for duplicates with originalurl http://archive.org/details/wiki-enecgpediaorg http://archive.org/details/wiki-en.ecgpedia.org
# TODO: enhancement - download wiki logo and upload as *-logo.png, should suffice to get the thumb used
# TODO: enhancement - fix escapement? http://archive.org/details/wiki-encitizendiumorg
2012-04-20 20:19:40 +00:00
2012-04-20 20:57:36 +00:00
import os
import re
2012-04-20 20:37:31 +00:00
import subprocess
2012-08-03 11:07:29 +00:00
import sys
2012-08-06 12:12:37 +00:00
import time
2012-04-20 20:57:36 +00:00
import urllib
2012-08-06 12:12:37 +00:00
import urllib2
import dumpgenerator
2012-04-20 20:37:31 +00:00
2012-05-04 13:18:52 +00:00
# Configuration goes here
2012-08-06 12:57:41 +00:00
# You need a file named keys.txt with access and secret keys, in two different lines
2013-11-16 14:29:34 +00:00
accesskey = open ( ' keys.txt ' , ' r ' ) . readlines ( ) [ 0 ] . strip ( )
secretkey = open ( ' keys.txt ' , ' r ' ) . readlines ( ) [ 1 ] . strip ( )
2012-08-06 12:57:41 +00:00
collection = ' wikiteam ' # Replace with "opensource" if you are not an admin of the collection
2012-08-06 18:39:27 +00:00
# end configuration
2012-04-20 20:37:31 +00:00
2012-08-06 18:39:27 +00:00
# Nothing to change below
convertlang = { ' ar ' : ' Arabic ' , ' de ' : ' German ' , ' en ' : ' English ' , ' es ' : ' Spanish ' , ' fr ' : ' French ' , ' it ' : ' Italian ' , ' ja ' : ' Japanese ' , ' nl ' : ' Dutch ' , ' pl ' : ' Polish ' , ' pt ' : ' Portuguese ' , ' ru ' : ' Russian ' }
2012-08-06 15:57:11 +00:00
listfile = sys . argv [ 1 ]
uploadeddumps = [ ]
try :
uploadeddumps = [ l . split ( ' ; ' ) [ 1 ] for l in open ( ' uploader- %s .log ' % ( listfile ) , ' r ' ) . read ( ) . strip ( ) . splitlines ( ) ]
except :
pass
print ' %d dumps uploaded previously ' % ( len ( uploadeddumps ) )
2012-08-06 13:33:04 +00:00
def log(wiki, dump, msg):
    """Append one "wiki;dump;msg" line to this list's upload log file.

    The log is later re-read at startup to skip already-uploaded dumps.
    Uses a context manager so the handle is closed even if write() fails.
    """
    with open('uploader-%s.log' % (listfile), 'a') as f:
        f.write('\n%s;%s;%s' % (wiki, dump, msg))
def upload ( wikis ) :
2012-08-06 12:12:37 +00:00
for wiki in wikis :
print " # " * 73
print " # Uploading " , wiki
print " # " * 73
wiki = wiki . lower ( )
prefix = dumpgenerator . domain2prefix ( config = { ' api ' : wiki } )
2013-11-15 07:49:06 +00:00
2012-08-06 12:12:37 +00:00
wikiname = prefix . split ( ' - ' ) [ 0 ]
dumps = [ ]
for dirname , dirnames , filenames in os . walk ( ' . ' ) :
if dirname == ' . ' :
for f in filenames :
if f . startswith ( ' %s - ' % ( wikiname ) ) and ( f . endswith ( ' -wikidump.7z ' ) or f . endswith ( ' -history.xml.7z ' ) ) :
dumps . append ( f )
break
2012-04-20 21:32:08 +00:00
c = 0
2012-04-20 21:14:46 +00:00
for dump in dumps :
2012-08-06 15:57:11 +00:00
if dump in uploadeddumps :
2013-10-16 17:57:02 +00:00
print ' %s was uploaded before, skipping... ' % ( dump )
2012-08-06 15:57:11 +00:00
continue
2012-08-06 12:12:37 +00:00
time . sleep ( 0.1 )
wikidate = dump . split ( ' - ' ) [ 1 ]
2012-08-06 14:05:45 +00:00
wikidate_text = wikidate [ 0 : 4 ] + ' - ' + wikidate [ 4 : 6 ] + ' - ' + wikidate [ 6 : 8 ]
2012-04-20 21:14:46 +00:00
print wiki , wikiname , wikidate , dump
2012-08-06 12:12:37 +00:00
#get metadata from api.php
2012-08-06 13:33:04 +00:00
#first sitename and base url
2012-08-06 12:12:37 +00:00
headers = { ' User-Agent ' : dumpgenerator . getUserAgent ( ) }
2012-08-06 13:33:04 +00:00
params = { ' action ' : ' query ' , ' meta ' : ' siteinfo ' , ' format ' : ' xml ' }
2012-08-06 12:12:37 +00:00
data = urllib . urlencode ( params )
req = urllib2 . Request ( url = wiki , data = data , headers = headers )
2012-08-06 14:05:45 +00:00
xml = ' '
2012-08-06 12:12:37 +00:00
try :
f = urllib2 . urlopen ( req )
2012-08-06 14:05:45 +00:00
xml = f . read ( )
f . close ( )
2012-08-06 12:12:37 +00:00
except :
2012-08-06 14:05:45 +00:00
pass
2012-08-06 12:12:37 +00:00
sitename = ' '
2012-08-06 12:57:41 +00:00
baseurl = ' '
2012-08-06 18:39:27 +00:00
lang = ' '
2012-08-06 12:12:37 +00:00
try :
sitename = re . findall ( ur " sitename= \" ([^ \" ]+) \" " , xml ) [ 0 ]
2012-08-06 18:39:27 +00:00
except :
pass
try :
2012-08-06 13:33:04 +00:00
baseurl = re . findall ( ur " base= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
2012-08-06 18:39:27 +00:00
try :
lang = re . findall ( ur " lang= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
2012-08-06 13:33:04 +00:00
2012-08-06 14:05:45 +00:00
if not sitename :
sitename = wikiname
if not baseurl :
baseurl = re . sub ( ur " (?im)/api \ .php " , ur " " , wiki )
2012-08-06 18:39:27 +00:00
if lang :
lang = convertlang . has_key ( lang . lower ( ) ) and convertlang [ lang . lower ( ) ] or lang . lower ( )
2012-08-06 14:05:45 +00:00
2012-08-06 13:33:04 +00:00
#now copyright info from API
params = { ' action ' : ' query ' , ' siprop ' : ' general|rightsinfo ' , ' format ' : ' xml ' }
data = urllib . urlencode ( params )
req = urllib2 . Request ( url = wiki , data = data , headers = headers )
xml = ' '
try :
f = urllib2 . urlopen ( req )
xml = f . read ( )
f . close ( )
except :
pass
rightsinfourl = ' '
rightsinfotext = ' '
try :
2012-08-06 12:12:37 +00:00
rightsinfourl = re . findall ( ur " rightsinfo url= \" ([^ \" ]+) \" " , xml ) [ 0 ]
rightsinfotext = re . findall ( ur " text= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
2012-08-06 13:33:04 +00:00
#or copyright info from #footer in mainpage
2012-08-06 14:05:45 +00:00
if baseurl and not rightsinfourl and not rightsinfotext :
2012-08-06 15:41:50 +00:00
raw = ' '
try :
f = urllib . urlopen ( baseurl )
raw = f . read ( )
f . close ( )
except :
pass
2012-08-06 13:33:04 +00:00
rightsinfotext = ' '
rightsinfourl = ' '
try :
rightsinfourl = re . findall ( ur " <link rel= \" copyright \" href= \" ([^ \" ]+) \" /> " , raw ) [ 0 ]
except :
pass
try :
rightsinfotext = re . findall ( ur " <li id= \" copyright \" >([^ \n \r ]*?)</li> " , raw ) [ 0 ]
except :
pass
if rightsinfotext and not rightsinfourl :
rightsinfourl = baseurl + ' #footer '
2012-04-20 21:32:08 +00:00
#retrieve some info from the wiki
2012-08-06 12:12:37 +00:00
wikititle = " Wiki - %s " % ( sitename ) # Wiki - ECGpedia
2012-08-06 12:57:41 +00:00
wikidesc = " <a href= \" %s \" > %s </a> dumped with <a href= \" http://code.google.com/p/wikiteam/ \" rel= \" nofollow \" >WikiTeam</a> tools. " % ( baseurl , sitename ) # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools."
2012-08-06 12:12:37 +00:00
wikikeys = [ ' wiki ' , ' wikiteam ' , ' MediaWiki ' , sitename , wikiname ] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
2012-08-06 14:05:45 +00:00
if not rightsinfourl and not rightsinfotext :
wikikeys . append ( ' unknowncopyright ' )
2012-08-06 12:12:37 +00:00
wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
2012-04-20 21:32:08 +00:00
#creates curl command
curl = [ ' curl ' , ' --location ' ,
2012-05-04 13:18:52 +00:00
' --header ' , " ' x-amz-auto-make-bucket:1 ' " , # Creates the item automatically, need to give some time for the item to correctly be created on archive.org, or everything else will fail, showing "bucket not found" error
2012-04-22 09:00:19 +00:00
' --header ' , " ' x-archive-queue-derive:0 ' " ,
2012-04-20 21:32:08 +00:00
' --header ' , " ' x-archive-size-hint: %d ' " % ( os . path . getsize ( dump ) ) ,
' --header ' , " ' authorization: LOW %s : %s ' " % ( accesskey , secretkey ) ,
]
if c == 0 :
curl + = [ ' --header ' , " ' x-archive-meta-mediatype:web ' " ,
2012-05-11 03:56:27 +00:00
' --header ' , " ' x-archive-meta-collection: %s ' " % ( collection ) ,
2012-04-20 21:32:08 +00:00
' --header ' , " ' x-archive-meta-title: %s ' " % ( wikititle ) ,
' --header ' , " ' x-archive-meta-description: %s ' " % ( wikidesc ) ,
2012-08-06 18:39:27 +00:00
' --header ' , " ' x-archive-meta-language: %s ' " % ( lang ) ,
2012-08-06 14:07:46 +00:00
' --header ' , " ' x-archive-meta-last-updated-date: %s ' " % ( wikidate_text ) ,
2012-05-04 13:18:52 +00:00
' --header ' , " ' x-archive-meta-subject: %s ' " % ( ' ; ' . join ( wikikeys ) ) , # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
2012-04-20 21:32:08 +00:00
' --header ' , " ' x-archive-meta-licenseurl: %s ' " % ( wikilicenseurl ) ,
' --header ' , " ' x-archive-meta-rights: %s ' " % ( wikirights ) ,
' --header ' , " ' x-archive-meta-originalurl: %s ' " % ( wikiurl ) ,
]
curl + = [ ' --upload-file ' , " %s " % ( dump ) ,
2013-11-23 12:19:46 +00:00
" http://s3.us.archive.org/wiki- %s / %s " % ( wikiname , dump ) , # It could happen that the identifier is taken by another user; only wikiteam collection admins will be able to upload more files to it, curl will fail immediately and get a permissions error by s3.
2012-04-20 21:32:08 +00:00
]
2012-08-03 11:07:29 +00:00
curlline = ' ' . join ( curl )
os . system ( curlline )
2012-04-20 21:32:08 +00:00
c + = 1
2012-08-06 15:57:11 +00:00
uploadeddumps . append ( dump )
2012-08-06 13:33:04 +00:00
log ( wiki , dump , ' ok ' )
2012-04-20 20:57:36 +00:00
def main():
    """Read the list of wiki api.php URLs from `listfile` and upload their dumps."""
    # Context manager closes the list file instead of leaking the handle
    with open(listfile, 'r') as f:
        wikis = f.read().strip().splitlines()
    upload(wikis)

if __name__ == "__main__":
    main()