#!/usr/bin/python
# -*- coding: utf8 -*-
# Copyright (C) 2012 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import datetime
import md5
import os
import re
import sys
filename = ' commonssql.csv '
startdate = ' '
enddate = ' '
delta = datetime . timedelta ( days = 1 )
if len ( sys . argv ) == 1 :
print ' Usage: python script.py 2005-01-01 2005-01-10 [to download the first 10 days of 2005] '
sys . exit ( )
elif len ( sys . argv ) == 2 :
startdate = datetime . datetime . strptime ( sys . argv [ 1 ] , ' % Y- % m- %d ' )
enddate = datetime . datetime . strptime ( sys . argv [ 1 ] , ' % Y- % m- %d ' )
elif len ( sys . argv ) == 3 :
startdate = datetime . datetime . strptime ( sys . argv [ 1 ] , ' % Y- % m- %d ' )
enddate = datetime . datetime . strptime ( sys . argv [ 2 ] , ' % Y- % m- %d ' )
else :
sys . exit ( )
print " Downloading Wikimedia Commons files from %s to %s " % ( startdate . strftime ( ' % Y- % m- %d ' ) , enddate . strftime ( ' % Y- % m- %d ' ) )
while startdate < = enddate :
print ' == ' , startdate . strftime ( ' % Y- % m- %d ' ) , ' == '
path = startdate . strftime ( ' % Y/ % m/ %d ' )
filenamezip = startdate . strftime ( ' % Y- % m- %d .zip ' )
try :
os . makedirs ( path )
except :
pass
c = 1
f = csv . reader ( open ( filename , ' r ' ) , delimiter = ' | ' , quotechar = ' " ' , quoting = csv . QUOTE_MINIMAL )
for img_name , img_timestamp , img_user , img_user_text , img_size , img_width , img_height in f :
if c != 1 :
img_name = unicode ( img_name , ' utf-8 ' )
original_name = img_name
if re . search ( ur " (?m)^ \ d {14} \ ! " , original_name ) : #removing XXXXXXX! from name if present
original_name = original_name [ 15 : ]
img_user_text = unicode ( img_user_text , ' utf-8 ' )
if img_timestamp . startswith ( startdate . strftime ( ' % Y % m %d ' ) ) :
original_name_ = re . sub ( r ' " ' , r ' \ " ' , re . sub ( r ' ' , r ' _ ' , original_name . encode ( ' utf-8 ' ) ) ) # do not use ur'', it is encoded
img_name_ = re . sub ( r ' " ' , r ' \ " ' , re . sub ( r ' ' , r ' _ ' , img_name . encode ( ' utf-8 ' ) ) ) # do not use ur'', it is encoded
print img_name , img_name_ , img_timestamp
md5_ = md5 . new ( re . sub ( ' ' , ' _ ' , original_name . encode ( " utf-8 " ) ) ) . hexdigest ( ) # do not use img_name_, md5 needs the original name without \"
if original_name != img_name :
os . system ( ' wget -c " http://upload.wikimedia.org/wikipedia/commons/archive/ %s / %s / %s " -O " %s / %s " ' % ( md5_ [ 0 ] , md5_ [ 0 : 2 ] , img_name_ , path , img_name_ ) )
if not os . path . getsize ( ' %s / %s ' % ( path , img_name_ ) ) : #empty file?, false XXXXXX! begining like this http://commons.wikimedia.org/wiki/File:20041028210012!Pilar.jpg ? ok, restore original_name to ! version
#recalculate md5 and other variables that use original_name as source
original_name = img_name
original_name_ = re . sub ( r ' " ' , r ' \ " ' , re . sub ( r ' ' , r ' _ ' , original_name . encode ( ' utf-8 ' ) ) )
md5_ = md5 . new ( re . sub ( ' ' , ' _ ' , original_name . encode ( " utf-8 " ) ) ) . hexdigest ( )
#redownload, now without /archive/ subpath
os . system ( ' wget -c " http://upload.wikimedia.org/wikipedia/commons/ %s / %s / %s " -O " %s / %s " ' % ( md5_ [ 0 ] , md5_ [ 0 : 2 ] , img_name_ , path , img_name_ ) )
else :
os . system ( ' wget -c " http://upload.wikimedia.org/wikipedia/commons/ %s / %s / %s " -O " %s / %s " ' % ( md5_ [ 0 ] , md5_ [ 0 : 2 ] , img_name_ , path , img_name_ ) )
os . system ( ' curl -d " &pages=File: %s &history=1&action=submit " http://commons.wikimedia.org/w/index.php?title=Special:Export -o " %s / %s .desc " ' % ( original_name_ , path , img_name_ ) )
c + = 1
#zip files
os . system ( ' zip -9 %s %s /* ' % ( filenamezip , path ) )
startdate + = delta