wikiteam/commonsdownloader.py

#!/usr/bin/python
# -*- coding: utf8 -*-

# Copyright (C) 2012 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import csv
import datetime
import md5
import os
import re
import subprocess
import sys
import urllib

# CSV file listing the images; it is opened once per processed day.
filename = 'commonssql.csv'
# Step used to advance through the requested date range.
delta = datetime.timedelta(days=1)

usage_text = 'Usage: python script.py 2005-01-01 2005-01-10 [to download the first 10 days of 2005]'

# One date means "just that day"; two dates mean an inclusive range.
# Any other argument count previously left startdate/enddate as '' and the
# script died later with an AttributeError on .strftime(); report it here.
if len(sys.argv) == 1:
    print(usage_text)
    sys.exit()
if len(sys.argv) > 3:
    sys.stderr.write('Too many arguments\n')
    print(usage_text)
    sys.exit(1)

try:
    startdate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d')
    # sys.argv[-1] is the same argument as sys.argv[1] when only one date
    # was given, so a single date yields a one-day range.
    enddate = datetime.datetime.strptime(sys.argv[-1], '%Y-%m-%d')
except ValueError:
    # strptime rejects anything that is not YYYY-MM-DD; show the usage text
    # instead of a raw traceback.
    sys.stderr.write('Dates must be given as YYYY-MM-DD\n')
    print(usage_text)
    sys.exit(1)

def _run(cmd):
    """Run an external command given as an argument list and return its status.

    No shell is involved, so file names containing quotes, '$', backticks or
    backslashes reach the program unchanged. Failures are reported on stderr
    but never abort the run, matching the previous best-effort behaviour.
    """
    try:
        status = subprocess.call(cmd)
    except OSError as e:
        # Raised when the executable itself cannot be started.
        sys.stderr.write('WARNING: could not run %s: %s\n' % (cmd[0], e))
        return -1
    if status != 0:
        sys.stderr.write('WARNING: %s exited with status %d\n' % (cmd[0], status))
    return status


print("Downloading Wikimedia Commons images from %s to %s" % (
    startdate.strftime('%Y-%m-%d'), enddate.strftime('%Y-%m-%d')))

# Some names carry a 14-digit prefix followed by '!' (presumably the timestamp
# of an older file revision -- TODO confirm). The hash directory is derived
# from the name without that prefix.
prefixed_name = re.compile(r'\d{14}!')

while startdate <= enddate:
    print('== %s ==' % startdate.strftime('%Y-%m-%d'))
    day_stamp = startdate.strftime('%Y%m%d')
    path = startdate.strftime('%Y/%m/%d')
    filename7z = startdate.strftime('%Y-%m-%d.7z')
    try:
        os.makedirs(path)
    except OSError:
        # An already existing directory is fine (e.g. on a re-run); any other
        # failure would make every download below fail, so do not hide it.
        if not os.path.isdir(path):
            raise

    # 'rb' is what the Python 2 csv module asks for; the with-block closes the
    # handle, which used to leak once per processed day.
    with open(filename, 'rb') as csvfile:
        rows = csv.reader(csvfile, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        next(rows, None)  # the first row is skipped (presumably the column header)
        for img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height in rows:
            # Cheap filter first: only rows stamped with the current day matter.
            if not img_timestamp.startswith(day_stamp):
                continue

            # The csv module yields UTF-8 byte strings here; everything below
            # operates on bytes, so no decode/encode round trip is needed.
            name = img_name.replace(' ', '_')
            if prefixed_name.match(name):
                unprefixed = name[15:]  # 14 digits + '!'
            else:
                unprefixed = name
            md5_ = md5.new(unprefixed).hexdigest()
            # Bytes only: printing unicode fails when stdout is not a terminal.
            print('%s %s %s' % (img_name, name, img_timestamp))

            target = '%s/%s' % (path, name)
            # NOTE(review): prefixed names are requested from the same
            # commons/<h>/<hh>/ layout as regular ones -- confirm that this is
            # where such files are served.
            # NOTE(review): both endpoints are plain http; if the servers
            # answer with a redirect, curl will not follow it without -L.
            # Percent-encode the name so '?', '#' and '%' stay part of the path.
            url = 'http://upload.wikimedia.org/wikipedia/commons/%s/%s/%s' % (
                md5_[0], md5_[0:2], urllib.quote(name))
            _run(['wget', '-c', url, '-O', target])
            # --data-urlencode keeps '&', '%' and '+' in the name from being
            # read as form syntax; curl joins the data pieces with '&'.
            _run(['curl',
                  '--data-urlencode', 'pages=File:%s' % name,
                  '-d', 'history=1',
                  '-d', 'action=submit',
                  'http://commons.wikimedia.org/w/index.php?title=Special:Export',
                  '-o', target + '.desc'])
    #7z
    _run(['7z', 'a', filename7z, path])
    startdate += delta
first version for Wikimedia Commons images downloader GO GO GO git-svn-id: https://wikiteam.googlecode.com/svn/trunk@264 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`#!/usr/bin/python`
			`# -- coding: utf8 --`

wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`# Copyright (C) 2012 WikiTeam`
first version for Wikimedia Commons images downloader GO GO GO git-svn-id: https://wikiteam.googlecode.com/svn/trunk@264 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`

wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`import csv`
			`import datetime`
			`import md5`
			`import os`
first version for Wikimedia Commons images downloader GO GO GO git-svn-id: https://wikiteam.googlecode.com/svn/trunk@264 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`import re`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`import sys`
first version for Wikimedia Commons images downloader GO GO GO git-svn-id: https://wikiteam.googlecode.com/svn/trunk@264 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`filename = 'commonssql.csv'`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@331 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`startdate = ''`
			`enddate = ''`
			`delta = datetime.timedelta(days=1)`
			`if len(sys.argv) == 1:`
			`print 'Usage: python script.py 2005-01-01 2005-01-10 [to download the first 10 days of 2005]'`
			`sys.exit()`
			`elif len(sys.argv) == 2:`
			`startdate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d')`
			`enddate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d')`
			`elif len(sys.argv) == 3:`
			`startdate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d')`
			`enddate = datetime.datetime.strptime(sys.argv[2], '%Y-%m-%d')`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@331 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`print "Downloading Wikimedia Commons images from %s to %s" % (startdate.strftime('%Y-%m-%d'), enddate.strftime('%Y-%m-%d'))`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`while startdate <= enddate:`
			`print '==', startdate.strftime('%Y-%m-%d'), '=='`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@330 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`path = startdate.strftime('%Y/%m/%d')`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@331 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`filename7z = startdate.strftime('%Y-%m-%d.7z')`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@330 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`try:`
			`os.makedirs(path)`
			`except:`
			`pass`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`c = 1`
			`f = csv.reader(open(filename, 'r'), delimiter='\|', quotechar='"', quoting=csv.QUOTE_MINIMAL)`
			`for img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height in f:`
			`if c != 1:`
			`img_name = unicode(img_name, 'utf-8')`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@332 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`original_name = img_name`
			`if re.search(ur"(?m)^\d{14}\!", original_name):#removing XXXXXXX! from name if present`
			`original_name = original_name[15:]`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`img_user_text = unicode(img_user_text, 'utf-8')`
			`if img_timestamp.startswith(startdate.strftime('%Y%m%d')):`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@330 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`img_name_ = re.sub(r'"', r'\"', re.sub(r' ', r'_', img_name.encode('utf-8'))) # do not use u'', it is encoded`
			`print img_name, img_name_, img_timestamp`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@332 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`md5_ = md5.new(re.sub(' ', '_', original_name.encode("utf-8"))).hexdigest() # do not use img_name_, md5 needs the original name without \"`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@330 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`os.system('wget -c "http://upload.wikimedia.org/wikipedia/commons/%s/%s/%s" -O "%s/%s"' % (md5_[0], md5_[0:2], img_name_, path, img_name_))`
			`os.system('curl -d "&pages=File:%s&history=1&action=submit" http://commons.wikimedia.org/w/index.php?title=Special:Export -o "%s/%s.desc"' % (img_name_, path, img_name_))`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`c += 1`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@331 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`#7z`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@332 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`os.system('7z a %s %s' % (filename7z, path))`
wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 12 years ago			`startdate += delta`
first version for Wikimedia Commons images downloader GO GO GO git-svn-id: https://wikiteam.googlecode.com/svn/trunk@264 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago