From b80d988831587456655b2688f65bcc569cd393cd Mon Sep 17 00:00:00 2001 From: emijrp Date: Mon, 27 Feb 2012 15:39:55 +0000 Subject: [PATCH] wikimedia commons downloader git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 --- commonsdownloader.py | 55 +++++++++++++++++++-------------------- commonssql.py | 59 ++++++++++++++++++++++++++++++++++++++++++ editthis/editthis.info | 1 - 3 files changed, 85 insertions(+), 30 deletions(-) create mode 100644 commonssql.py diff --git a/commonsdownloader.py b/commonsdownloader.py index 20bca4f..76b380d 100644 --- a/commonsdownloader.py +++ b/commonsdownloader.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf8 -*- -# Copyright (C) 2011 WikiTeam +# Copyright (C) 2012 WikiTeam # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or @@ -15,35 +15,32 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -import urllib -import sys +import csv +import datetime +import md5 +import os import re -import codecs +import sys -""" -recibe un argumento: 2005 (baja todo el año), 2005-01 (todo el mes), 2005-01-01 (un solo día), pero siempre organiza en directorios 2005 |_ 2005-01 |_ 2005-01-01, etc +startdate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d') +enddate = datetime.datetime.strptime(sys.argv[2], '%Y-%m-%d') +delta = datetime.timedelta(days=1) +filename = 'commonssql.csv' -http://www.mediawiki.org/wiki/Manual:Oldimage_table +while startdate <= enddate: + print '==', startdate.strftime('%Y-%m-%d'), '==' + c = 1 + f = csv.reader(open(filename, 'r'), delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL) + for img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height in f: + if c != 1: + img_name = unicode(img_name, 'utf-8') + img_user_text = unicode(img_user_text, 'utf-8') + if img_timestamp.startswith(startdate.strftime('%Y%m%d')): + print img_name.encode('utf-8'), img_timestamp + img_name_ = re.sub('"', '\"', img_name) + md5_ = md5.new(img_name.encode('utf-8')).hexdigest() + #os.system('wget -c "http://upload.wikimedia.org/wikipedia/commons/%s/%s/%s" -O "%s' % (md5_[0], md5_[0:2], img_name, img_name_)) + #os.system('curl -d "&pages=File:%s&history=1&action=submit" http://commons.wikimedia.org/w/index.php?title=Special:Export -o "%s.desc' % (img_name_, img_name_)) + c += 1 + startdate += delta -substr para el X y XY del md5 http://dev.mysql.com/doc/refman/5.0/en/string-functions.html#function_substr - -mysql -h commons-p.db.toolserver.org -e "use commonswiki_p;select oi_archive_name, oi_timestamp from oldimage where 1;" > test.txt -""" - -def getUserAgent(): - """ Return a cool user-agent to hide Python user-agent """ - useragents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'] - return useragents[0] - -class AppURLopener(urllib.FancyURLopener): - version = getUserAgent() - -urllib._urlopener = AppURLopener() - -f = open('image.list', 'r') -l = f.read().splitlines() -f.close() - -for i in l: - i=urllib.unquote(i) - urllib.urlretrieve(i, i.split('/')[-1]) diff --git a/commonssql.py b/commonssql.py new file mode 100644 index 0000000..a443b8a --- /dev/null +++ b/commonssql.py @@ -0,0 +1,59 @@ +#!/usr/bin/python +# -*- coding: utf8 -*- + +# Copyright (C) 2012 WikiTeam +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import csv +import MySQLdb +import re + +filename = 'commonssql.csv' +f = open(filename, 'w') +f.write('img_name|img_timestamp|img_user|img_user_text|img_size|img_width|img_height\n') +f.close() + +#http://www.mediawiki.org/wiki/Manual:Image_table +#http://www.mediawiki.org/wiki/Manual:Oldimage_table +queries = [ + "SELECT /* commonssql.py SLOW_OK */ img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height FROM image WHERE img_timestamp<20070101000000 ORDER BY img_timestamp ASC", + "SELECT /* commonssql.py SLOW_OK */ oi_archive_name AS img_name, oi_timestamp AS img_timestamp, oi_user AS img_user, oi_user_text AS img_user_text, oi_size AS img_size, oi_width AS img_width, oi_height AS img_height FROM oldimage WHERE oi_deleted=0 AND oi_timestamp<20070101000000 ORDER BY oi_timestamp ASC" #do not get unavailable images +] + +f = csv.writer(open(filename, 'a'), delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL) +conn = MySQLdb.connect(host='sql-s4', db='commonswiki_p', read_default_file='~/.my.cnf') +for query in queries: + conn.query(query) + r = conn.store_result() + c = 0 + row = r.fetch_row(maxrows=1, how=1) + rows = [] + while row: + if len(row) == 1: + img_name = re.sub(u' ', u'_', unicode(row[0]['img_name'], 'utf-8')) + img_timestamp = row[0]['img_timestamp'] + img_user = row[0]['img_user'] + img_user_text = re.sub(u' ', u'_', unicode(row[0]['img_user_text'], 'utf-8')) + img_size = row[0]['img_size'] + img_width = row[0]['img_width'] + img_height = row[0]['img_height'] + + rows.append([img_name.encode('utf-8'), img_timestamp, img_user, img_user_text.encode('utf-8'), img_size, img_width, img_height]) + c += 1 + if c % 10000 == 0: + print c + f.writerows(rows) + rows = [] + row = r.fetch_row(maxrows=1, how=1) + f.writerows(rows) diff --git a/editthis/editthis.info b/editthis/editthis.info index 73ecd58..b0f5f8b 100644 --- a/editthis/editthis.info +++ b/editthis/editthis.info @@ -1,4 +1,3 @@ -http://www.editthis.info/1337 http://www.editthis.info/1742_ http://www.editthis.info/1t0meds http://www.editthis.info/20803