From 93f7ecf155e11365d809e29661e929c12828cf77 Mon Sep 17 00:00:00 2001 From: emijrp Date: Wed, 3 Aug 2016 00:29:29 +0200 Subject: [PATCH] own directory for Wikimedia Commons, fixing commonssql.py db query --- commonssql.py | 61 ----------------- .../commonschecker.py | 0 .../commonsdownloader.py | 0 wikimediacommons/commonssql.py | 66 +++++++++++++++++++ 4 files changed, 66 insertions(+), 61 deletions(-) delete mode 100644 commonssql.py rename commonschecker.py => wikimediacommons/commonschecker.py (100%) rename commonsdownloader.py => wikimediacommons/commonsdownloader.py (100%) create mode 100644 wikimediacommons/commonssql.py diff --git a/commonssql.py b/commonssql.py deleted file mode 100644 index 8b65eea..0000000 --- a/commonssql.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf8 -*- - -# Copyright (C) 2012 WikiTeam -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>.
- -import csv -import MySQLdb -import re -import sys - -filename = 'commonssql.csv' -f = open(filename, 'w') -f.write('img_name|img_timestamp|img_user|img_user_text|img_size|img_width|img_height\n') -f.close() - -#http://www.mediawiki.org/wiki/Manual:Image_table -#http://www.mediawiki.org/wiki/Manual:Oldimage_table -year = int(sys.argv[1]) -queries = [ - "SELECT /* commonssql.py SLOW_OK */ img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height FROM image WHERE img_timestamp>=%d0101000000 AND img_timestamp<=%d1231235959 ORDER BY img_timestamp ASC" % (year, year), - "SELECT /* commonssql.py SLOW_OK */ oi_archive_name AS img_name, oi_timestamp AS img_timestamp, oi_user AS img_user, oi_user_text AS img_user_text, oi_size AS img_size, oi_width AS img_width, oi_height AS img_height FROM oldimage WHERE oi_deleted=0 AND oi_timestamp>=%d0101000000 AND oi_timestamp<=%d1231235959 ORDER BY oi_timestamp ASC" % (year, year), #do not get unavailable images -] - -f = csv.writer(open(filename, 'a'), delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL) -conn = MySQLdb.connect(host='commonswiki.labsdb', db='commonswiki_p', read_default_file='~/.my.cnf') -for query in queries: - conn.query(query) - r = conn.store_result() - c = 0 - row = r.fetch_row(maxrows=1, how=1) - rows = [] - while row: - if len(row) == 1: - img_name = re.sub(u' ', u'_', unicode(row[0]['img_name'], 'utf-8')) - img_timestamp = row[0]['img_timestamp'] - img_user = row[0]['img_user'] - img_user_text = re.sub(u' ', u'_', unicode(row[0]['img_user_text'], 'utf-8')) - img_size = row[0]['img_size'] - img_width = row[0]['img_width'] - img_height = row[0]['img_height'] - - rows.append([img_name.encode('utf-8'), img_timestamp, img_user, img_user_text.encode('utf-8'), img_size, img_width, img_height]) - c += 1 - if c % 10000 == 0: - print c - f.writerows(rows) - rows = [] - row = r.fetch_row(maxrows=1, how=1) - f.writerows(rows) diff --git a/commonschecker.py 
b/wikimediacommons/commonschecker.py similarity index 100% rename from commonschecker.py rename to wikimediacommons/commonschecker.py diff --git a/commonsdownloader.py b/wikimediacommons/commonsdownloader.py similarity index 100% rename from commonsdownloader.py rename to wikimediacommons/commonsdownloader.py diff --git a/wikimediacommons/commonssql.py b/wikimediacommons/commonssql.py new file mode 100644 index 0000000..830a709 --- /dev/null +++ b/wikimediacommons/commonssql.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python2 +# -*- coding: utf8 -*- + +# Copyright (C) 2012-2016 WikiTeam developers +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import MySQLdb
import re
import sys

# Column order shared by the CSV header line and every data row.
_FIELDS = (
    'img_name',
    'img_timestamp',
    'img_user',
    'img_user_text',
    'img_size',
    'img_width',
    'img_height',
)

# Rows pulled from the result set (and written to disk) per fetch_row() call.
# Progress is printed once per full batch, i.e. every 10000 rows.
_BATCH_SIZE = 10000

# True on Python 2, where the native ``str`` type is a byte string.
_NATIVE_STR_IS_BYTES = str is bytes


def _build_queries(year):
    """Return the SQL statements that list every upload made during *year*.

    The first statement reads current file versions from ``image``; the
    second reads superseded versions from ``oldimage`` and aliases its
    ``oi_*`` columns to the ``img_*`` names so both result sets share one
    row layout.

    *year* must be an int; it is interpolated with ``%d`` only.
    """
    #http://www.mediawiki.org/wiki/Manual:Image_table
    #http://www.mediawiki.org/wiki/Manual:Oldimage_table
    current = (
        "SELECT /* commonssql.py SLOW_OK */ img_name, img_timestamp, "
        "img_user, img_user_text, img_size, img_width, img_height "
        "FROM image "
        "WHERE img_timestamp>=%d0101000000 AND img_timestamp<=%d1231235959 "
        "ORDER BY img_timestamp ASC"
    ) % (year, year)
    # oi_deleted=0: do not get unavailable images
    archived = (
        "SELECT /* commonssql.py SLOW_OK */ oi_archive_name AS img_name, "
        "oi_timestamp AS img_timestamp, oi_user AS img_user, "
        "oi_user_text AS img_user_text, oi_size AS img_size, "
        "oi_width AS img_width, oi_height AS img_height "
        "FROM oldimage "
        "WHERE oi_deleted=0 AND oi_timestamp>=%d0101000000 "
        "AND oi_timestamp<=%d1231235959 "
        "ORDER BY oi_timestamp ASC"
    ) % (year, year)
    return [current, archived]


def _csv_text(value):
    """Return *value* as a native ``str`` with spaces replaced by underscores.

    The connection is opened with ``use_unicode=True``, so the driver may
    hand back either byte strings or unicode text. The Python 2 ``csv``
    writer cannot encode non-ASCII unicode objects, so the value is coerced
    to the interpreter's native ``str`` (UTF-8 bytes on Python 2) before it
    reaches the writer.
    """
    if isinstance(value, bytes):
        if not _NATIVE_STR_IS_BYTES:
            value = value.decode('utf-8')
    elif _NATIVE_STR_IS_BYTES:
        value = value.encode('utf-8')
    # 0x20 never occurs inside a multi-byte UTF-8 sequence, so replacing on
    # the encoded form is safe.
    return value.replace(' ', '_')


def _format_row(record):
    """Turn one result dict (keyed by column name) into a CSV row list."""
    return [
        _csv_text(record['img_name']),
        record['img_timestamp'],
        record['img_user'],
        _csv_text(record['img_user_text']),
        record['img_size'],
        record['img_width'],
        record['img_height'],
    ]


def _export_query(conn, query, writer):
    """Run *query* on *conn* and append every resulting row to *writer*.

    Rows are fetched and written ``_BATCH_SIZE`` at a time instead of one
    fetch_row() call per row. The running row count is printed after each
    full batch, which is the same "every 10000 rows" progress output the
    row-at-a-time loop produced.
    """
    conn.query(query)
    result = conn.store_result()
    written = 0
    while True:
        # how=1 returns each row as a dict keyed by column name.
        batch = result.fetch_row(maxrows=_BATCH_SIZE, how=1)
        if not batch:
            break
        writer.writerows([_format_row(record) for record in batch])
        written += len(batch)
        if written % _BATCH_SIZE == 0:
            print(written)


def main():
    """Dump one year of Wikimedia Commons upload metadata to a CSV file.

    Usage: ``commonssql.py YEAR``

    Writes ``commonssql-YEAR.csv`` in the current directory: a
    ``|``-separated header line followed by one row per file version
    returned by the queries from ``_build_queries``.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: %s YEAR' % sys.argv[0])
    year = int(sys.argv[1])
    filename = 'commonssql-%s.csv' % (year)

    # A single handle for header and rows; the with-block guarantees the
    # final buffered rows are flushed and the file is closed.
    with open(filename, 'w') as output:
        # Header is written by hand so it keeps its plain '\n' terminator.
        output.write('|'.join(_FIELDS) + '\n')
        writer = csv.writer(output, delimiter='|', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        conn = MySQLdb.connect(host='s4.labsdb', db='commonswiki_p',
                               read_default_file='~/replica.my.cnf',
                               use_unicode=True)
        try:
            for query in _build_queries(year):
                _export_query(conn, query, writer)
        finally:
            conn.close()


if __name__ == '__main__':
    main()