wikimedia commons downloader

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@329 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
2024-11-12 07:12:41 +00:00 · 2012-02-27 15:39:55 +00:00 · 2012-02-27 15:39:55 +00:00 · b80d988831
commit b80d988831
parent dd0bcf5d8f
3 changed files with 85 additions and 30 deletions
--- a/commonsdownloader.py
+++ b/commonsdownloader.py
@ -1,7 +1,7 @@
 #!/usr/bin/python
 # -*- coding: utf8 -*-

-# Copyright (C) 2011 WikiTeam
+# Copyright (C) 2012 WikiTeam
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
@ -15,35 +15,32 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

-import urllib
-import sys
+import csv
+import datetime
+import md5
+import os
 import re
-import codecs
+import sys

-"""
-recibe un argumento: 2005 (baja todo el año), 2005-01 (todo el mes), 2005-01-01 (un solo día), pero siempre organiza en directorios 2005 |_ 2005-01 |_ 2005-01-01, etc
+startdate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d')  # first CLI argument: first day to process (YYYY-MM-DD)
+enddate = datetime.datetime.strptime(sys.argv[2], '%Y-%m-%d')  # second CLI argument: last day to process, inclusive (YYYY-MM-DD)
+delta = datetime.timedelta(days=1)  # the scan loop advances startdate by this one-day step
+filename = 'commonssql.csv'  # pipe-delimited listing read by the scan loop; commonssql.py writes a file of this name

-http://www.mediawiki.org/wiki/Manual:Oldimage_table
+while startdate <= enddate:  # one pass per calendar day, end date included
+    print '==', startdate.strftime('%Y-%m-%d'), '=='  # banner for the day being processed
+    c = 1  # 1-based row counter; row 1 is skipped below as the CSV header
+    f = csv.reader(open(filename, 'r'), delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)  # NOTE(review): the whole CSV is reopened and rescanned for every day, and the handle is never closed
+    for img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height in f:  # every row must have exactly 7 fields or the unpacking raises ValueError
+        if c != 1:  # skip the header row
+            img_name = unicode(img_name, 'utf-8')  # decode the UTF-8 bytes from the CSV (Python 2 unicode)
+            img_user_text = unicode(img_user_text, 'utf-8')  # decoded but not used further in this loop
+            if img_timestamp.startswith(startdate.strftime('%Y%m%d')):  # keep rows whose timestamp begins with the current day as YYYYMMDD
+                print img_name.encode('utf-8'), img_timestamp
+                img_name_ = re.sub('"', '\"', img_name)  # NOTE(review): '\"' is the same string as '"', so this leaves the name unchanged; an escaping backslash was presumably intended
+                md5_ = md5.new(img_name.encode('utf-8')).hexdigest()  # hex MD5 of the UTF-8 name; its first one and two characters fill the path in the disabled wget command below
+                #os.system('wget -c "http://upload.wikimedia.org/wikipedia/commons/%s/%s/%s" -O "%s' % (md5_[0], md5_[0:2], img_name, img_name_))
+                #os.system('curl -d "&pages=File:%s&history=1&action=submit" http://commons.wikimedia.org/w/index.php?title=Special:Export -o "%s.desc' % (img_name_, img_name_))
+        c += 1  # count every row, header included
+    startdate += delta  # move on to the next day

-substr para el X y XY del md5 http://dev.mysql.com/doc/refman/5.0/en/string-functions.html#function_substr
-
-mysql -h commons-p.db.toolserver.org -e "use commonswiki_p;select oi_archive_name, oi_timestamp from oldimage where 1;" > test.txt
-"""
-
-def getUserAgent():
-    """ Return a cool user-agent to hide Python user-agent """
-    useragents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4']
-    return useragents[0]
-
-class AppURLopener(urllib.FancyURLopener):
-    version = getUserAgent()
-
-urllib._urlopener = AppURLopener()
-
-f = open('image.list', 'r')
-l = f.read().splitlines()
-f.close()
-
-for i in l:
-    i=urllib.unquote(i)
-    urllib.urlretrieve(i, i.split('/')[-1])
--- a/commonssql.py
+++ b/commonssql.py
@ -0,0 +1,59 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+# Copyright (C) 2012 WikiTeam
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import csv
+import MySQLdb
+import re
+
+filename = 'commonssql.csv'  # output listing; commonsdownloader.py reads a file of this name
+f = open(filename, 'w')  # 'w' mode truncates any previous export
+f.write('img_name|img_timestamp|img_user|img_user_text|img_size|img_width|img_height\n')  # header row naming the 7 pipe-separated columns
+f.close()  # closed here; the file is reopened in append mode for the data rows
+
+#http://www.mediawiki.org/wiki/Manual:Image_table
+#http://www.mediawiki.org/wiki/Manual:Oldimage_table
+queries = [  # run in order; both select the same 7 column names so their rows share one CSV layout
+    "SELECT /* commonssql.py SLOW_OK */ img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height FROM image WHERE img_timestamp<20070101000000 ORDER BY img_timestamp ASC",  # rows of the image table with img_timestamp before 2007-01-01, oldest first
+    "SELECT /* commonssql.py SLOW_OK */ oi_archive_name AS img_name, oi_timestamp AS img_timestamp, oi_user AS img_user, oi_user_text AS img_user_text, oi_size AS img_size, oi_width AS img_width, oi_height AS img_height FROM oldimage WHERE oi_deleted=0 AND oi_timestamp<20070101000000 ORDER BY oi_timestamp ASC" # oldimage rows aliased to the img_* names; oi_deleted=0 so unavailable images are not fetched
+]
+
+f = csv.writer(open(filename, 'a'), delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)  # append data rows after the header; NOTE(review): the underlying file handle is never explicitly closed
+conn = MySQLdb.connect(host='sql-s4', db='commonswiki_p', read_default_file='~/.my.cnf')  # presumably credentials come from ~/.my.cnf - verify on the host running this
+for query in queries:
+    conn.query(query)
+    r = conn.store_result()  # NOTE(review): store_result() is documented as buffering the full result set client-side - confirm memory use is acceptable
+    c = 0  # rows collected so far for this query
+    row = r.fetch_row(maxrows=1, how=1)  # fetch a single row; it is indexed by column name below
+    rows = []  # batch of rows waiting to be written
+    while row:  # stops when fetch_row returns an empty result
+        if len(row) == 1:
+            img_name = re.sub(u' ', u'_', unicode(row[0]['img_name'], 'utf-8'))  # decode UTF-8 and replace spaces with underscores
+            img_timestamp = row[0]['img_timestamp']
+            img_user = row[0]['img_user']
+            img_user_text = re.sub(u' ', u'_', unicode(row[0]['img_user_text'], 'utf-8'))  # same normalisation for the user name
+            img_size = row[0]['img_size']
+            img_width = row[0]['img_width']
+            img_height = row[0]['img_height']
+            
+            rows.append([img_name.encode('utf-8'), img_timestamp, img_user, img_user_text.encode('utf-8'), img_size, img_width, img_height])  # re-encode text fields to UTF-8 bytes; column order matches the header row
+            c += 1
+            if c % 10000 == 0:  # write out and reset the batch every 10000 rows
+                print c  # progress indicator
+                f.writerows(rows)
+                rows = []
+        row = r.fetch_row(maxrows=1, how=1)  # next row, fetched the same way
+    f.writerows(rows)  # write whatever remains of the final partial batch for this query
--- a/editthis/editthis.info
+++ b/editthis/editthis.info
@ -1,4 +1,3 @@
-http://www.editthis.info/1337
 http://www.editthis.info/1742_
 http://www.editthis.info/1t0meds
 http://www.editthis.info/20803