Wikimedia Commons downloader

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@335 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
pull/117/head
emijrp 12 years ago
parent a16a121fbc
commit 6b0d2ec64d

@ -23,6 +23,7 @@ import re
import sys
filename = 'commonssql.csv'
filename = 'a.csv'
startdate = ''
enddate = ''
delta = datetime.timedelta(days=1)
@ -61,6 +62,14 @@ while startdate <= enddate:
md5_ = md5.new(re.sub(' ', '_', original_name.encode("utf-8"))).hexdigest() # do not use img_name_, md5 needs the original name without \"
if original_name != img_name:
os.system('wget -c "http://upload.wikimedia.org/wikipedia/commons/archive/%s/%s/%s" -O "%s/%s"' % (md5_[0], md5_[0:2], img_name_, path, img_name_))
if not os.path.getsize('%s/%s' % (path, img_name_)): #empty file, false XXXXXX! beginning? restore original_name to ! version
print 'NOO'
#recalculate md5 and other variables that use original_name as source
original_name = img_name
original_name_ = re.sub(r'"', r'\"', re.sub(r' ', r'_', original_name.encode('utf-8')))
md5_ = md5.new(re.sub(' ', '_', original_name.encode("utf-8"))).hexdigest()
#redownload, now without /archive/ subpath
os.system('wget -c "http://upload.wikimedia.org/wikipedia/commons/%s/%s/%s" -O "%s/%s"' % (md5_[0], md5_[0:2], img_name_, path, img_name_))
else:
os.system('wget -c "http://upload.wikimedia.org/wikipedia/commons/%s/%s/%s" -O "%s/%s"' % (md5_[0], md5_[0:2], img_name_, path, img_name_))
os.system('curl -d "&pages=File:%s&history=1&action=submit" http://commons.wikimedia.org/w/index.php?title=Special:Export -o "%s/%s.desc"' % (original_name_, path, img_name_))

Loading…
Cancel
Save