2
0
mirror of https://github.com/WikiTeam/wikiteam synced 2024-11-15 00:15:00 +00:00
wikiteam/wikimediacommons/commons-update-status.py
2016-08-03 16:45:54 +02:00

66 lines
2.7 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf8 -*-
# Copyright (C) 2012-2016 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import json
import urllib
# Archive.org advanced-search query that lists the identifiers of the
# "wikimediacommons" collection (first page of 1000 rows, JSONP-wrapped).
_QUERY_URL = 'https://archive.org/advancedsearch.php?q=collection%3Awikimediacommons&fl[]=identifier&sort[]=&sort[]=&sort[]=&rows=1000&page=1&output=json&callback=callback'


def _fetch_docs(queryurl):
    """Download the search result at *queryurl* and return its ``docs`` list.

    Raises whatever ``urlopen`` / ``json.loads`` raise on network or parse
    failure, and ``KeyError`` if the response lacks ``response.docs``.
    """
    # Imported here because a bare ``import urllib`` does not load the
    # ``urllib.request`` submodule on Python 3 (``urllib.urlopen`` is the
    # Python 2 spelling and no longer exists).
    from urllib.request import urlopen

    # ``with`` closes the HTTP response even if reading fails.
    with urlopen(queryurl) as response:
        text = response.read().decode('utf-8')
    # The query asks for JSONP, so the JSON arrives as ``callback(...)``.
    # Unwrap it, tolerating surrounding whitespace and a plain JSON body.
    _, wrapper, tail = text.partition('callback(')
    payload = tail.strip().rstrip(')') if wrapper else text
    return json.loads(payload)['response']['docs']


def _normalise_date(date):
    """Return *date* as YYYY, YYYY-MM or YYYY-MM-DD, or None if not understood.

    Accepted inputs are YYYY, YYYYMM, YYYYMMDD, YYYY-MM and YYYY-MM-DD; only
    the length of each ``-``-separated part is checked.
    """
    lengths = [len(part) for part in date.split('-')]
    if lengths in ([4], [4, 2], [4, 2, 2]):
        return date  # already in the dashed form
    if lengths == [6]:  # YYYYMM
        return '%s-%s' % (date[:4], date[4:6])
    if lengths == [8]:  # YYYYMMDD
        return '%s-%s-%s' % (date[:4], date[4:6], date[6:8])
    return None


def main(queryurl=_QUERY_URL):
    """Print a wikitable of the dated wikimediacommons items on archive.org.

    queryurl -- URL returning the (optionally ``callback(...)``-wrapped) JSON
                search result; defaults to the wikimediacommons collection
                query, so ``main()`` behaves as before.

    Identifiers containing ``wikimediacommons-20`` are keyed by their
    normalised date; when two identifiers map to the same date the later one
    in the result wins. Identifiers whose date part is not understood are
    reported with an ERROR line on stdout and left out of the table.
    """
    identifiers = {}
    for item in _fetch_docs(queryurl):
        identifier = item['identifier']
        if 'wikimediacommons-20' not in identifier:
            continue
        date = _normalise_date(identifier.split('wikimediacommons-')[1])
        if date is None:
            # Covers every unrecognised shape, including dates with more
            # than three dash-separated parts.
            print('ERROR, dont understand date format in %s' % (identifier))
        else:
            identifiers[date] = identifier

    rows = ["|-\n| %s || [https://archive.org/details/%s %s] || ??? || ???" % (k, v, v)
            for k, v in sorted(identifiers.items())]
    output = """
{| class="wikitable sortable"
! Date !! Identifier !! Files !! Size (GB)
%s
|}""" % ('\n'.join(rows))
    print(output)
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()