Add support for downloading images from old MediaWiki installations (new fallback regexp r_images4).

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@165 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
pull/117/head
emijrp 13 years ago
parent 5b7674edb7
commit 778a8ad7ae

@ -417,7 +417,7 @@ def getImageFilenamesURL(config={}):
images = []
offset = '29990101000000' #january 1, 2999
while offset:
url = '%s?title=Special:Imagelist&limit=500&offset=%s' % (config['index'], offset) #5000 overload some servers
url = '%s?title=Special:Imagelist&limit=5000&offset=%s' % (config['index'], offset) #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
#print url
raw = urllib.urlopen(url).read()
raw = cleanHTML(raw)
@ -428,6 +428,9 @@ def getImageFilenamesURL(config={}):
r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
#gentoowiki 1.18 <tr><td class="TablePager_col_img_timestamp">18:15, 3 April 2011</td><td class="TablePager_col_img_name"><a href="/wiki/File:Asus_eeepc-1201nl.png" title="File:Asus eeepc-1201nl.png">Asus eeepc-1201nl.png</a> (<a href="/w/images/2/2b/Asus_eeepc-1201nl.png">file</a>)</td><td class="TablePager_col_thumb"><a href="/wiki/File:Asus_eeepc-1201nl.png" class="image"><img alt="" src="/w/images/thumb/2/2b/Asus_eeepc-1201nl.png/180px-Asus_eeepc-1201nl.png" width="180" height="225" /></a></td><td class="TablePager_col_img_size">37 KB</td><td class="TablePager_col_img_user_text"><a href="/w/index.php?title=User:Yannails&amp;action=edit&amp;redlink=1" class="new" title="User:Yannails (page does not exist)">Yannails</a></td><td class="TablePager_col_img_description">&#160;</td><td class="TablePager_col_count">1</td></tr>
r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
#http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
#(<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
m = []
#different mediawiki versions
if re.search(r_images1, raw):
@ -436,6 +439,8 @@ def getImageFilenamesURL(config={}):
m = re.compile(r_images2).finditer(raw)
elif re.search(r_images3, raw):
m = re.compile(r_images3).finditer(raw)
elif re.search(r_images4, raw):
m = re.compile(r_images4).finditer(raw)
for i in m:
url = i.group('url')

@ -80,3 +80,6 @@ http://wiki.frema.ecs.soton.ac.uk/api.php
http://wikitravel.org/wiki/hu/api.php
http://wiki.freeculture.org/api.php
http://loprometidoesdeuda.com/api.php
http://es.tanatopedia.net/api.php
http://www.rezeptewiki.org/api.php
http://www.tarracowiki.cat/tarracowiki/api.php

Loading…
Cancel
Save