@ -417,7 +417,7 @@ def getImageFilenamesURL(config={}):
images = [ ]
offset = ' 29990101000000 ' #january 1, 2999
while offset :
url = ' %s ?title=Special:Imagelist&limit=500 &offset=%s ' % ( config [ ' index ' ] , offset ) #5000 overload some servers
url = ' %s ?title=Special:Imagelist&limit=500 0 &offset=%s ' % ( config [ ' index ' ] , offset ) #5000 overload some servers , but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
#print url
raw = urllib . urlopen ( url ) . read ( )
raw = cleanHTML ( raw )
@ -428,6 +428,9 @@ def getImageFilenamesURL(config={}):
r_images2 = r ' (?im)<td class= " TablePager_col_links " ><a href[^>]+title= " [^:>]+:(?P<filename>[^>]+) " >[^<]+</a>[^<]+<a href= " (?P<url>[^>]+/[^>/]+) " >[^<]+</a></td> \ s*<td class= " TablePager_col_img_timestamp " >[^<]+</td> \ s*<td class= " TablePager_col_img_name " >[^<]+</td> \ s*<td class= " TablePager_col_img_user_text " ><a[^>]+>(?P<uploader>[^<]+)</a></td> '
#gentoowiki 1.18 <tr><td class="TablePager_col_img_timestamp">18:15, 3 April 2011</td><td class="TablePager_col_img_name"><a href="/wiki/File:Asus_eeepc-1201nl.png" title="File:Asus eeepc-1201nl.png">Asus eeepc-1201nl.png</a> (<a href="/w/images/2/2b/Asus_eeepc-1201nl.png">file</a>)</td><td class="TablePager_col_thumb"><a href="/wiki/File:Asus_eeepc-1201nl.png" class="image"><img alt="" src="/w/images/thumb/2/2b/Asus_eeepc-1201nl.png/180px-Asus_eeepc-1201nl.png" width="180" height="225" /></a></td><td class="TablePager_col_img_size">37 KB</td><td class="TablePager_col_img_user_text"><a href="/w/index.php?title=User:Yannails&action=edit&redlink=1" class="new" title="User:Yannails (page does not exist)">Yannails</a></td><td class="TablePager_col_img_description"> </td><td class="TablePager_col_count">1</td></tr>
r_images3 = r ' (?im)<td class= " TablePager_col_img_name " ><a[^>]+title= " [^:>]+:(?P<filename>[^>]+) " >[^<]+</a>[^<]+<a href= " (?P<url>[^>]+) " >[^<]+</a>[^<]+</td><td class= " TablePager_col_thumb " ><a[^>]+><img[^>]+></a></td><td class= " TablePager_col_img_size " >[^<]+</td><td class= " TablePager_col_img_user_text " ><a[^>]+>(?P<uploader>[^<]+)</a></td> '
#http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
#(<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
r_images4 = r ' (?im)<a href=[^>]+ title= " [^:>]+:(?P<filename>[^>]+) " >[^<]+</a>[^<]+<a href= " (?P<url>[^>]+) " >[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a> '
m = [ ]
#different mediawiki versions
if re . search ( r_images1 , raw ) :
@ -436,6 +439,8 @@ def getImageFilenamesURL(config={}):
m = re . compile ( r_images2 ) . finditer ( raw )
elif re . search ( r_images3 , raw ) :
m = re . compile ( r_images3 ) . finditer ( raw )
elif re . search ( r_images4 , raw ) :
m = re . compile ( r_images4 ) . finditer ( raw )
for i in m :
url = i . group ( ' url ' )