improving image list downloader

pull/155/head
Emilio J. Rodríguez-Posada 10 years ago
parent c07b527e5d
commit 88c9468c0e

@@ -75,6 +75,8 @@ def cleanHTML(raw=''):
        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
    elif re.search('<body class=', raw):
        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
    else:
        print raw[:250]
        print 'This wiki doesn\'t use marks to split content'
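Note: the new `<body class=` branch follows the same pattern as the existing markers, keeping everything between the marker and the print footer. A minimal sketch with invented HTML (not from the patch):

import re

# Hedged sketch: how the marker-splitting fallback behaves on made-up input.
sample = '<html><body class="mediawiki ns-0"><p>Article text</p><div class="printfooter">From Example Wiki</div></body></html>'
if re.search('<body class=', sample):
    # keeps everything after the marker (including the tail of the body tag)
    # up to the print footer
    content = sample.split('<body class=')[1].split('<div class="printfooter">')[0]
    print content  # -> "mediawiki ns-0"><p>Article text</p>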
@@ -526,6 +528,30 @@ def saveImageFilenamesURL(config={}, images=[], session=None):
    print 'Image filenames and URLs saved at...', imagesfilename

def curateImageURL(config={}, url=''):
    """ Returns an absolute URL for an image, adding the domain if missing """
    if 'index' in config:
        # keep only scheme + domain (drop everything after the first / that follows the host)
        domainalone = config['index'].split('://')[0] + '://' + config['index'].split('://')[1].split('/')[0]
    elif 'api' in config:
        domainalone = config['api'].split('://')[0] + '://' + config['api'].split('://')[1].split('/')[0]
    else:
        print 'ERROR: no index nor API'
        sys.exit()
    if url.startswith('//'): # Orain wikifarm returns URLs starting with //
        url = u'%s:%s' % (domainalone.split('://')[0], url)
    elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): # is it a relative URL?
        if url[0] == '/': # slash is added later
            url = url[1:]
        url = u'%s/%s' % (domainalone, url) # concat http(s) + domain + relative url
    url = undoHTMLEntities(text=url)
    #url = urllib.unquote(url) # do not use unquote on the URL; it breaks some URLs with odd chars
    url = re.sub(' ', '_', url)
    return url
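For reference, a hedged sketch of how the new helper handles the three URL shapes it distinguishes (the config value and URLs below are invented examples; curateImageURL and undoHTMLEntities are assumed to be in scope):

config = {'index': 'http://wiki.example.org/w/index.php'}
print curateImageURL(config=config, url='//images.example.org/a/ab/Foo.png')
# -> http://images.example.org/a/ab/Foo.png (protocol-relative: scheme prepended)
print curateImageURL(config=config, url='/w/images/a/ab/Foo bar.png')
# -> http://wiki.example.org/w/images/a/ab/Foo_bar.png (relative: domain prepended, spaces become underscores)
print curateImageURL(config=config, url='http://images.example.org/a/ab/Foo.png')
# -> unchanged (already absolute)
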
def getImageFilenamesURL(config={}, session=None):
    """ Retrieve file list: filename, url, uploader """
@@ -564,27 +590,27 @@ def getImageFilenamesURL(config={}, session=None):
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
        m = []
        # different MediaWiki versions
        if re.search(r_images1, raw):
            m = re.compile(r_images1).finditer(raw)
        elif re.search(r_images2, raw):
            m = re.compile(r_images2).finditer(raw)
        elif re.search(r_images3, raw):
            m = re.compile(r_images3).finditer(raw)
        elif re.search(r_images4, raw):
            m = re.compile(r_images4).finditer(raw)
        r_images5 = (r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
                     '<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
                     '<td class="TablePager_col_img_size">[^<]*?</td>\s*'
                     '<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')
        # Select the regexp that returns the most results
        regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
        count = 0
        i = 0
        regexp_best = 0
        for regexp in regexps:
            if len(re.findall(regexp, raw)) > count:
                count = len(re.findall(regexp, raw))
                regexp_best = i
            i += 1
        m = re.compile(regexps[regexp_best]).finditer(raw)
        # Iterate over the image results
        for i in m:
            url = i.group('url')
            if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): # is it a relative URL?
                if url[0] == '/': # slash is added later
                    url = url[1:]
                domainalone = config['index'].split('://')[1].split('/')[0] # keep only the domain (everything up to the first / after the host)
                url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
            url = undoHTMLEntities(text=url)
            #url = urllib.unquote(url) # do not use unquote on the URL; it breaks some URLs with odd chars
            url = re.sub(' ', '_', url)
            url = curateImageURL(config=config, url=url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = undoHTMLEntities(text=filename)
            filename = urllib.unquote(filename)
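The selection loop above replaces the old first-match if/elif chain: every candidate pattern is tried against the page and the one with the most matches wins. A self-contained sketch of that idea (toy patterns and text, not from the patch):

import re

def pick_best_regexp(regexps, raw):
    # Count the matches for each candidate pattern and return the one
    # with the highest count (earlier patterns win ties).
    counts = [len(re.findall(r, raw)) for r in regexps]
    return regexps[counts.index(max(counts))]

raw = '<li>a.png</li><li>b.png</li><li>c.png</li>'
print pick_best_regexp([r'<td>(\w+\.png)</td>', r'<li>(\w+\.png)</li>'], raw)
# -> the second pattern (3 matches vs 0)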
@@ -612,6 +638,7 @@ def getImageFilenamesURLAPI(config={}, session=None):
""" Retrieve file list: filename, url, uploader """
print 'Retrieving image filenames'
oldAPI = False
aifrom = '!'
images = []
while aifrom:
@@ -622,26 +649,56 @@ def getImageFilenamesURLAPI(config={}, session=None):
        handleStatusCode(r)
        jsonimages = json.loads(r.text)
        delay(config=config, session=session)
        aifrom = ''
        if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'):
            if jsonimages['query-continue']['allimages'].has_key('aicontinue'):
                aifrom = jsonimages['query-continue']['allimages']['aicontinue']
            elif jsonimages['query-continue']['allimages'].has_key('aifrom'):
                aifrom = jsonimages['query-continue']['allimages']['aifrom']
        #print aifrom
        for image in jsonimages['query']['allimages']:
            url = image['url']
            if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): # is it a relative URL?
                if url[0] == '/': # slash is added later
                    url = url[1:]
                domainalone = config['index'].split('://')[1].split('/')[0] # keep only the domain (everything up to the first / after the host)
                url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
            url = re.sub(' ', '_', url)
            # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136
            filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii','ignore')), 'utf-8')
            uploader = re.sub('_', ' ', image['user'])
            images.append([filename, url, uploader])
        if 'query' in jsonimages:
            aifrom = ''
            if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'):
                if jsonimages['query-continue']['allimages'].has_key('aicontinue'):
                    aifrom = jsonimages['query-continue']['allimages']['aicontinue']
                elif jsonimages['query-continue']['allimages'].has_key('aifrom'):
                    aifrom = jsonimages['query-continue']['allimages']['aifrom']
            #print aifrom
            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = curateImageURL(config=config, url=url)
                # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136
                filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii','ignore')), 'utf-8')
                uploader = re.sub('_', ' ', image['user'])
                images.append([filename, url, uploader])
        else:
            oldAPI = True
            break
    if oldAPI:
        gapfrom = '!'
        images = []
        while gapfrom:
            sys.stderr.write('.') # progress
            # Some old APIs don't have the allimages query
            # In that case, use allpages (in namespace 6) as a generator for imageinfo
            # Example: http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6&gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            params = {'action': 'query', 'generator': 'allpages', 'gapnamespace': 6, 'gaplimit': 500, 'gapfrom': gapfrom, 'prop': 'imageinfo', 'iiprop': 'user|url', 'format': 'json'}
            # FIXME: handle HTTP errors here
            r = session.post(url=config['api'], data=params)
            handleStatusCode(r)
            jsonimages = json.loads(r.text)
            delay(config=config, session=session)
            if 'query' in jsonimages:
                gapfrom = ''
                if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allpages'):
                    if jsonimages['query-continue']['allpages'].has_key('gapfrom'):
                        gapfrom = jsonimages['query-continue']['allpages']['gapfrom']
                #print gapfrom
                #print jsonimages['query']
                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = curateImageURL(config=config, url=url)
                    filename = re.sub('_', ' ', ':'.join(props['title'].split(':')[1:]))
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    images.append([filename, url, uploader])
    if (len(images) == 1):
        print ' Found 1 image'
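For wikis whose API predates list=allimages, the fallback above walks the File namespace (6) with generator=allpages and asks prop=imageinfo for each file's url and user, following the gapfrom continuation token. A minimal standalone sketch of that request loop (the endpoint is a placeholder and error handling is omitted):

import requests

api_url = 'http://wiki.example.org/api.php'  # placeholder endpoint
gapfrom = '!'
while gapfrom:
    params = {'action': 'query', 'generator': 'allpages', 'gapnamespace': 6,
              'gaplimit': 500, 'gapfrom': gapfrom, 'prop': 'imageinfo',
              'iiprop': 'user|url', 'format': 'json'}
    data = requests.post(api_url, data=params).json()
    # follow the continuation token if present; an empty string ends the loop
    gapfrom = data.get('query-continue', {}).get('allpages', {}).get('gapfrom', '')
    for page in data.get('query', {}).get('pages', {}).values():
        print page['imageinfo'][0]['url'], page['imageinfo'][0]['user']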
