Fixing #127 and #134: now works with APIs that return a 'name' field for images and with those that don't (in that case the filename is obtained by unquoting the ASCII-encoded URL); also fixing a bug that re-downloaded the image list even when it had already been completed in a previous session

pull/137/head
Emilio J. Rodríguez-Posada 10 years ago
parent 005de23c1d
commit d4eed1f738

@ -552,8 +552,8 @@ def saveImageFilenamesURL(config={}, images=[]):
imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date']) imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w') imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
imagesfile.write('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images])) output = u"%s\n--END--" % (u'\n'.join([u'%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]))
imagesfile.write('\n--END--') imagesfile.write(output.encode('utf-8'))
imagesfile.close() imagesfile.close()
print 'Image filenames and URLs saved at...', imagesfilename print 'Image filenames and URLs saved at...', imagesfilename
@ -572,9 +572,9 @@ def getImageFilenamesURL(config={}):
req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
f = urllib2.urlopen(req) f = urllib2.urlopen(req)
if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() raw = unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8')
else: else:
raw = f.read() raw = unicode(f.read(), 'utf-8')
f.close() f.close()
delay(config=config) delay(config=config)
if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
@ -618,7 +618,7 @@ def getImageFilenamesURL(config={}):
if url[0] == '/': #slash is added later if url[0] == '/': #slash is added later
url = url[1:] url = url[1:]
domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
url = undoHTMLEntities(text=url) url = undoHTMLEntities(text=url)
#url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars
url = re.sub(' ', '_', url) url = re.sub(' ', '_', url)
@ -669,17 +669,17 @@ def getImageFilenamesURLAPI(config={}):
print 'Please, resume the dump, --resume' print 'Please, resume the dump, --resume'
sys.exit() sys.exit()
if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
jsonimages = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()) jsonimages = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
else: else:
jsonimages = json.loads(f.read()) jsonimages = json.loads(unicode(f.read(), 'utf-8'))
f.close() f.close()
print jsonimages #print jsonimages
delay(config=config) delay(config=config)
aifrom = '' aifrom = ''
if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'): if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'):
if jsontitles['query-continue']['allimages'].has_key('aicontinue'): if jsonimages['query-continue']['allimages'].has_key('aicontinue'):
aifrom = jsonimages['query-continue']['allimages']['aicontinue'] aifrom = jsonimages['query-continue']['allimages']['aicontinue']
elif jsontitles['query-continue']['allimages'].has_key('aifrom'): elif jsonimages['query-continue']['allimages'].has_key('aifrom'):
aifrom = jsonimages['query-continue']['allimages']['aifrom'] aifrom = jsonimages['query-continue']['allimages']['aifrom']
#print aifrom #print aifrom
@ -689,9 +689,13 @@ def getImageFilenamesURLAPI(config={}):
if url[0] == '/': #slash is added later if url[0] == '/': #slash is added later
url = url[1:] url = url[1:]
domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
url = re.sub(' ', '_', url) url = re.sub(' ', '_', url)
filename = re.sub('_', ' ', url.split('/')[-1]) if image.has_key('name'):
filename = re.sub('_', ' ', image['name'])
else:
#some tips http://stackoverflow.com/questions/5139249/python-url-unquote-unicode
filename = re.sub('_', ' ', unicode(urllib2.unquote(url.encode('ascii')).split('/')[-1], 'utf-8'))
uploader = re.sub('_', ' ', image['user']) uploader = re.sub('_', ' ', image['user'])
images.append([filename, url, uploader]) images.append([filename, url, uploader])
@ -750,11 +754,12 @@ def generateImageDump(config={}, other={}, images=[], start=''):
class URLopenerUserAgent(urllib.FancyURLopener): class URLopenerUserAgent(urllib.FancyURLopener):
version = "%s" % getUserAgent() version = "%s" % getUserAgent()
urllib._urlopener = URLopenerUserAgent() urllib._urlopener = URLopenerUserAgent()
urllib.urlretrieve(url=url, filename='%s/%s' % (imagepath, filename2) ) filename3 = u'%s/%s' % (imagepath, filename2)
urllib.urlretrieve(url=url, filename=filename3.encode('utf-8'))
# TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works? # TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works?
#saving description if any #saving description if any
xmlfiledesc = getXMLFileDesc(config=config, title='Image:%s' % (filename)) # use Image: for backwards compatibility xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename)) # use Image: for backwards compatibility
f = open('%s/%s.desc' % (imagepath, filename2), 'w') f = open('%s/%s.desc' % (imagepath, filename2), 'w')
if not re.search(r'</mediawiki>', xmlfiledesc): #<text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text> if not re.search(r'</mediawiki>', xmlfiledesc): #<text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
#failure when retrieving desc? then save it as empty .desc #failure when retrieving desc? then save it as empty .desc
@ -1185,7 +1190,7 @@ def resumePreviousDump(config={}, other={}):
lastimage = '' lastimage = ''
try: try:
f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r') f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
raw = f.read() raw = unicode(f.read(), 'utf-8').strip()
lines = raw.split('\n') lines = raw.split('\n')
for l in lines: for l in lines:
if re.search(r'\t', l): if re.search(r'\t', l):
@ -1194,7 +1199,7 @@ def resumePreviousDump(config={}, other={}):
f.close() f.close()
except: except:
pass #probably file doesnot exists pass #probably file doesnot exists
if lastimage == '--END--': if lastimage == u'--END--':
print 'Image list was completed in the previous session' print 'Image list was completed in the previous session'
else: else:
print 'Image list is incomplete. Reloading...' print 'Image list is incomplete. Reloading...'

Loading…
Cancel
Save