Add the uploader name to each entry of the image list

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@31 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
pull/117/head
emijrp 13 years ago
parent 195ffa437d
commit 2be584acba

@ -255,7 +255,7 @@ def saveImageFilenamesURL(config={}, images=[]):
#save list of images and their urls #save list of images and their urls
imagesfilename = '%s-%s-images.txt' % (domain2prefix(domain=config['domain']), config['date']) imagesfilename = '%s-%s-images.txt' % (domain2prefix(domain=config['domain']), config['date'])
imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w') imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
imagesfile.write('\n'.join(['%s\t%s' % (filename, url) for filename, url in images])) imagesfile.write('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]))
imagesfile.write('\n--END--') imagesfile.write('\n--END--')
imagesfile.close() imagesfile.close()
print 'Image filenames and URLs saved at...', imagesfilename print 'Image filenames and URLs saved at...', imagesfilename
@ -270,8 +270,9 @@ def getImageFilenamesURL(config={}, start='!'):
url = '%s?title=Special:Imagelist&limit=5000&offset=%s' % (config['domain'], offset) url = '%s?title=Special:Imagelist&limit=5000&offset=%s' % (config['domain'], offset)
raw = urllib.urlopen(url).read() raw = urllib.urlopen(url).read()
raw = cleanHTML(raw) raw = cleanHTML(raw)
#<td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td> #archiveteam <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
m = re.compile(r'(?i)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>').finditer(raw) #wikanda <td class="TablePager_col_img_user_text"><a href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1" class="new" title="Usuario:Fernandocg (página no existe)">Fernandocg</a></td>
m = re.compile(r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>').finditer(raw)
for i in m: for i in m:
url = i.group('url') url = i.group('url')
if url[0] == '/': #relative URL if url[0] == '/': #relative URL
@ -282,7 +283,8 @@ def getImageFilenamesURL(config={}, start='!'):
url = '%s%s' % (config['domain'].split('/index.php')[0], url) url = '%s%s' % (config['domain'].split('/index.php')[0], url)
filename = re.sub('_', ' ', i.group('filename')) filename = re.sub('_', ' ', i.group('filename'))
filename_ = re.sub(' ', '_', i.group('filename')) filename_ = re.sub(' ', '_', i.group('filename'))
images.append([filename, url]) uploader = re.sub('_', ' ', i.group('uploader'))
images.append([filename, url, uploader])
#print filename, url #print filename, url
if re.search(r_next, raw): if re.search(r_next, raw):
@ -311,7 +313,7 @@ def generateImageDump(config={}, images=[], start=''):
lock = True lock = True
if not start: if not start:
lock = False lock = False
for filename, url in images: for filename, url, uploader in images:
if filename == start: #start downloading from start, included if filename == start: #start downloading from start, included
lock = False lock = False
if lock: if lock:
@ -589,7 +591,7 @@ def main():
lastfilename = '' lastfilename = ''
lastfilename2 = '' lastfilename2 = ''
c = 0 c = 0
for filename, url in images: for filename, url, uploader in images:
if filename not in listdir: if filename not in listdir:
complete = False complete = False
lastfilename2 = lastfilename lastfilename2 = lastfilename

Loading…
Cancel
Save