From 2be584acbaa5d12bfbae87ebaf11dd2811edff04 Mon Sep 17 00:00:00 2001 From: emijrp Date: Fri, 8 Apr 2011 14:57:36 +0000 Subject: [PATCH] adding uploader name to image list git-svn-id: https://wikiteam.googlecode.com/svn/trunk@31 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 --- dumpgenerator.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 15c90b1..c903821 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -255,7 +255,7 @@ def saveImageFilenamesURL(config={}, images=[]): #save list of images and their urls imagesfilename = '%s-%s-images.txt' % (domain2prefix(domain=config['domain']), config['date']) imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w') - imagesfile.write('\n'.join(['%s\t%s' % (filename, url) for filename, url in images])) + imagesfile.write('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images])) imagesfile.write('\n--END--') imagesfile.close() print 'Image filenames and URLs saved at...', imagesfilename @@ -270,8 +270,9 @@ def getImageFilenamesURL(config={}, start='!'): url = '%s?title=Special:Imagelist&limit=5000&offset=%s' % (config['domain'], offset) raw = urllib.urlopen(url).read() raw = cleanHTML(raw) - #Yahoovideo.jpg (file) - m = re.compile(r'(?i)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+').finditer(raw) + #archiveteam Yahoovideo.jpg (file) + #wikanda Fernandocg + m = re.compile(r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)').finditer(raw) for i in m: url = i.group('url') if url[0] == '/': #relative URL @@ -282,7 +283,8 @@ def getImageFilenamesURL(config={}, start='!'): url = '%s%s' % (config['domain'].split('/index.php')[0], url) filename = re.sub('_', ' ', i.group('filename')) filename_ = re.sub(' ', '_', i.group('filename')) - images.append([filename, url]) + uploader = re.sub('_', ' ', i.group('uploader')) + images.append([filename, url, uploader]) #print filename, url if re.search(r_next, raw): @@ -311,7 +313,7 @@ def generateImageDump(config={}, images=[], start=''): lock = True if not start: lock = False - for filename, url in images: + for filename, url, uploader in images: if filename == start: #start downloading from start, included lock = False if lock: @@ -589,7 +591,7 @@ def main(): lastfilename = '' lastfilename2 = '' c = 0 - for filename, url in images: + for filename, url, uploader in images: if filename not in listdir: complete = False lastfilename2 = lastfilename