From 195ffa437d78816c0358b022bfab4b0a2cdc5157 Mon Sep 17 00:00:00 2001
From: emijrp
Date: Fri, 8 Apr 2011 13:39:14 +0000
Subject: [PATCH] file desc

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@30 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
---
 dumpgenerator.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 04efb04..15c90b1 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -130,6 +130,10 @@ def getXMLHeader(config={}):
     header = xml.split('</mediawiki>')[0]
     return header
 
+def getXMLFileDesc(config={}, title=''):
+    config['curonly'] = 1 #tricky to get only the most recent desc
+    return getXMLPage(config=config, title=title)
+
 def getXMLPage(config={}, title=''):
     #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
     limit = 1000
@@ -313,7 +317,23 @@ def generateImageDump(config={}, images=[], start=''):
         if lock:
             continue
         delay(config=config)
-        urllib.urlretrieve(url, '%s/%s' % (imagepath, filename))
+        #saving file
+        urllib.urlretrieve(url, '%s/%s' % (imagepath, filename))
+        #saving description if any
+        xmlfiledesc = getXMLFileDesc(config=config, title='Image:%s' % (filename))
+        f = open('%s/%s.desc' % (imagepath, filename), 'w')
+        if re.search(r'<text xml:space="preserve" />', xmlfiledesc):
+            #empty desc
+            xmlfiledesc = ''
+        elif re.search(r'<text xml:space="preserve">', xmlfiledesc):
+            xmlfiledesc = xmlfiledesc.split('<text xml:space="preserve">')[1].split('</text>')[0]
+            xmlfiledesc = re.sub('&lt;', '<', xmlfiledesc) # i guess only < > & need coversion http://www.w3schools.com/html/html_entities.asp
+            xmlfiledesc = re.sub('&gt;', '>', xmlfiledesc)
+            xmlfiledesc = re.sub('&amp;', '&', xmlfiledesc)
+        else: #failure when retrieving desc?
+            xmlfiledesc = ''
+        f.write(xmlfiledesc)
+        f.close()
         c += 1
         if c % 10 == 0:
             print ' Downloaded %d images' % (c)