2
0
mirror of https://github.com/WikiTeam/wikiteam synced 2024-11-04 12:00:28 +00:00

relative urls for images...

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@19 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
This commit is contained in:
emijrp 2011-04-07 17:28:08 +00:00
parent 161a7ee663
commit cb12252aaf

View File

@ -43,8 +43,9 @@ import urllib2
# TODO: use the API when it is available
def delay(config={}):
    """Pause between requests for the number of seconds in config['delay'].

    Parameters:
        config: dict of settings; only the 'delay' key is read. The default
            dict is never mutated here, so sharing it across calls is safe.

    Returns None. Nothing is printed and no sleep happens when the delay is
    missing, zero or negative.
    """
    # .get() keeps a bare delay() call (empty default config) from raising
    # KeyError; a missing delay simply means "do not wait".
    seconds = config.get('delay', 0)
    if seconds > 0:
        # Announce and sleep exactly once. A single parenthesised argument
        # prints identically under the Python 2 print statement and the
        # Python 3 print() function.
        print('Sleeping... %d seconds...' % (seconds))
        time.sleep(seconds)
def cleanHTML(raw=''):
if re.search('<!-- bodytext -->', raw): #<!-- bodytext --> <!-- /bodytext --> <!-- start content --> <!-- end content -->
@ -224,14 +225,20 @@ def generateImageDump(config={}):
url = '%s?title=Special:Imagelist&limit=5000&offset=%s' % (config['domain'], offset)
raw = urllib.urlopen(url).read()
raw = cleanHTML(raw)
m = re.compile(r'<a href="(?P<url>[^>]+/./../[^>]+)">[^<]+</a>').finditer(raw)
#<td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
m = re.compile(r'(?i)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>').finditer(raw)
for i in m:
url = i.group('url')
if url[0] == '/': #relative URL ZOMG!
url = '%s%s' % (config['domain'].split('/index.php')[0], url)
filename = re.sub('_', ' ', url.split('/')[-1])
filename_ = re.sub(' ', '_', url.split('/')[-1])
if url[0] == '/': #relative URL
if re.search(r'\.\./', url): #../ weird paths (see wikanda)
x = len(re.findall(r'\.\./', url)) + 1
url = '%s/%s' % ('/'.join(config['domain'].split('/')[:-x]), url.split('../')[-1])
else:
url = '%s%s' % (config['domain'].split('/index.php')[0], url)
filename = re.sub('_', ' ', i.group('filename'))
filename_ = re.sub(' ', '_', i.group('filename'))
images.append([filename, url])
print filename, url
if re.search(r_next, raw):
offset = re.findall(r_next, raw)[0]
@ -300,6 +307,7 @@ More info at: http://code.google.com/p/wikiteam/"""
def bye(config={}):
    """Print the closing messages once the dump has finished.

    Reads config['path'] (where the dump was written) and prints it,
    followed by the bug-report pointer and a goodbye line. Returns None.
    """
    farewell = (
        "Your dump is in %s" % (config['path']),
        "If you found any bug, report a new issue here (Gmail account required): http://code.google.com/p/wikiteam/issues/list",
        "Good luck! Bye!",
    )
    # One message per output line, in the same order as before.
    for message in farewell:
        print(message)
def usage():