mirror of
https://github.com/WikiTeam/wikiteam
synced 2024-11-04 12:00:28 +00:00
relative urls for images...
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@19 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
This commit is contained in:
parent
161a7ee663
commit
cb12252aaf
@ -43,8 +43,9 @@ import urllib2
|
||||
# fix use api when available
|
||||
|
||||
def delay(config={}):
|
||||
print 'Sleeping... %d seconds...' % (config['delay'])
|
||||
time.sleep(config['delay'])
|
||||
if config['delay'] > 0:
|
||||
print 'Sleeping... %d seconds...' % (config['delay'])
|
||||
time.sleep(config['delay'])
|
||||
|
||||
def cleanHTML(raw=''):
|
||||
if re.search('<!-- bodytext -->', raw): #<!-- bodytext --> <!-- /bodytext --> <!-- start content --> <!-- end content -->
|
||||
@ -224,14 +225,20 @@ def generateImageDump(config={}):
|
||||
url = '%s?title=Special:Imagelist&limit=5000&offset=%s' % (config['domain'], offset)
|
||||
raw = urllib.urlopen(url).read()
|
||||
raw = cleanHTML(raw)
|
||||
m = re.compile(r'<a href="(?P<url>[^>]+/./../[^>]+)">[^<]+</a>').finditer(raw)
|
||||
#<td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
|
||||
m = re.compile(r'(?i)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>').finditer(raw)
|
||||
for i in m:
|
||||
url = i.group('url')
|
||||
if url[0] == '/': #relative URL ZOMG!
|
||||
url = '%s%s' % (config['domain'].split('/index.php')[0], url)
|
||||
filename = re.sub('_', ' ', url.split('/')[-1])
|
||||
filename_ = re.sub(' ', '_', url.split('/')[-1])
|
||||
if url[0] == '/': #relative URL
|
||||
if re.search(r'\.\./', url): #../ weird paths (see wikanda)
|
||||
x = len(re.findall(r'\.\./', url)) + 1
|
||||
url = '%s/%s' % ('/'.join(config['domain'].split('/')[:-x]), url.split('../')[-1])
|
||||
else:
|
||||
url = '%s%s' % (config['domain'].split('/index.php')[0], url)
|
||||
filename = re.sub('_', ' ', i.group('filename'))
|
||||
filename_ = re.sub(' ', '_', i.group('filename'))
|
||||
images.append([filename, url])
|
||||
print filename, url
|
||||
|
||||
if re.search(r_next, raw):
|
||||
offset = re.findall(r_next, raw)[0]
|
||||
@ -300,6 +307,7 @@ More info at: http://code.google.com/p/wikiteam/"""
|
||||
|
||||
def bye(config={}):
|
||||
print "Your dump is in %s" % (config['path'])
|
||||
print "If you found any bug, report a new issue here (Gmail account required): http://code.google.com/p/wikiteam/issues/list"
|
||||
print "Good luck! Bye!"
|
||||
|
||||
def usage():
|
||||
|
Loading…
Reference in New Issue
Block a user