mirror of
https://github.com/WikiTeam/wikiteam
synced 2024-11-12 07:12:41 +00:00
bug invalid content, redownload
This commit is contained in:
parent
7280c89b3b
commit
8c30b3a2b9
@ -31,7 +31,7 @@ import urllib.request
|
||||
# zip command (apt-get install zip)
|
||||
# ia command (pip install internetarchive, and configured properly)
|
||||
|
||||
def saveURL(wikidomain='', url='', filename='', path='', overwrite=False):
|
||||
def saveURL(wikidomain='', url='', filename='', path='', overwrite=False, iteration=1):
|
||||
filename2 = '%s/%s' % (wikidomain, filename)
|
||||
if path:
|
||||
filename2 = '%s/%s/%s' % (wikidomain, path, filename)
|
||||
@ -57,6 +57,18 @@ def saveURL(wikidomain='', url='', filename='', path='', overwrite=False):
|
||||
except:
|
||||
sleep = sleep * 2
|
||||
print('Download failed')
|
||||
|
||||
# sometimes wikispaces returns invalid data; redownload in that case
|
||||
if os.path.exists(filename2) and \
|
||||
filename2.split('.')[-1].lower() in ['csv', 'html', 'wikitext', 'xml']:
|
||||
sleep2 = 60 * iteration
|
||||
raw = ''
|
||||
with open(filename2, 'r') as f:
|
||||
raw = f.read()
|
||||
if re.findall(r'(?im)<title>TES and THE Status</title>', raw):
|
||||
print('Warning: invalid content. Waiting %d seconds and re-downloading' % (sleep2))
|
||||
time.sleep(sleep2)
|
||||
saveURL(wikidomain=wikidomain, url=url, filename=filename, path=path, overwrite=overwrite, iteration=iteration+1)
|
||||
|
||||
def undoHTMLEntities(text=''):
|
||||
""" Undo some HTML codes """
|
||||
|
Loading…
Reference in New Issue
Block a user