2
0
mirror of https://github.com/WikiTeam/wikiteam synced 2024-11-12 07:12:41 +00:00

utf8 latin1

This commit is contained in:
emijrp 2018-05-20 20:36:08 +02:00
parent 3a56037279
commit 3b0d4fef5e

View File

@ -183,14 +183,20 @@ def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
    """Download the wiki logo referenced in the saved main page.

    Reads ``<wikidomain>/index.html``, looks for the first
    ``WikiLogo WikiElement`` image and hands its URL to ``saveURL``.

    Args:
        wikidomain: Directory that holds the downloaded ``index.html``.
        wikiurl: Unused here; kept so the signature matches the other
            download helpers.
        overwrite: Forwarded unchanged to ``saveURL``.

    Returns:
        The logo file name (last path segment of the logo URL), or ``''``
        when the index file does not exist or contains no logo markup.
    """
    index = '%s/index.html' % (wikidomain)
    if os.path.exists(index):
        raw = ''
        try:
            with open(index, 'r', encoding='utf-8') as f:
                raw = f.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: latin-1 maps every byte to a code point, so
            # this second read cannot fail on decoding. Only decoding errors
            # trigger the fallback; other failures propagate to the caller.
            with open(index, 'r', encoding='latin-1') as f:
                raw = f.read()
        m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', raw)
        if m:
            logourl = m[0]
            logofilename = logourl.split('/')[-1]
            print('Downloading logo')
            saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
            return logofilename
    return ''
def printhelp():
@ -266,7 +272,7 @@ def main():
except:
time.sleep(10)
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
if iahtml and not re.findall(r'Item cannot be found', iahtml):
if iahtml and not re.findall(r'(?im)Item cannot be found', iahtml):
if not overwriteia:
print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
print('You can find it in https://archive.org/details/%s' % (itemid))
@ -305,9 +311,14 @@ def main():
if not os.path.exists(indexfilename):
print('\nError dump incomplete, skipping upload\n')
continue
f = open(indexfilename, 'r')
indexhtml = f.read()
f.close()
indexhtml = ''
try:
with open(indexfilename, 'r', encoding='utf-8') as f:
indexhtml = f.read()
except:
with open(indexfilename, 'r', encoding='latin-1') as f:
indexhtml = f.read()
wikititle = ''
try:
wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()