#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
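
import csv
import os
import random
import re
import sys
import time
import urllib.parse
import urllib.request


# Reconstruction note: the original import block and the two helpers below
# were lost from this copy of the script. These versions are minimal sketches
# inferred from how the rest of the code calls them (saveURL() downloads a URL
# into <wikidomain>/<path>/<filename> with an optional overwrite flag;
# undoHTMLEntities() decodes entities), not necessarily the original
# implementations.

def saveURL(wikidomain='', url='', filename='', path='', overwrite=False):
    filename2 = '%s/%s' % (wikidomain, filename)
    if path:
        filename2 = '%s/%s/%s' % (wikidomain, path, filename)
    if os.path.exists(filename2) and not overwrite:
        print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
        return
    # use a browser-like User-Agent for the download
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlretrieve(url, filename2)
    except:
        # retry with exponential backoff before giving up
        sleep = 10
        maxsleep = 60
        while sleep <= maxsleep:
            try:
                print('Error while retrieving: %s' % (url))
                print('Retry in %d seconds...' % (sleep))
                time.sleep(sleep)
                urllib.request.urlretrieve(url, filename2)
                return
            except:
                sleep = sleep * 2
        print('Download failed')

def undoHTMLEntities(text=''):
    # undo the most common HTML entity codes
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    text = re.sub('&#039;', "'", text)
    return text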

def convertHTML2Wikitext(wikidomain='', filename='', path=''):
    # strip the HTML wrapper that wikispaces puts around the raw wikitext
    wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
    if not os.path.exists(wikitextfile):
        print('Error retrieving wikitext, page is probably a redirect')
        return
    wikitext = ''
    with open(wikitextfile, 'r') as f:
        wikitext = f.read()
    with open(wikitextfile, 'w') as f:
        # the wikitext sits inside a <pre> block in the page HTML; the HTML
        # markers in this pattern were lost from this copy and are reconstructed
        m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
        if m:
            try:
                wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
                wikitext = undoHTMLEntities(text=wikitext)
            except:
                pass
        f.write(wikitext)

def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
    pagenameplus = re.sub(' ', '+', pagename)
    pagename_ = urllib.parse.quote(pagename)

    # page current revision (html & wikitext)
    pageurl = '%s/%s' % (wikiurl, pagename_)
    filename = '%s.html' % (pagenameplus)
    print('Downloading page: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
    pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
    filename2 = '%s.wikitext' % (pagenameplus)
    print('Downloading page: %s' % (filename2))
    saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
    convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')

    # csv with page history
    csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
    csvfilename = '%s.history.csv' % (pagenameplus)
    print('Downloading page: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)

def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
    filenameplus = re.sub(' ', '+', filename)
    filename_ = urllib.parse.quote(filename)

    # file at full resolution
    fileurl = '%s/file/view/%s' % (wikiurl, filename_)
    filename = filenameplus
    print('Downloading file: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)

    # csv with file history
    csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
    csvfilename = '%s.history.csv' % (filenameplus)
    print('Downloading file: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)

def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading Pages and Files from %s' % (wikiurl))

    # csv listing all pages and files
    csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
    saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='', overwrite=overwrite)

    # download every page and file
    totallines = 0
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
        totallines = len(f.read().splitlines()) - 1
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
        filesc = 0
        pagesc = 0
        print('This wiki has %d pages and files' % (totallines))
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in rows:
            if row[0] == 'file':
                filesc += 1
                filename = row[1]
                downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
            elif row[0] == 'page':
                pagesc += 1
                pagename = row[1]
                downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
            if (filesc + pagesc) % 10 == 0:
                print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
        print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
    print('Downloaded %d pages' % (pagesc))
    print('Downloaded %d files' % (filesc))

def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading sitemap.xml')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)

def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading index.html')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)
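
# Reconstruction note: printhelp() and duckduckgo() are called from the main
# block below, but their definitions were lost from this copy of the script.
# The versions here are hedged sketches inferred from the call sites:
# printhelp() must print usage and exit, and duckduckgo() must return an
# iterable of wiki URLs. The search strategy in duckduckgo() (random numeric
# queries scoped to site:wikispaces.com) is an assumption.

def printhelp():
    helptext = """wikispaces.py: download (and optionally upload to the Internet Archive) Wikispaces wikis

Usage: python3 wikispaces.py <wikiurl|file-with-wiki-list|duckduckgo> [options]

Options:
--upload: upload compressed file with downloaded wiki
--admin: add item to WikiTeam collection (if you are an admin in that collection)
--overwrite: download again even if files exist locally
--overwrite-ia: upload again to Internet Archive even if item exists there
--help: print this help text"""
    print(helptext)
    sys.exit()

def duckduckgo():
    # discover wikispaces.com wikis by scraping DuckDuckGo HTML results
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)
    wikis = []
    for i in range(100):
        url = 'https://duckduckgo.com/html/?q=%d%%20site:wikispaces.com' % (random.randint(100, 9999))
        print('URL search', url)
        try:
            html = urllib.request.urlopen(url).read().decode('utf-8')
        except:
            print('Search error')
            time.sleep(30)
            continue
        html = urllib.parse.unquote(html)
        for wiki in re.findall(r'://([^/"]+?\.wikispaces\.com)', html):
            wiki = 'https://' + re.sub(r'^www\.', '', wiki)
            if wiki not in wikis:
                wikis.append(wiki)
        # pause between searches to avoid being rate-limited
        time.sleep(random.randint(5, 20))
    return wikis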

def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
    index = '%s/index.html' % (wikidomain)
    if os.path.exists(index):
        raw = ''
        try:
            with open(index, 'r', encoding='utf-8') as f:
                raw = f.read()
        except:
            with open(index, 'r', encoding='latin-1') as f:
                raw = f.read()
        # the <img> part of this pattern and the body below were lost from
        # this copy; both are reconstructions of the apparent intent: find
        # the logo URL in index.html and save the image alongside it
        m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', raw)
        if m:
            logourl = m[0]
            logofilename = logourl.split('/')[-1]
            print('Downloading logo')
            saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
            return logofilename
    return ''

if __name__ == "__main__":
    # the flag defaults and first sys.argv checks were lost from this copy;
    # they are reconstructed from how the flags are used below
    upload = False
    isadmin = False
    overwrite = False
    overwriteia = False
    if len(sys.argv) < 2:
        printhelp()
    param = sys.argv[1]
    if not param:
        printhelp()
    if len(sys.argv) > 2:
        if '--upload' in sys.argv:
            upload = True
        if '--admin' in sys.argv:
            isadmin = True
        if '--overwrite' in sys.argv:
            overwrite = True
        if '--overwrite-ia' in sys.argv:
            overwriteia = True
        if '--help' in sys.argv:
            printhelp()

    wikilist = []
    if '://' in param:
        wikilist.append(param.rstrip('/'))
    elif param.lower() == 'duckduckgo':
        wikilist = duckduckgo()
        #for wiki in wikilist:
        #    print(wiki)
    else:
        with open(param, 'r') as f:
            wikilist = f.read().strip().splitlines()
            wikilist2 = []
            for wiki in wikilist:
                wikilist2.append(wiki.rstrip('/'))
            wikilist = wikilist2

    for wikiurl in wikilist:
        wikidomain = wikiurl.split('://')[1].split('/')[0]
        print('\n')
        print('#'*40,'\n Downloading:', wikiurl)
        print('#'*40,'\n')

        if upload and not overwriteia:
            itemid = 'wiki-%s' % (wikidomain)
            try:
                iahtml = ''
                try:
                    iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
                except:
                    time.sleep(10)
                    iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
                if iahtml and not re.findall(r'(?im)Item cannot be found', iahtml):
                    print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
                    print('You can find it in https://archive.org/details/%s' % (itemid))
                    time.sleep(1)
                    continue
            except:
                pass

        dirfiles = '%s/files' % (wikidomain)
        if not os.path.exists(dirfiles):
            print('Creating directory %s' % (dirfiles))
            os.makedirs(dirfiles)
        dirpages = '%s/pages' % (wikidomain)
        if not os.path.exists(dirpages):
            print('Creating directory %s' % (dirpages))
            os.makedirs(dirpages)

        sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
        downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
        if not os.path.exists('%s/sitemap.xml' % (wikidomain)):
            print('Error, wiki was probably deleted. Skipping wiki...')
            continue
        else:
            sitemapraw = ''
            try:
                with open('%s/sitemap.xml' % (wikidomain), encoding='utf-8') as g:
                    sitemapraw = g.read()
            except:
                with open('%s/sitemap.xml' % (wikidomain), encoding='latin-1') as g:
                    sitemapraw = g.read()
            if re.search(r'(?im)