Keep rewriting code for Python 3 and splitting it into modules

pull/287/head
emijrp 8 years ago
parent 6caf98415c
commit 96c86080d0

@@ -24,13 +24,50 @@ import urllib
import wikiteam
def mwCleanHTML(raw=''):
""" Extract only the real wiki content and remove rubbish """
""" This function is ONLY used to retrieve page titles and file names when no API is available """
""" DO NOT use this function to extract page content """
# different "tags" used by different MediaWiki versions to mark where
# starts and ends content
if re.search('<!-- bodytext -->', raw):
raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
elif re.search('<!-- start content -->', raw):
raw = raw.split(
'<!-- start content -->')[1].split('<!-- end content -->')[0]
elif re.search('<!-- Begin Content Area -->', raw):
raw = raw.split(
'<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
elif re.search('<!-- content -->', raw):
raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
elif re.search('<body class=', raw):
raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
else:
print(raw[:250])
sys.stderr.write('This wiki doesn\'t use marks to split content\n')
sys.exit()
return raw
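# A minimal illustrative sketch of the marker-based slicing above, assuming a
# skin that emits the "<!-- start content -->" pair (the sample HTML is invented):
sample_html = '<html><body><!-- start content --><a title="Main Page">Main Page</a><!-- end content --></body></html>'
sample_body = sample_html.split('<!-- start content -->')[1].split('<!-- end content -->')[0]
# sample_body is now '<a title="Main Page">Main Page</a>': only the content area
# remains, which is what the title and file name regexes are later run against.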
def mwCleanXML(xml=''):
""" Trim redundant info """
# do not touch XML codification, leave AS IS
if re.search(r'</siteinfo>\n', xml):
xml = xml.split('</siteinfo>\n')[1]
if re.search(r'</mediawiki>', xml):
xml = xml.split('</mediawiki>')[0]
return xml
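# A short sketch of the trimming above, on an invented Special:Export skeleton:
sample_export = '<mediawiki>\n<siteinfo>...</siteinfo>\n<page>...</page>\n</mediawiki>'
# mwCleanXML(sample_export) returns '<page>...</page>\n': header and footer are
# stripped so that per-page chunks can be concatenated into a single dump file.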
def mwCreateNewDump(config={}):
print('Trying to generate a new dump into a new directory...')
if config['xml']:
titles = mwGetPageTitles(config=config)
mwSavePageTitles(config=config, titles=titles)
mwGeneratePageDump(config=config, titles=titles)
checkXMLIntegrity(config=config, titles=titles)
if config['pages']:
pagetitles = mwGetPageTitles(config=config)
wikiteam.savePageTitles(config=config, pagetitles=pagetitles)
mwGeneratePageDump(config=config, pagetitles=pagetitles)
checkXMLIntegrity(config=config, pagetitles=pagetitles)
if config['images']:
images = mwGetImageNames(config=config)
mwSaveImageNames(config=config, images=images)
@@ -38,12 +75,11 @@ def mwCreateNewDump(config={}):
if config['logs']:
mwSaveLogs(config=config)
def mwGeneratePageDump(config={}, titles=[], start=None):
""" Generates a XML dump for a list of titles """
# TODO: titles is now unused.
print('Retrieving the XML for every page from "%s"' % (start or 'start'))
header, config = getXMLHeader(config=config)
def mwGeneratePageDump(config={}, pagetitles=None, start=None):
""" Generates a XML dump for page titles """
print('Retrieving XML for every page from "%s"' % (start or 'start'))
header = mwGetXMLHeader(config=config)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(config=config),
config['date'],
@@ -51,37 +87,37 @@ def mwGeneratePageDump(config={}, titles=[], start=None):
xmlfile = ''
lock = True
if start:
print("Removing the last chunk of past XML dump: it is probably incomplete.")
sys.stderr.write("Removing the last chunk of past XML dump: it is probably incomplete.\n")
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
xmlfile.write(header)
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
for pagetitle in mwGetPageTitles(config=config, start=start):
if not pagetitle.strip():
continue
if title == start: # start downloading from start, included
if pagetitle == start: # start downloading from start, included
lock = False
if lock:
continue
wikiteam.delay(config=config)
if c % 10 == 0:
print('Downloaded %d pages' % (c))
sys.stderr.write('Downloaded %d pages\n' % (c))
try:
for xml in getXMLPage(config=config, title=title):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
xmlfile.write(xml)
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
text='The page "%s" was missing in the wiki (probably deleted)' %
(title))
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
@@ -90,7 +126,7 @@ def mwGeneratePageDump(config={}, titles=[], start=None):
c += 1
xmlfile.write(footer)
xmlfile.close()
print('XML dump saved at...', xmlfilename)
sys.stderr.write('XML dump saved at... %s\n' % (xmlfilename))
def mwGetAPI(config={}):
""" Returns API for a MediaWiki wiki, if available """
@@ -138,18 +174,17 @@ def mwGetNamespacesAPI(config={}):
namespaces = config['namespaces']
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
params = {'action': 'query',
data = {'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
data = urllib.parse.urlencode(params).encode()
r = wikiteam.getURL(url=config['mwapi'], data=data)
result = wikiteam.getJSON(r)
wikiteam.delay(config=config)
if 'all' in namespaces:
namespaces = []
for i in result['query']['namespaces'].keys():
if int(i) < 0: # -1: Special, -2: Media, excluding
if int(i) < 0: # Skipping -1: Special, -2: Media
continue
namespaces.append(int(i))
namespacenames[int(i)] = result['query']['namespaces'][i]['*']
@@ -157,13 +192,11 @@ def mwGetNamespacesAPI(config={}):
# check if those namespaces really exist in this wiki
namespaces2 = []
for i in result['query']['namespaces'].keys():
bi = i
i = int(i)
if i < 0: # -1: Special, -2: Media, excluding
if int(i) < 0:
continue
if i in namespaces:
namespaces2.append(i)
namespacenames[i] = result['query']['namespaces'][bi]['*']
if int(i) in namespaces:
namespaces2.append(int(i))
namespacenames[int(i)] = result['query']['namespaces'][i]['*']
namespaces = namespaces2
else:
namespaces = [0]
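# A sketch of the siteinfo JSON shape the loop above relies on (values invented,
# the 'query' -> 'namespaces' layout is the standard API response):
sample_result = {'query': {'namespaces': {
    '-2': {'id': -2, '*': 'Media'},
    '-1': {'id': -1, '*': 'Special'},
    '0': {'id': 0, '*': ''},
    '1': {'id': 1, '*': 'Talk'}}}}
sample_namespaces = [int(i) for i in sample_result['query']['namespaces'] if int(i) >= 0]
# sample_namespaces == [0, 1]; Special (-1) and Media (-2) are virtual namespaces
# with no stored pages, which is why negative ids are skipped above.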
@@ -254,6 +287,276 @@ def mwGetPageTitlesAPI(config={}):
wikiteam.delay(config=config)
sys.stderr.write(' %d titles retrieved in namespace %d\n' % (c, namespace))
def mwGetPageTitlesScraper(config={}):
""" Scrape list of page titles from Special:Allpages """
pagetitles = []
namespaces, namespacenames = mwGetNamespacesScraper(
config=config)
for namespace in namespaces:
sys.stderr.write(' Retrieving titles in namespace %s\n' % (namespace))
url = '%s?title=Special:Allpages&namespace=%s' % (
config['index'], namespace)
raw = wikiteam.getURL(url=url)
raw = mwCleanHTML(raw)
r_title = r'title="(?P<title>[^>]+)">'
r_suballpages = ''
r_suballpages1 = r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
r_suballpages3 = r'&amp;from=(?P<from>[^>]+)" title="[^>]+">'
if re.search(r_suballpages1, raw):
r_suballpages = r_suballpages1
elif re.search(r_suballpages2, raw):
r_suballpages = r_suballpages2
elif re.search(r_suballpages3, raw):
r_suballpages = r_suballpages3
else:
pass # perhaps no subpages
# 3 is the current depth of Special:Allpages on the English Wikipedia
deep = 3
c = 0
checked_suballpages = []
rawacum = raw
while r_suballpages and re.search(r_suballpages, raw) and c < deep:
# load sub-Allpages
m = re.compile(r_suballpages).finditer(raw)
for i in m:
fr = i.group('from')
if r_suballpages == r_suballpages1:
to = i.group('to')
name = '%s-%s' % (fr, to)
url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
config['index'], namespace, fr, to) # do not put urllib.quote in fr or to
# FIXME: does this regexp miss some entries? or does r_title fail on
# this kind of subpage? (wikiindex)
elif r_suballpages == r_suballpages2:
# clean &amp;namespace=\d, sometimes happens
fr = fr.split('&amp;namespace=')[0]
name = fr
url = '%s?title=Special:Allpages/%s&namespace=%s' % (
config['index'], name, namespace)
elif r_suballpages == r_suballpages3:
fr = fr.split('&amp;namespace=')[0]
name = fr
url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
config['index'], name, namespace)
if name not in checked_suballpages:
# avoid reloading duplicate subpage links
checked_suballpages.append(name)
wikiteam.delay(config=config)
raw2 = wikiteam.getURL(url=url)
raw2 = mwCleanHTML(raw2)
rawacum += raw2 # merge it once the junk has been removed
sys.stderr.write(' Reading %s, %s bytes, %d subpages, %d pages\n' % (name, len(raw2), \
len(re.findall(r_suballpages, raw2)), \
len(re.findall(r_title, raw2))))
wikiteam.delay(config=config)
c += 1
c = 0
m = re.compile(r_title).finditer(rawacum)
for i in m:
t = wikiteam.undoHTMLEntities(text=i.group('title'))
if not t.startswith('Special:'):
if t not in pagetitles:
pagetitles.append(t)
c += 1
sys.stderr.write(' %d titles retrieved in the namespace %d\n' % (c, namespace))
return pagetitles
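# A short sketch of the title extraction above, run on an invented
# Special:Allpages table row:
sample_row = '<td><a href="/index.php/Foo_%26_Bar" title="Foo &amp; Bar">Foo &amp; Bar</a></td>'
sample_titles = [wikiteam.undoHTMLEntities(text=m.group('title'))
                 for m in re.finditer(r'title="(?P<title>[^>]+)">', sample_row)]
# sample_titles == ['Foo & Bar']; entities are undone here, and Special:* links
# are filtered out by the loop above.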
def mwGetXMLHeader(config={}):
""" Retrieve a random page to extract XML header (namespace info, etc) """
pagetitle = 'Main_Page'
try:
xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
except ExportAbortedError:
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
try:
if config['mwapi']:
sys.stderr.write("Trying the local name for the Special namespace instead\n")
xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml):
sys.stderr.write('XML export on this wiki is broken, quitting.\n')
logerror(config=config, text='XML export on this wiki is broken, quitting.')
sys.exit()
return header
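# A hedged sketch of how the localized Special namespace name could be resolved
# for the ExportAbortedError retry above, mirroring the siteinfo query used
# elsewhere in this commit (the '-1' entry is the Special namespace). The helper
# name mwGuessExportTitle is hypothetical, not part of this module:
def mwGuessExportTitle(config={}):
    data = {'action': 'query', 'meta': 'siteinfo',
            'siprop': 'namespaces', 'format': 'json'}
    r = wikiteam.getURL(url=config['mwapi'], data=data)
    result = wikiteam.getJSON(r)
    # e.g. on a Spanish wiki this yields 'Especial:Export'
    return result['query']['namespaces']['-1']['*'] + ':Export'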
def mwGetXMLPage(config={}, pagetitle='', verbose=True):
""" Get the full history (or current only) of a page """
# if server errors occur while retrieving the full page history, it may return [oldest OK versions] + the last version, excluding middle revisions, so the history would be partially truncated
# http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
limit = 1000
truncated = False
pagetitle_ = re.sub(' ', '_', pagetitle)
# do not convert & into %26, pagetitle_ = re.sub('&', '%26', pagetitle_)
data = {'title': config['mwexport'], 'pages': pagetitle_, 'action': 'submit'}
if config['curonly']:
data['curonly'] = 1
data['limit'] = 1
else:
data['offset'] = '1' # '1' is always earlier than any real (2000s) timestamp
data['limit'] = limit
# otherwise, do not set data['templates']
if 'templates' in config and config['templates']: # FIXME: what is this option for?
data['templates'] = 1
xml = mwGetXMLPageCore(config=config, data=data)
if not xml:
raise ExportAbortedError(config['index'])
if not "</page>" in xml:
raise PageMissingError(data['title'], xml)
else:
# strip these sha1s sums which keep showing up in the export and
# which are invalid for the XML schema (they only apply to
# revisions)
xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
yield xml.split("</page>")[0]
# if complete history, check if this page history has > limit edits,
# if so, retrieve all revisions using offset if available
# else, warning about Special:Export truncating large page histories
r_timestamp = r'<timestamp>([^<]+)</timestamp>'
numedits = 0
numedits += len(re.findall(r_timestamp, xml))
# search for timestamps in xml to avoid analysing empty pages like
# Special:Allpages and the random one
if not config['curonly'] and re.search(r_timestamp, xml):
while not truncated and data['offset']: # next chunk
# get the last timestamp from the acum XML
# assuming history is sorted chronologically
data['offset'] = re.findall(r_timestamp, xml)[-1]
try:
xml2 = mwGetXMLPageCore(config=config, data=data)
except MemoryError:
sys.stderr.write("Page history exceeds our memory, halving limit.\n")
data['limit'] = data['limit'] // 2
continue
# are there more edits in this next XML chunk or no <page></page>?
if re.findall(r_timestamp, xml2):
if re.findall(r_timestamp, xml2)[-1] == data['offset']:
# again the same XML, this wiki does not support params in
# Special:Export, offer complete XML up to X edits (usually
# 1000)
sys.stderr.write('ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated\n')
truncated = True
break
else:
""" </namespaces>
</siteinfo>
<page>
<title>Main Page</title>
<id>15580374</id>
<restrictions>edit=sysop:move=sysop</restrictions> (?)
<revision>
<id>418009832</id>
<timestamp>2011-03-09T19:57:06Z</timestamp>
<contributor>
"""
# offset is OK in this wiki, merge with the previous chunk
# of this page history and continue
try:
xml2 = xml2.split("</page>")[0]
yield ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
except MemoryError:
sys.stderr.write("Page's history exceeds our memory, halving limit.\n")
data['limit'] = data['limit'] // 2
continue
xml = xml2
numedits += len(re.findall(r_timestamp, xml))
else:
data['offset'] = '' # no more edits in this page history
yield "</page>\n"
if verbose:
if numedits == 1:
sys.stderr.write(' %s, 1 edit\n' % (pagetitle))
else:
sys.stderr.write(' %s, %d edits\n' % (pagetitle, numedits))
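# A sketch of the Special:Export pagination used above, assuming the wiki honours
# the 'offset'/'limit' parameters (names as in the data dict built above):
sample_data = {'title': 'Special:Export', 'pages': 'Main_Page',
               'action': 'submit', 'offset': '1', 'limit': 1000}
# The first POST returns the oldest (up to) 1000 revisions; the last <timestamp>
# of that chunk becomes the next offset:
#   sample_data['offset'] = re.findall(r'<timestamp>([^<]+)</timestamp>', chunk)[-1]
# and the request is repeated until no newer timestamp appears, or until the same
# timestamp comes back (the wiki ignores the parameters, so the history is truncated).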
def mwGetXMLPageCore(config={}, data={}):
""" Returns a XML containing data['limit'] revisions (or current only), ending in </mediawiki>
if retrieving data['limit'] revisions fails, returns current only version
if all fail, returns empty string
"""
xml = ''
cretries = 0
maxseconds = 100 # max seconds to wait in a single sleep
maxretries = config['retries'] # x retries and exit
increment = 20 # increment seconds every retry
while not re.search(r'</mediawiki>', xml):
if cretries > 0 and cretries < maxretries:
wait = min(increment * cretries, maxseconds) # incremental wait, capped at maxseconds
sys.stderr.write(' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...\n' % (cretries, data['pages'], wait))
time.sleep(wait)
# reduce server load by requesting smaller chunks (if curonly, then
# limit = 1 is already set by the calling function)
if data['limit'] > 1:
data['limit'] = data['limit'] // 2 # halve it
if cretries >= maxretries:
sys.stderr.write(' We have retried %d times\n' % (cretries))
sys.stderr.write(' MediaWiki error for "%s", probably network error...\n' % (data['pages']))
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last,
# data['curonly'] should mean that we've already tried this
# fallback, because it's set by the following if and passed to
# mwGetXMLPageCore
if not config['curonly'] and 'curonly' not in data:
sys.stderr.write(' Trying to save only the last revision for this page...\n')
data['curonly'] = 1
logerror(
config=config,
text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' %
(data['pages'])
)
return mwGetXMLPageCore(config=config, data=data)
else:
sys.stderr.write(' Saving in error log, skipping...\n')
logerror(
config=config,
text='Error while retrieving last revision of "%s". Skipping.\n' %
(data['pages']))
raise ExportAbortedError(config['index'])
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
r = wikiteam.getURL(url=config['index'], data=data)
#handleStatusCode(r)
#r = fixBOM(r)
xml = fixBOM(r)
except:
sys.stderr.write(' Connection error\n')
xml = ''
cretries += 1
return xml
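# The retry wait above grows linearly and is capped; a minimal sketch of that
# schedule with the defaults above (increment = 20, maxseconds = 100):
sample_waits = [min(20 * retry, 100) for retry in range(1, 8)]
# sample_waits == [20, 40, 60, 80, 100, 100, 100]; each failed attempt also
# halves data['limit'] so the server is asked for smaller chunks.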
def main():
pass

@@ -32,13 +32,19 @@ import urllib
__version__ = "0.3.1"
"""
Stuff to check whether it works properly, or re-add if needed:
* fixBOM
* sessions
"""
def avoidWikimediaProjects(config={}):
""" Skip Wikimedia projects and redirect to the dumps website """
# notice about wikipedia dumps
if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['wiki']):
sys.stderr.write('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!')
sys.stderr.write('Download Wikimedia dumps from https://dumps.wikimedia.org')
sys.stderr.write('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\n')
sys.stderr.write('Download Wikimedia dumps from https://dumps.wikimedia.org\n')
"""if not other['force']:
print 'Thanks!'
sys.exit()"""
@@ -61,7 +67,7 @@ def createNewDump(config={}):
import wikispaces
wikispaces.wsCreateNewDump(config=config)
else:
sys.stderr.write("Wikiengine %s not supported. Exiting." % (config['wikiengine']))
sys.stderr.write("Wikiengine %s not supported. Exiting.\n" % (config['wikiengine']))
def createDumpPath(config={}):
# creating path or resuming if desired
@@ -70,7 +76,7 @@ def createDumpPath(config={}):
originalpath = config['path']
# do not enter if resume is requested from the beginning
while not config['other']['resume'] and os.path.isdir(config['path']):
sys.stderr.write('\nWarning!: "%s" path exists' % (config['path']))
sys.stderr.write('\nWarning!: "%s" path exists\n' % (config['path']))
reply = ''
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = input(
@@ -80,16 +86,16 @@ def createDumpPath(config={}):
config['other']['configfilename']))
if reply.lower() in ['yes', 'y']:
if not os.path.isfile('%s/%s' % (config['path'], config['other']['configfilename'])):
sys.stderr.write('No config file found. I can\'t resume. Aborting.')
sys.stderr.write('No config file found. I can\'t resume. Aborting.\n')
sys.exit()
sys.stderr.write('You have selected: YES')
sys.stderr.write('You have selected: YES\n')
config['other']['resume'] = True
break
elif reply.lower() in ['no', 'n']:
sys.stderr.write('You have selected: NO')
sys.stderr.write('You have selected: NO\n')
config['other']['resume'] = False
config['path'] = '%s-%d' % (originalpath, c)
sys.stderr.write('Trying to use path "%s"...' % (config['path']))
sys.stderr.write('Trying to use path "%s"...\n' % (config['path']))
c += 1
return config
@@ -270,21 +276,21 @@ def getParameters(params=[]):
# Not wiki? Exit
if not args.wiki:
sys.stderr.write('ERROR: Provide a URL to a wiki')
sys.stderr.write('ERROR: Provide a URL to a wiki\n')
parser.print_help()
sys.exit(1)
# Don't mix download params and meta info params
if (args.pages or args.images) and \
(args.get_api or args.get_index or args.get_page_titles or args.get_image_names or args.get_wiki_engine):
sys.stderr.write('ERROR: Don\'t mix download params and meta info params')
sys.stderr.write('ERROR: Don\'t mix download params and meta info params\n')
parser.print_help()
sys.exit(1)
# No download params and no meta info params? Exit
if (not args.pages and not args.images) and \
(not args.get_api and not args.get_index and not args.get_page_titles and not args.get_image_names and not args.get_wiki_engine):
sys.stderr.write('ERROR: Use at least one download param or meta info param')
sys.stderr.write('ERROR: Use at least one download param or meta info param\n')
parser.print_help()
sys.exit(1)
@@ -292,11 +298,11 @@ def getParameters(params=[]):
cj = cookielib.MozillaCookieJar()
if args.cookies:
cj.load(args.cookies)
sys.stderr.write('Using cookies from %s' % args.cookies)
sys.stderr.write('Using cookies from %s\n' % args.cookies)
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
sys.stderr.write('ERROR: Both --user and --pass are required for authentication.')
sys.stderr.write('ERROR: Both --user and --pass are required for authentication.\n')
parser.print_help()
sys.exit(1)
@@ -338,7 +344,7 @@ def getParameters(params=[]):
if re.search(
r'[^\d, \-]',
args.namespaces) and args.namespaces.lower() != 'all':
sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas")
sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas\n")
sys.exit()
else:
ns = re.sub(' ', '', args.namespaces)
@@ -350,12 +356,12 @@ def getParameters(params=[]):
# Process namespace exclusions
if args.exnamespaces:
if re.search(r'[^\d, \-]', args.exnamespaces):
sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas")
sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas\n")
sys.exit(1)
else:
ns = re.sub(' ', '', args.exnamespaces)
if ns.lower() == 'all':
sys.stderr.write('You cannot exclude all namespaces.')
sys.stderr.write('You cannot exclude all namespaces.\n')
sys.exit(1)
else:
exnamespaces = [int(i) for i in ns.split(',')]
@@ -394,15 +400,25 @@ def getParameters(params=[]):
# Set up engine-specific variables (API for MediaWiki, etc.)
if config['wikiengine'] == 'mediawiki':
import mediawiki
config['mwexport'] = 'Special:Export'
if not args.mwapi:
config['mwapi'] = mediawiki.mwGetAPI(config=config)
if not config['mwapi']:
sys.stderr.write('ERROR: Provide a URL to API')
sys.stderr.write('ERROR: Provide a URL to API\n')
sys.exit(1)
else:
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
r = getURL(config['mwapi'], data=data)
config['mwexport'] = getJSON(r)['query']['namespaces']['-1']['*'] \
+ ':Export'
if not args.mwindex:
config['mwindex'] = mediawiki.mwGetIndex(config=config)
if not config['mwindex']:
sys.stderr.write('ERROR: Provide a URL to Index.php')
sys.stderr.write('ERROR: Provide a URL to Index.php\n')
sys.exit(1)
elif config['wikiengine'] == 'wikispaces':
import wikispaces
@@ -415,14 +431,14 @@ def getParameters(params=[]):
return config
def getURL(url='', data=None):
# FIXME: perhaps if config were passed in, we could check whether a session field is defined and use it when appropriate
html = ''
req = urllib.request.Request(url, headers={ 'User-Agent': 'Mozilla/5.0' })
html = urllib.request.urlopen(req, data=data).read().decode().strip()
try:
data = urllib.parse.urlencode(data).encode() if data else None
req = urllib.request.Request(url, headers={ 'User-Agent': 'Mozilla/5.0' })
html = urllib.request.urlopen(req, data=data).read().decode().strip()
except:
sys.stderr.write("Error while retrieving URL", url)
sys.stderr.write("Error while retrieving URL: %s\n" % url)
sys.exit()
return html
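# A short sketch of what the urlencode step above produces (invented values):
sample_data = {'action': 'query', 'format': 'json'}
sample_body = urllib.parse.urlencode(sample_data).encode()
# sample_body == b'action=query&format=json'; urlopen() sends the request as a
# POST whenever a non-None data argument like this is passed.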
@@ -517,36 +533,42 @@ def getWikiEngine(url=''):
return wikiengine.lower()
def fixBOM(r):
""" Strip the leading Unicode BOM (expects a requests-style response object) """
if r.text.startswith(u'\ufeff'):
r.encoding = 'utf-8-sig'
return r.text
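# If the response ever arrives as a plain str (as getURL() above returns),
# an equivalent would be stripping the BOM from the string itself; a sketch:
sample_text = '\ufeffpage body'
sample_clean = sample_text.lstrip('\ufeff')
# sample_clean == 'page body'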
def handleStatusCode(response):
statuscode = response.status_code
if statuscode >= 200 and statuscode < 300:
return
sys.stderr.write("HTTP Error %d." % statuscode)
sys.stderr.write("HTTP Error %d.\n" % statuscode)
if statuscode >= 300 and statuscode < 400:
sys.stderr.write("Redirect should happen automatically: please report this as a bug.")
sys.stderr.write(response.url)
sys.stderr.write("Redirect should happen automatically: please report this as a bug.\n")
sys.stderr.write('%s\n' % response.url)
elif statuscode == 400:
sys.stderr.write("Bad Request: The wiki may be malfunctioning.")
sys.stderr.write("Please try again later.")
sys.stderr.write(response.url)
sys.stderr.write("Bad Request: The wiki may be malfunctioning.\n")
sys.stderr.write("Please try again later.\n")
sys.stderr.write('%s\n' % response.url)
sys.exit(1)
elif statuscode == 401 or statuscode == 403:
sys.stderr.write("Authentication required.")
sys.stderr.write("Please use --userpass.")
sys.stderr.write(response.url)
sys.stderr.write("Authentication required.\n")
sys.stderr.write("Please use --userpass.\n")
sys.stderr.write('%s\n' % response.url)
elif statuscode == 404:
sys.stderr.write("Not found. Is Special:Export enabled for this wiki?")
sys.stderr.write(response.url)
sys.stderr.write("Not found. Is Special:Export enabled for this wiki?\n")
sys.stderr.write('%s\n' % response.url)
sys.exit(1)
elif statuscode == 429 or (statuscode >= 500 and statuscode < 600):
sys.stderr.write("Server error, max retries exceeded.")
sys.stderr.write("Please resume the dump later.")
sys.stderr.write(response.url)
sys.stderr.write("Server error, max retries exceeded.\n")
sys.stderr.write("Please resume the dump later.\n")
sys.stderr.write('%s\n' % response.url)
sys.exit(1)
def resumePreviousDump(config={}):
@@ -557,7 +579,7 @@ def resumePreviousDump(config={}):
import wikispaces
wikispaces.wsResumePreviousDump(config=config)
else:
sys.stderr.write("Wikiengine %s not supported. Exiting." % (config['wikiengine']))
sys.stderr.write("Wikiengine %s not supported. Exiting.\n" % (config['wikiengine']))
def saveConfig(config={}):
""" Save config file """
@@ -566,19 +588,48 @@ def saveConfig(config={}):
config2 = config.copy()
config2['other'] = {}
with open('%s/%s' % (config['path'], config['other']['configfilename']), 'w') as outfile:
sys.stderr.write('Saving config file...')
sys.stderr.write('Saving config file...\n')
try: #str
cPickle.dump(config2, outfile)
except: #bytes
with open('%s/%s' % (config['path'], config['other']['configfilename']), 'wb') as outfile:
cPickle.dump(config2, outfile)
def savePageTitles(config={}, pagetitles=None):
pagetitlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
with open('%s/%s' % (config['path'], pagetitlesfilename), 'wt') as f:
for pagetitle in pagetitles:
output = '%s\n' % (pagetitle)
f.write(output)
# TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
# main namespace and widget namespace.
# We can use sort -u in UNIX, but is it worth it?
f.write('--END--\n')
f.close()
sys.stderr.write('Page titles saved at... %s\n' % (pagetitlesfilename))
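# Regarding the duplicate question in the comment above: the in-Python equivalent
# of `sort -u` is a one-liner, at the cost of losing the original crawl order; a sketch:
sample_titles = ['Main Page', 'Widget:AddThis', 'Widget:AddThis']
sample_unique = sorted(set(sample_titles))
# sample_unique == ['Main Page', 'Widget:AddThis']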
def undoHTMLEntities(text=''):
""" Undo some HTML codes """
# probably only < > & " ' need conversion here
# http://www.w3schools.com/html/html_entities.asp
text = re.sub('&lt;', '<', text)
text = re.sub('&gt;', '>', text)
text = re.sub('&amp;', '&', text)
text = re.sub('&quot;', '"', text)
text = re.sub('&#039;', '\'', text)
return text
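# For reference, the standard library has covered these entities (and all others)
# since Python 3.4; a sketch of the equivalent call:
import html
sample = html.unescape('&lt;b&gt;Foo &amp; &quot;Bar&#039;s&quot;&lt;/b&gt;')
# sample == '<b>Foo & "Bar\'s"</b>'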
def welcome():
""" Print opening message """
message = """
#########################################################################
# Welcome to WikiTeam's tools v%s (GPL v3) #
# Tools for downloading and preserving wikis #
# More info at: https://github.com/WikiTeam/wikiteam #
#########################################################################
@@ -605,10 +656,10 @@ def loadConfig(config={}):
try:
with open('%s/%s' % (config['path'], config['other']['configfilename']), 'r') as infile:
sys.stderr.write('Loading config file...')
sys.stderr.write('Loading config file...\n')
config = cPickle.load(infile)
except:
sys.stderr.write('ERROR: There is no config file. we can\'t resume. Start a new dump.')
sys.stderr.write('ERROR: There is no config file. we can\'t resume. Start a new dump.\n')
sys.exit()
return config
@@ -616,17 +667,16 @@ def main(params=[]):
def main(params=[]):
""" Main function """
welcome()
config = getParameters(params=params)
avoidWikimediaProjects(config=config)
config = createDumpPath(config=config)
if config['other']['resume']:
# Resume dump
welcome()
config = loadConfig(config=config)
resumePreviousDump(config=config)
elif config['pages'] or config['images'] or config['logs']:
# New dump
welcome()
os.mkdir(config['path'])
saveConfig(config=config)
createNewDump(config=config)
@@ -639,7 +689,7 @@ def main(params=[]):
elif config['metainfo'] == 'get_page_titles':
printPageTitles(config=config)
elif config['metainfo'] == 'get_image_names':
printGetImageNames(config=config))
printGetImageNames(config=config)
elif config['metainfo'] == 'get_wiki_engine':
sys.stdout.write(config['wikiengine'])
sys.exit()
