diff --git a/wikiteam/mediawiki.py b/wikiteam/mediawiki.py
index 9bb8797..b6d473a 100644
--- a/wikiteam/mediawiki.py
+++ b/wikiteam/mediawiki.py
@@ -24,13 +24,50 @@ import urllib
 
 import wikiteam
 
+def mwCleanHTML(raw=''):
+    """ Extract only the real wiki content and remove rubbish """
+    """ This function is ONLY used to retrieve page titles and file names when no API is available """
+    """ DO NOT use this function to extract page content """
+
+    # different "tags" used by different MediaWiki versions to mark where
+    # content starts and ends
+    if re.search('<!-- bodytext -->', raw):
+        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
+    elif re.search('<!-- start content -->', raw):
+        raw = raw.split(
+            '<!-- start content -->')[1].split('<!-- end content -->')[0]
+    elif re.search('<!-- Begin Content Area -->', raw):
+        raw = raw.split(
+            '<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
+    elif re.search('<!-- content -->', raw):
+        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
+    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
+        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
+    elif re.search('<body class=', raw):
+        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
+    else:
+        sys.stderr.write(raw[:250] + '\n')
+        sys.stderr.write('This wiki doesn\'t use marks to split content\n')
+        sys.exit()
+    return raw
+
+def mwCleanXML(xml=''):
+    """ Trim redundant info """
+
+    # do not touch XML codification, leave AS IS
+    if re.search(r'</siteinfo>\n', xml):
+        xml = xml.split('</siteinfo>\n')[1]
+    if re.search(r'</mediawiki>', xml):
+        xml = xml.split('</mediawiki>')[0]
+    return xml
+
 def mwCreateNewDump(config={}):
     print('Trying generating a new dump into a new directory...')
-    if config['xml']:
-        titles = mwGetPageTitles(config=config)
-        mwSavePageTitles(config=config, images=images)
-        mwGeneratePageDump(config=config, titles=titles)
-        checkXMLIntegrity(config=config, titles=titles)
+    if config['pages']:
+        pagetitles = mwGetPageTitles(config=config)
+        wikiteam.savePageTitles(config=config, pagetitles=pagetitles)
+        mwGeneratePageDump(config=config, pagetitles=pagetitles)
+        checkXMLIntegrity(config=config, pagetitles=pagetitles)
     if config['images']:
         images = mwGetImageNames(config=config)
         mwSaveImageNames(config=config, images=images)
@@ -38,12 +75,11 @@ def mwCreateNewDump(config={}):
     if config['logs']:
         mwSaveLogs(config=config)
 
-def mwGeneratePageDump(config={}, titles=[], start=None):
-    """ Generates a XML dump for a list of titles """
-    # TODO: titles is now unused.
-
-    print('Retrieving the XML for every page from "%s"' % (start or 'start'))
-    header, config = getXMLHeader(config=config)
+def mwGeneratePageDump(config={}, pagetitles=None, start=None):
+    """ Generates a XML dump for page titles """
+
+    print('Retrieving XML for every page from "%s"' % (start or 'start'))
+    header = mwGetXMLHeader(config=config)
     footer = '</mediawiki>\n'  # new line at the end
     xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(config=config),
                                     config['date'],
@@ -51,37 +87,37 @@
     xmlfile = ''
     lock = True
     if start:
-        print("Removing the last chunk of past XML dump: it is probably incomplete.")
+        sys.stderr.write("Removing the last chunk of past XML dump: it is probably incomplete.\n")
         for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
             pass
     else:
         # requested complete xml dump
         lock = False
         xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
-        xmlfile.write(header.encode('utf-8'))
+        xmlfile.write(header)
         xmlfile.close()
 
     xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
     c = 1
-    for title in readTitles(config, start):
-        if not title.strip():
+    for pagetitle in mwGetPageTitles(config=config, start=start):
+        if not pagetitle.strip():
             continue
-        if title == start:  # start downloading from start, included
+        if pagetitle == start:  # start downloading from start, included
             lock = False
         if lock:
             continue
         wikiteam.delay(config=config)
         if c % 10 == 0:
-            print('Downloaded %d pages' % (c))
+            sys.stderr.write('Downloaded %d pages\n' % (c))
         try:
-            for xml in getXMLPage(config=config, title=title):
-                xml = cleanXML(xml=xml)
-                xmlfile.write(xml.encode('utf-8'))
+            for xml in mwGetXMLPage(config=config, pagetitle=pagetitle):
+                xml = mwCleanXML(xml=xml)
+                xmlfile.write(xml)
         except PageMissingError:
             logerror(
                 config=config,
-                text=u'The page "%s" was missing in the wiki (probably deleted)' %
-                (title.decode('utf-8'))
+                text='The page "%s" was missing in the wiki (probably deleted)' %
+                (pagetitle)
             )
         # here, XML is a correct chunk or
         # an empty string due to a deleted page (logged in errors log) or
@@ -90,7 +126,7 @@ def mwGeneratePageDump(config={}, titles=[], start=None):
         c += 1
     xmlfile.write(footer)
    xmlfile.close()
-    print('XML dump saved at...', xmlfilename)
+    sys.stderr.write('XML dump saved at... %s\n' % (xmlfilename))
 
 def mwGetAPI(config={}):
     """ Returns API for a MediaWiki wiki, if available """
@@ -138,18 +174,17 @@
 def mwGetNamespacesAPI(config={}):
     namespaces = config['namespaces']
     namespacenames = {0: ''}  # main is 0, no prefix
     if namespaces:
-        params = {'action': 'query',
+        data = {'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'namespaces',
                 'format': 'json'}
-        data = urllib.parse.urlencode(params).encode()
         r = wikiteam.getURL(url=config['mwapi'], data=data)
         result = wikiteam.getJSON(r)
         wikiteam.delay(config=config)
         if 'all' in namespaces:
             namespaces = []
             for i in result['query']['namespaces'].keys():
-                if int(i) < 0:  # -1: Special, -2: Media, excluding
+                if int(i) < 0:  # Skipping -1: Special, -2: Media
                     continue
                 namespaces.append(int(i))
                 namespacenames[int(i)] = result['query']['namespaces'][i]['*']
@@ -157,13 +192,11 @@
             # check if those namespaces really exist in this wiki
             namespaces2 = []
             for i in result['query']['namespaces'].keys():
-                bi = i
-                i = int(i)
-                if i < 0:  # -1: Special, -2: Media, excluding
+                if int(i) < 0:
                     continue
-                if i in namespaces:
-                    namespaces2.append(i)
-                    namespacenames[i] = result['query']['namespaces'][bi]['*']
+                if int(i) in namespaces:
+                    namespaces2.append(int(i))
+                    namespacenames[int(i)] = result['query']['namespaces'][i]['*']
             namespaces = namespaces2
     else:
         namespaces = [0]
@@ -254,6 +287,276 @@ def mwGetPageTitlesAPI(config={}):
             wikiteam.delay(config=config)
         sys.stderr.write(' %d titles retrieved in namespace %d\n' % (c, namespace))
+
+def mwGetPageTitlesScraper(config={}):
+    """ Scrape the list of page titles from Special:Allpages """
+
+    pagetitles = []
+    namespaces, namespacenames = mwGetNamespacesScraper(
+        config=config)
+    for namespace in namespaces:
+        sys.stderr.write(' Retrieving titles in namespace %s\n' % (namespace))
+        url = '%s?title=Special:Allpages&namespace=%s' % (
+            config['index'], namespace)
+        raw = wikiteam.getURL(url=url)
+        raw = mwCleanHTML(raw)
+
+        r_title = r'title="(?P<title>[^>]+)">'
+        r_suballpages = ''
+        r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
+        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
+        r_suballpages3 = r'&from=(?P<from>[^>]+)" title="[^>]+">'
+        if re.search(r_suballpages1, raw):
+            r_suballpages = r_suballpages1
+        elif re.search(r_suballpages2, raw):
+            r_suballpages = r_suballpages2
+        elif re.search(r_suballpages3, raw):
+            r_suballpages = r_suballpages3
+        else:
+            pass  # perhaps no subpages
+
+        # 3 is the current depth of English Wikipedia for Special:Allpages
+        deep = 3
+        c = 0
+        checked_suballpages = []
+        rawacum = raw
+        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
+            # load sub-Allpages
+            m = re.compile(r_suballpages).finditer(raw)
+            for i in m:
+                fr = i.group('from')
+
+                if r_suballpages == r_suballpages1:
+                    to = i.group('to')
+                    name = '%s-%s' % (fr, to)
+                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
+                        config['index'], namespace, fr, to)  # do not put urllib.quote in fr or to
+                    # FIXME: does this regexp miss some pages? or does r_title
+                    # fail on this kind of subpage? (wikiindex)
+                elif r_suballpages == r_suballpages2:
+                    # clean &namespace=\d, sometimes happens
+                    fr = fr.split('&namespace=')[0]
+                    name = fr
+                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
+                        config['index'], name, namespace)
+                elif r_suballpages == r_suballpages3:
+                    fr = fr.split('&namespace=')[0]
+                    name = fr
+                    url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
+                        config['index'], name, namespace)
+
+                if name not in checked_suballpages:
+                    # to avoid reloading duplicate subpage links
+                    checked_suballpages.append(name)
+                    wikiteam.delay(config=config)
+                    raw2 = wikiteam.getURL(url=url)
+                    raw2 = mwCleanHTML(raw2)
+                    rawacum += raw2  # merge it after the junk has been removed
+                    sys.stderr.write(' Reading %s, %s bytes, %d subpages, %d pages\n' % (name, len(raw2), \
+                        len(re.findall(r_suballpages, raw2)), \
+                        len(re.findall(r_title, raw2))))
+
+                wikiteam.delay(config=config)
+            c += 1
+
+        c = 0
+        m = re.compile(r_title).finditer(rawacum)
+        for i in m:
+            t = wikiteam.undoHTMLEntities(text=i.group('title'))
+            if not t.startswith('Special:'):
+                if t not in pagetitles:
+                    pagetitles.append(t)
+                    c += 1
+        sys.stderr.write(' %d titles retrieved in the namespace %d\n' % (c, namespace))
+    return pagetitles
+
+def mwGetXMLHeader(config={}):
+    """ Retrieve a random page to extract the XML header (namespace info, etc) """
+
+    pagetitle = 'Main_Page'
+    try:
+        xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
+    except PageMissingError as pme:
+        # The <page> does not exist. Not a problem, if we get the <siteinfo>.
+        xml = pme.xml
+    except ExportAbortedError:
+        # Issue 26: Account for missing "Special" namespace.
+        # Hope the canonical special name has not been removed.
+        # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
+        try:
+            if config['mwapi']:
+                sys.stderr.write("Trying the local name for the Special namespace instead\n")
+                xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
+        except PageMissingError as pme:
+            xml = pme.xml
+        except ExportAbortedError:
+            pass
+
+    header = xml.split('</mediawiki>')[0]
+    if not re.match(r"\s*<mediawiki", xml):
+        sys.stderr.write('XML export on this wiki is broken, quitting.\n')
+        logerror(config=config, text='XML export on this wiki is broken, quitting.')
+        sys.exit()
+    return header
+
+def mwGetXMLPage(config={}, pagetitle='', verbose=True):
+    """ Get the full history (or current only) of a page """
+
+    # if server errors occur while retrieving the full page history, it may return
+    # [oldest OK versions] + last version, excluding middle revisions, so it would
+    # be partially truncated
+    # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
+
+    limit = 1000
+    truncated = False
+    pagetitle_ = re.sub(' ', '_', pagetitle)
+    # do not convert & into %26, pagetitle_ = re.sub('&', '%26', pagetitle_)
+    data = {'title': config['mwexport'], 'pages': pagetitle_, 'action': 'submit'}
+    if config['curonly']:
+        data['curonly'] = 1
+        data['limit'] = 1
+    else:
+        data['offset'] = '1'  # 1 always < 2000s
+        data['limit'] = limit
+    # in other case, do not set data['templates']
+    if 'templates' in config and config['templates']:  # FIXME: what is this option for?
+        data['templates'] = 1
+
+    xml = mwGetXMLPageCore(config=config, data=data)
+    if not xml:
+        raise ExportAbortedError(config['index'])
+    if not "</page>" in xml:
+        raise PageMissingError(data['title'], xml)
+    else:
+        # strip these sha1 sums which keep showing up in the export and
+        # which are invalid for the XML schema (they only apply to
+        # revisions)
+        xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
+        xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+
+    yield xml.split("</page>")[0]
+
+    # if complete history, check if this page history has > limit edits;
+    # if so, retrieve all revisions using offset if available,
+    # else warn that Special:Export truncates large page histories
+    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+    numedits = 0
+    numedits += len(re.findall(r_timestamp, xml))
+
+    # search for timestamps in xml to avoid analysing empty pages like
+    # Special:Allpages and the random one
+    if not config['curonly'] and re.search(r_timestamp, xml):
+        while not truncated and data['offset']:  # next chunk
+            # get the last timestamp from the acum XML
+            # assuming history is sorted chronologically
+            data['offset'] = re.findall(r_timestamp, xml)[-1]
+            try:
+                xml2 = mwGetXMLPageCore(config=config, data=data)
+            except MemoryError:
+                sys.stderr.write("Page history exceeds our memory, halving limit.\n")
+                data['limit'] = data['limit'] / 2
+                continue
+
+            # are there more edits in this next XML chunk or no <page></page>?
+            if re.findall(r_timestamp, xml2):
+                if re.findall(r_timestamp, xml2)[-1] == data['offset']:
+                    # again the same XML, this wiki does not support params in
+                    # Special:Export, offer complete XML up to X edits (usually
+                    # 1000)
+                    sys.stderr.write('ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated\n')
+                    truncated = True
+                    break
+                else:
+                    """    </namespaces>
+                    </siteinfo>
+                    <page>
+                    <title>Main Page</title>
+                    <id>15580374</id>
+                    <restrictions>edit=sysop:move=sysop</restrictions> (?)
+                    <revision>
+                        <id>418009832</id>
+                        <timestamp>2011-03-09T19:57:06Z</timestamp>
+                        <contributor>
+                    """
+                    # offset is OK in this wiki, merge with the previous chunk
+                    # of this page history and continue
+                    try:
+                        xml2 = xml2.split("</page>")[0]
+                        yield '  <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                    except MemoryError:
+                        sys.stderr.write("Page's history exceeds our memory, halving limit.\n")
+                        data['limit'] = data['limit'] / 2
+                        continue
+                    xml = xml2
+                    numedits += len(re.findall(r_timestamp, xml))
+            else:
+                data['offset'] = ''  # no more edits in this page history
+    yield "</page>\n"
+
+    if verbose:
+        if numedits == 1:
+            sys.stderr.write(' %s, 1 edit\n' % (pagetitle))
+        else:
+            sys.stderr.write(' %s, %d edits\n' % (pagetitle, numedits))
+
+def mwGetXMLPageCore(config={}, data={}):
+    """ Returns an XML chunk containing data['limit'] revisions (or current only), ending in </mediawiki>
+        if retrieving data['limit'] revisions fails, returns a current-only version
+        if all fail, returns an empty string
+    """
+
+    xml = ''
+    cretries = 0
+    maxseconds = 100  # max seconds to wait in a single sleep
+    maxretries = config['retries']  # x retries and exit
+    increment = 20  # increment seconds every retry
+
+    while not re.search(r'</mediawiki>', xml):
+        if cretries > 0 and cretries < maxretries:
+            wait = increment * cretries < maxseconds and increment * \
+                cretries or maxseconds  # incremental until maxseconds
+            sys.stderr.write(' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...\n' % (cretries, data['pages'], wait))
+            time.sleep(wait)
+            # reducing server load requesting smallest chunks (if curonly then
+            # limit = 1 from mother function)
+            if data['limit'] > 1:
+                data['limit'] = data['limit'] / 2  # half
+        if cretries >= maxretries:
+            sys.stderr.write(' We have retried %d times\n' % (cretries))
+            sys.stderr.write(' MediaWiki error for "%s", probably network error...\n' % (data['pages']))
+            # If it's not already what we tried: our last chance, preserve only the last revision...
+            # config['curonly'] means that the whole dump is configured to save only the last revision,
+            # data['curonly'] should mean that we've already tried this
+            # fallback, because it's set by the following if and passed to
+            # mwGetXMLPageCore
+            if not config['curonly'] and not 'curonly' in data:
+                sys.stderr.write(' Trying to save only the last revision for this page...\n')
+                data['curonly'] = 1
+                logerror(
+                    config=config,
+                    text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' %
+                    (data['pages'])
+                )
+                return mwGetXMLPageCore(config=config, data=data)
+            else:
+                sys.stderr.write(' Saving in error log, skipping...\n')
+                logerror(
+                    config=config,
+                    text='Error while retrieving the last revision of "%s". Skipping.' %
+                    (data['pages']))
+                raise ExportAbortedError(config['index'])
+                return ''  # empty xml
+        # FIXME: handle HTTP errors here
+        try:
+            r = wikiteam.getURL(url=config['index'], data=data)
+            # handleStatusCode(r)
+            # r = fixBOM(r)
+            xml = wikiteam.fixBOM(r)
+        except:
+            sys.stderr.write(' Connection error\n')
+            xml = ''
+        cretries += 1
+
+    return xml
+
 def main():
     pass
diff --git a/wikiteam/wikiteam.py b/wikiteam/wikiteam.py
index 82c612a..d7c66ec 100644
--- a/wikiteam/wikiteam.py
+++ b/wikiteam/wikiteam.py
@@ -32,13 +32,19 @@ import urllib
 
 __version__ = "0.3.1"
 
+"""
+Stuff to check whether it works properly, or re-add if needed:
+* fixBOM
+* sessions
+"""
+
 def avoidWikimediaProjects(config={}):
     """ Skip Wikimedia projects and redirect to the dumps website """
 
     # notice about wikipedia dumps
     if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['wiki']):
-        sys.stderr.write('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!')
-        sys.stderr.write('Download Wikimedia dumps from https://dumps.wikimedia.org')
+        sys.stderr.write('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\n')
+        sys.stderr.write('Download Wikimedia dumps from https://dumps.wikimedia.org\n')
         """if not other['force']:
             print 'Thanks!'
             sys.exit()"""
@@ -61,7 +67,7 @@ def createNewDump(config={}):
         import wikispaces
         wikispaces.wsCreateNewDump(config=config)
     else:
-        sys.stderr.write("Wikiengine %s not supported. Exiting." % (config['wikiengine']))
+        sys.stderr.write("Wikiengine %s not supported. Exiting.\n" % (config['wikiengine']))
 
 def createDumpPath(config={}):
     # creating path or resuming if desired
@@ -70,7 +76,7 @@
     originalpath = config['path']
     # do not enter if resume is requested from begining
     while not config['other']['resume'] and os.path.isdir(config['path']):
-        sys.stderr.write('\nWarning!: "%s" path exists' % (config['path']))
+        sys.stderr.write('\nWarning!: "%s" path exists\n' % (config['path']))
         reply = ''
         while reply.lower() not in ['yes', 'y', 'no', 'n']:
             reply = input(
@@ -80,16 +86,16 @@
                     config['other']['configfilename']))
         if reply.lower() in ['yes', 'y']:
            if not os.path.isfile('%s/%s' % (config['path'], config['other']['configfilename'])):
-                sys.stderr.write('No config file found. I can\'t resume. Aborting.')
+                sys.stderr.write('No config file found. I can\'t resume. Aborting.\n')
                sys.exit()
-            sys.stderr.write('You have selected: YES')
+            sys.stderr.write('You have selected: YES\n')
            config['other']['resume'] = True
            break
        elif reply.lower() in ['no', 'n']:
-            sys.stderr.write('You have selected: NO')
+            sys.stderr.write('You have selected: NO\n')
            config['other']['resume'] = False
        config['path'] = '%s-%d' % (originalpath, c)
-        sys.stderr.write('Trying to use path "%s"...' % (config['path']))
+        sys.stderr.write('Trying to use path "%s"...\n' % (config['path']))
        c += 1
    return config
@@ -270,21 +276,21 @@
 
     # Not wiki? Exit
     if not args.wiki:
-        sys.stderr.write('ERROR: Provide a URL to a wiki')
+        sys.stderr.write('ERROR: Provide a URL to a wiki\n')
         parser.print_help()
         sys.exit(1)
 
     # Don't mix download params and meta info params
     if (args.pages or args.images) and \
             (args.get_api or args.get_index or args.get_page_titles or args.get_image_names or args.get_wiki_engine):
-        sys.stderr.write('ERROR: Don\'t mix download params and meta info params')
+        sys.stderr.write('ERROR: Don\'t mix download params and meta info params\n')
         parser.print_help()
         sys.exit(1)
 
     # No download params and no meta info params? Exit
     if (not args.pages and not args.images) and \
             (not args.get_api and not args.get_index and not args.get_page_titles and not args.get_image_names and not args.get_wiki_engine):
-        sys.stderr.write('ERROR: Use at least one download param or meta info param')
+        sys.stderr.write('ERROR: Use at least one download param or meta info param\n')
         parser.print_help()
         sys.exit(1)
 
@@ -292,11 +298,11 @@
     cj = cookielib.MozillaCookieJar()
     if args.cookies:
         cj.load(args.cookies)
-        sys.stderr.write('Using cookies from %s' % args.cookies)
+        sys.stderr.write('Using cookies from %s\n' % args.cookies)
 
     # check user and pass (one requires both)
     if (args.user and not args.password) or (args.password and not args.user):
-        sys.stderr.write('ERROR: Both --user and --pass are required for authentication.')
+        sys.stderr.write('ERROR: Both --user and --pass are required for authentication.\n')
         parser.print_help()
         sys.exit(1)
 
@@ -338,7 +344,7 @@
         if re.search(
                 r'[^\d, \-]',
                 args.namespaces) and args.namespaces.lower() != 'all':
-            sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas")
+            sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas\n")
             sys.exit()
         else:
             ns = re.sub(' ', '', args.namespaces)
@@ -350,12 +356,12 @@
     # Process namespace exclusions
     if args.exnamespaces:
         if re.search(r'[^\d, \-]', args.exnamespaces):
-            sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas")
+            sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas\n")
             sys.exit(1)
         else:
             ns = re.sub(' ', '', args.exnamespaces)
             if ns.lower() == 'all':
-                sys.stderr.write('You cannot exclude all namespaces.')
+                sys.stderr.write('You cannot exclude all namespaces.\n')
                 sys.exit(1)
             else:
                 exnamespaces = [int(i) for i in ns.split(',')]
@@ -394,15 +400,25 @@
     # Get ready special variables (API for MediWiki, etc)
     if config['wikiengine'] == 'mediawiki':
         import mediawiki
+        config['mwexport'] = 'Special:Export'
         if not args.mwapi:
             config['mwapi'] = mediawiki.mwGetAPI(config=config)
             if not config['mwapi']:
-                sys.stderr.write('ERROR: Provide a URL to API')
+                sys.stderr.write('ERROR: Provide a URL to the API\n')
                 sys.exit(1)
+            else:
+                data = {
+                    'action': 'query',
+                    'meta': 'siteinfo',
+                    'siprop': 'namespaces',
+                    'format': 'json'}
+                r = getURL(config['mwapi'], data=data)
+                config['mwexport'] = getJSON(r)['query']['namespaces']['-1']['*'] \
+                    + ':Export'
         if not args.mwindex:
             config['mwindex'] = mediawiki.mwGetIndex(config=config)
             if not config['mwindex']:
-                sys.stderr.write('ERROR: Provide a URL to Index.php')
+                sys.stderr.write('ERROR: Provide a URL to Index.php\n')
                 sys.exit(1)
     elif wikiengine == 'wikispaces':
         import wikispaces
@@ -415,14 +431,14 @@
     return config
 
 def getURL(url='', data=None):
+    # FIXME: perhaps passing config here would let us check whether a session
+    # is defined and use it when appropriate
     html = ''
-    req = urllib.request.Request(url, headers={ 'User-Agent': 'Mozilla/5.0' })
-    html = urllib.request.urlopen(req, data=data).read().decode().strip()
     try:
+        if data:
+            data = urllib.parse.urlencode(data).encode()
         req = urllib.request.Request(url, headers={ 'User-Agent': 'Mozilla/5.0' })
         html = urllib.request.urlopen(req, data=data).read().decode().strip()
     except:
-        sys.stderr.write("Error while retrieving URL", url)
+        sys.stderr.write("Error while retrieving URL: %s\n" % url)
        sys.exit()
     return html
 
@@ -517,36 +533,42 @@ def getWikiEngine(url=''):
 
     return wikiengine.lower()
 
+def fixBOM(r):
+    """ Strip the Unicode BOM, if present (getURL() returns a decoded str) """
+    if r.startswith(u'\ufeff'):
+        r = r[len(u'\ufeff'):]
+    return r
+
 def handleStatusCode(response):
     statuscode = response.status_code
     if statuscode >= 200 and statuscode < 300:
         return
 
-    sys.stderr.write("HTTP Error %d." % statuscode)
+    sys.stderr.write("HTTP Error %d.\n" % statuscode)
     if statuscode >= 300 and statuscode < 400:
-        sys.stderr.write("Redirect should happen automatically: please report this as a bug.")
-        sys.stderr.write(response.url)
+        sys.stderr.write("Redirect should happen automatically: please report this as a bug.\n")
+        sys.stderr.write('%s\n' % response.url)
     elif statuscode == 400:
-        sys.stderr.write("Bad Request: The wiki may be malfunctioning.")
-        sys.stderr.write("Please try again later.")
-        sys.stderr.write(response.url)
+        sys.stderr.write("Bad Request: The wiki may be malfunctioning.\n")
+        sys.stderr.write("Please try again later.\n")
+        sys.stderr.write('%s\n' % response.url)
         sys.exit(1)
     elif statuscode == 401 or statuscode == 403:
-        sys.stderr.write("Authentication required.")
-        sys.stderr.write("Please use --userpass.")
-        sys.stderr.write(response.url)
+        sys.stderr.write("Authentication required.\n")
+        sys.stderr.write("Please use --userpass.\n")
+        sys.stderr.write('%s\n' % response.url)
     elif statuscode == 404:
-        sys.stderr.write("Not found. Is Special:Export enabled for this wiki?")
-        sys.stderr.write(response.url)
+        sys.stderr.write("Not found. Is Special:Export enabled for this wiki?\n")
+        sys.stderr.write('%s\n' % response.url)
         sys.exit(1)
     elif statuscode == 429 or (statuscode >= 500 and statuscode < 600):
-        sys.stderr.write("Server error, max retries exceeded.")
-        sys.stderr.write("Please resume the dump later.")
-        sys.stderr.write(response.url)
+        sys.stderr.write("Server error, max retries exceeded.\n")
+        sys.stderr.write("Please resume the dump later.\n")
+        sys.stderr.write('%s\n' % response.url)
         sys.exit(1)
 
@@ -557,7 +579,7 @@ def resumePreviousDump(config={}):
         import wikispaces
         wikispaces.wsResumePreviousDump(config=config)
     else:
-        sys.stderr.write("Wikiengine %s not supported. Exiting." % (config['wikiengine']))
+        sys.stderr.write("Wikiengine %s not supported. Exiting.\n" % (config['wikiengine']))
 
 def saveConfig(config={}):
     """ Save config file """
@@ -566,19 +588,48 @@ def saveConfig(config={}):
     config2 = config.copy()
     config2['other'] = {}
     with open('%s/%s' % (config['path'], config['other']['configfilename']), 'w') as outfile:
-        sys.stderr.write('Saving config file...')
+        sys.stderr.write('Saving config file...\n')
         try: #str
             cPickle.dump(config2, outfile)
         except: #bytes
             with open('%s/%s' % (config['path'], config['other']['configfilename']), 'wb') as outfile:
                 cPickle.dump(config2, outfile)
 
+def savePageTitles(config={}, pagetitles=None):
+    pagetitlesfilename = '%s-%s-titles.txt' % (
+        domain2prefix(config=config), config['date'])
+    with open('%s/%s' % (config['path'], pagetitlesfilename), 'wt') as f:
+        for pagetitle in pagetitles:
+            output = '%s\n' % (pagetitle)
+            f.write(output)
+
+        # TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
+        # main namespace and widget namespace.
+        # We can use sort -u in UNIX, but is it worth it?
+        f.write('--END--\n')
+    # the file is closed automatically by the "with" block
+    sys.stderr.write('Page titles saved at... %s\n' % (pagetitlesfilename))
+
+def undoHTMLEntities(text=''):
+    """ Undo some HTML codes """
+
+    # I guess only < > & " ' need conversion
+    # http://www.w3schools.com/html/html_entities.asp
+    text = re.sub('&lt;', '<', text)
+    text = re.sub('&gt;', '>', text)
+    text = re.sub('&amp;', '&', text)
+    text = re.sub('&quot;', '"', text)
+    text = re.sub('&#039;', '\'', text)
+
+    return text
+
 def welcome():
     """ Print opening message """
 
     message = """
 #########################################################################
 # Welcome to WikiTeam's tools v%s (GPL v3)                              #
+# Tools for downloading and preserving wikis                            #
 # More info at: https://github.com/WikiTeam/wikiteam                    #
 #########################################################################
@@ -605,10 +656,10 @@ def loadConfig(config={}):
     try:
         with open('%s/%s' % (config['path'], config['other']['configfilename']), 'r') as infile:
-            sys.stderr.write('Loading config file...')
+            sys.stderr.write('Loading config file...\n')
             config = cPickle.load(infile)
     except:
-        sys.stderr.write('ERROR: There is no config file. we can\'t resume. Start a new dump.')
+        sys.stderr.write('ERROR: There is no config file. We can\'t resume. Start a new dump.\n')
         sys.exit()
 
     return config
@@ -616,17 +667,16 @@
 def main(params=[]):
     """ Main function """
 
+    welcome()
     config = getParameters(params=params)
     avoidWikimediaProjects(config=config)
     config = createDumpPath(config=config)
     if config['other']['resume']:
         # Resume dump
-        welcome()
         config = loadConfig(config=config)
         resumePreviousDump(config=config)
     elif config['pages'] or config['images'] or config['logs']:
         # New dump
-        welcome()
         os.mkdir(config['path'])
         saveConfig(config=config)
         createNewDump(config=config)
@@ -639,7 +689,7 @@ def main(params=[]):
     elif config['metainfo'] == 'get_page_titles':
         printPageTitles(config=config)
     elif config['metainfo'] == 'get_image_names':
-        printGetImageNames(config=config))
+        printGetImageNames(config=config)
     elif config['metainfo'] == 'get_wiki_engine':
         sys.stdout.write(config['wikiengine'])
     sys.exit()