#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2016 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com

import json
import os
import re
import sys
import time
import urllib.parse

import wikiteam


class PageMissingError(Exception):
    """ Raised when Special:Export does not return a <page> element for a title """

    def __init__(self, pagetitle, xml):
        self.pagetitle = pagetitle
        self.xml = xml

    def __str__(self):
        return "page '%s' not found" % self.pagetitle


class ExportAbortedError(Exception):
    """ Raised when Special:Export returns nothing usable for a wiki """

    def __init__(self, index):
        self.index = index

    def __str__(self):
        return "Export from '%s' did not return anything." % self.index


def mwCleanHTML(raw=''):
    """ Extract only the real wiki content and remove rubbish """
    """ This function is ONLY used to retrieve page titles and file names when no API is available """
    """ DO NOT use this function to extract page content """
    # different "tags" used by different MediaWiki versions to mark where
    # starts and ends content
    if re.search('<!-- bodytext -->', raw):
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
        raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0]
    elif re.search('<!-- Begin Content Area -->', raw):
        raw = raw.split('<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
    elif re.search('<!-- content -->', raw):
        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
    elif re.search('<body class=', raw):
        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
    else:
        sys.stderr.write(raw[:250])
        sys.stderr.write('This wiki doesn\'t use marks to split content\n')
        sys.exit()
    return raw


def mwCleanXML(xml=''):
    """ Trim redundant info """
    # do not touch XML codification, leave AS IS
    if re.search(r'</siteinfo>\n', xml):
        xml = xml.split('</siteinfo>\n')[1]
    if re.search(r'</mediawiki>', xml):
        xml = xml.split('</mediawiki>')[0]
    return xml


def mwCreateNewDump(config={}):
    sys.stderr.write('Trying to generate a new dump into a new directory...')
    if config['pages']:
        pagetitles = mwGetPageTitles(config=config)
        wikiteam.savePageTitles(config=config, pagetitles=pagetitles)
        mwGeneratePageDump(config=config, pagetitles=pagetitles)
        mwCheckXMLIntegrity(config=config, pagetitles=pagetitles)
    if config['images']:
        imagenames = mwGetImageNames(config=config)
        mwSaveImageNames(config=config, imagenames=imagenames)
        mwGenerateImageDump(config=config, imagenames=imagenames)
    if config['logs']:
        mwSaveLogs(config=config)
    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)


def mwCurateImageURL(config={}, url=''):
    """ Returns an absolute URL for an image, adding the domain if missing """

    if 'mwindex' in config and config['mwindex']:
        # remove from :// (http or https) until the first / after domain
        domainalone = config['mwindex'].split('://')[0] + '://' + config['mwindex'].split('://')[1].split('/')[0]
    elif 'mwapi' in config and config['mwapi']:
        domainalone = config['mwapi'].split('://')[0] + '://' + config['mwapi'].split('://')[1].split('/')[0]
    else:
        sys.stderr.write('ERROR: no index nor API')
        sys.exit()

    if url.startswith('//'):  # Orain wikifarm returns URLs starting with //
        url = '%s:%s' % (domainalone.split('://')[0], url)
    # is it a relative URL?
    elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')):
        if url[0] == '/':  # slash is added later
            url = url[1:]
        # concat http(s) + domain + relative url
        url = '%s/%s' % (domainalone, url)
    url = wikiteam.undoHTMLEntities(text=url)
    # url = urllib.parse.unquote(url)  # do not use unquote with url, it breaks
    # some urls with odd chars
    url = re.sub(' ', '_', url)
    return url
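# Illustrative sketch (not part of the original module; the config values and
# URLs below are hypothetical): mwCurateImageURL() turns protocol-relative and
# relative image links into absolute URLs and replaces spaces with underscores.
#
#   config = {'mwindex': 'http://wiki.example.org/w/index.php'}
#   mwCurateImageURL(config=config, url='//wiki.example.org/images/a/ab/Foo bar.png')
#   # -> 'http://wiki.example.org/images/a/ab/Foo_bar.png'
#   mwCurateImageURL(config=config, url='/images/a/ab/Foo bar.png')
#   # -> 'http://wiki.example.org/images/a/ab/Foo_bar.png'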
def mwGeneratePageDump(config={}, pagetitles=None, start=None):
    """ Generates a XML dump for page titles """

    sys.stderr.write('Retrieving XML for every page from "%s"' % (start or 'start'))
    header = mwGetXMLHeader(config=config)
    footer = '</mediawiki>\n'  # new line at the end
    xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(config=config),
                                    config['date'],
                                    config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True
    if start:
        sys.stderr.write("Removing the last chunk of past XML dump: it is probably incomplete.\n")
        for i in wikiteam.reverseReadline('%s/%s' % (config['path'], xmlfilename), truncate=True):
            pass
    else:
        # requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.close()

    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for pagetitle in pagetitles:
        if not pagetitle.strip():
            continue
        if pagetitle == start:  # start downloading from start, included
            lock = False
        if lock:
            continue
        wikiteam.delay(config=config)
        if c % 10 == 0:
            sys.stderr.write('Downloaded %d pages\n' % (c))
        try:
            for xml in mwGetXMLPage(config=config, pagetitle=pagetitle):
                xml = mwCleanXML(xml=xml)
                xmlfile.write(xml)
        except PageMissingError:
            logerror(
                config=config,
                text='The page "%s" was missing in the wiki (probably deleted)' % (pagetitle))
        # here, XML is a correct chunk or
        # an empty string due to a deleted page (logged in errors log) or
        # an empty string due to an error while retrieving the page from server
        # (logged in errors log)
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    sys.stderr.write('XML dump saved at... %s\n' % (xmlfilename))


def mwGetAPI(config={}):
    """ Returns API for a MediaWiki wiki, if available """

    api = ''
    html = wikiteam.getURL(url=config['wiki'])
    m = re.findall(
        r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
        html)
    if m:
        api = m[0]
        if api.startswith('//'):  # gentoo wiki and others
            api = config['wiki'].split('//')[0] + api
    return api


def mwGetImageNames(config={}):
    """ Get list of image names """

    sys.stderr.write('Retrieving image filenames\n')
    imagenames = []
    if 'mwapi' in config and config['mwapi']:
        imagenames = mwGetImageNamesAPI(config=config)
    elif 'mwindex' in config and config['mwindex']:
        imagenames = mwGetImageNamesScraper(config=config)
    # imagenames = list(set(imagenames))  # it is a list of lists
    imagenames.sort()
    sys.stderr.write('%d image names loaded\n' % (len(imagenames)))
    return imagenames


def mwGetImageNamesAPI(config={}):
    """ Retrieve file list: filename, url, uploader """

    oldAPI = False
    aifrom = '!'
    imagenames = []
    while aifrom:
        sys.stderr.write('.')  # progress
        data = {
            'action': 'query',
            'list': 'allimages',
            'aiprop': 'url|user',
            'aifrom': aifrom,
            'format': 'json',
            'ailimit': 500}
        # FIXME Handle HTTP Errors HERE
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        # handleStatusCode(r)
        jsonimages = wikiteam.getJSON(r)
        wikiteam.delay(config=config)

        if 'query' in jsonimages:
            aifrom = ''
            if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']:
                if 'aicontinue' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aicontinue']
                elif 'aifrom' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aifrom']
            elif 'continue' in jsonimages:
                if 'aicontinue' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aicontinue']
                elif 'aifrom' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aifrom']
            # sys.stderr.write(aifrom)

            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = mwCurateImageURL(config=config, url=url)
                # encoding to ascii is needed to work around this horrible bug:
                # http://bugs.python.org/issue8136
                if 'mwapi' in config and '.wikia.com' in config['mwapi']:
                    # to avoid latest?cb=20120816112532 in filenames
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')
                else:
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
                uploader = re.sub('_', ' ', image['user'])
                imagenames.append([filename, url, uploader])
        else:
            oldAPI = True
            break

    if oldAPI:
        gapfrom = '!'
        imagenames = []
        while gapfrom:
            sys.stderr.write('.')  # progress
            # Some old APIs don't have the allimages query
            # In this case use allpages (in ns=6) as generator for imageinfo
            # Example:
            # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
            # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            data = {
                'action': 'query',
                'generator': 'allpages',
                'gapnamespace': 6,
                'gaplimit': 500,
                'gapfrom': gapfrom,
                'prop': 'imageinfo',
                'iiprop': 'user|url',
                'format': 'json'}
            # FIXME Handle HTTP Errors HERE
            r = wikiteam.getURL(url=config['mwapi'], data=data)
            # handleStatusCode(r)
            jsonimages = wikiteam.getJSON(r)
            wikiteam.delay(config=config)

            if 'query' in jsonimages:
                gapfrom = ''
                if 'query-continue' in jsonimages and 'allpages' in jsonimages['query-continue']:
                    if 'gapfrom' in jsonimages['query-continue']['allpages']:
                        gapfrom = jsonimages['query-continue']['allpages']['gapfrom']
                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = mwCurateImageURL(config=config, url=url)
                    tmp_filename = ':'.join(props['title'].split(':')[1:])
                    filename = re.sub('_', ' ', tmp_filename)
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    imagenames.append([filename, url, uploader])
            else:
                # if the API doesn't return query data, then we're done
                break

    if len(imagenames) == 1:
        sys.stderr.write(' Found 1 image')
    else:
        sys.stderr.write(' Found %d images' % (len(imagenames)))

    return imagenames
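# Illustrative sketch (not part of the original module; the values are made up):
# the two continuation styles handled above. Older MediaWiki releases answer the
# allimages query with a 'query-continue' block, newer ones (1.21+) with
# 'continue'; in both cases the loop re-sends the returned value as the next
# starting point until the server stops returning one.
#
#   {"query-continue": {"allimages": {"aicontinue": "Foo.png"}}, "query": {...}}
#   {"continue": {"aicontinue": "Foo.png", "continue": "-||"}, "query": {...}}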
def mwGetImageNamesScraper(config={}):
    """ Retrieve file list: filename, url, uploader """

    # Next-page link on Special:Imagelist, e.g. ...&amp;offset=20111231235959&amp;...
    r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&'
    imagenames = []
    offset = '29990101000000'  # january 1, 2999
    limit = 5000
    retries = config['retries']
    while offset:
        # 5000 overloads some servers, but it is needed for sites like this with
        # no next links
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        data = {
            'title': 'Special:Imagelist',
            'limit': limit,
            'offset': offset}
        raw = wikiteam.getURL(url=config['index'], data=data)
        # handleStatusCode(r)
        wikiteam.delay(config=config)
        # delicate wiki
        if re.search(r'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
            if limit > 10:
                sys.stderr.write('Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit))
                limit = limit // 10
                continue
            elif retries > 0:  # waste retries, then exit
                retries -= 1
                sys.stderr.write('Retrying...')
                continue
            else:
                sys.stderr.write('No more retries, exit...')
                break

        raw = mwCleanHTML(raw)
        # archiveteam 1.15.1: Yahoovideo.jpg (file)
        # wikanda 1.15.5: Fernandocg
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # wikijuegos 1.9.5
        # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
        # mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # gentoowiki 1.18
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
        r_images5 = (
            r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+?)</a>\s*\(<a href="(?P<url>[^>]+?)">[^<]*?</a>\s*\)\s*</td>\s*'
            '<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
            '<td class="TablePager_col_img_size">[^<]*?</td>\s*'
            '<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')

        # Select the regexp that returns more results
        regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
        count = 0
        i = 0
        regexp_best = 0
        for regexp in regexps:
            if len(re.findall(regexp, raw)) > count:
                count = len(re.findall(regexp, raw))
                regexp_best = i
            i += 1
        m = re.compile(regexps[regexp_best]).finditer(raw)

        # Iter the image results
        for i in m:
            url = i.group('url')
            url = mwCurateImageURL(config=config, url=url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = wikiteam.undoHTMLEntities(text=filename)
            filename = urllib.parse.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = wikiteam.undoHTMLEntities(text=uploader)
            uploader = urllib.parse.unquote(uploader)
            imagenames.append([filename, url, uploader])

        if re.search(r_next, raw):
            new_offset = re.findall(r_next, raw)[0]
            # Avoid infinite loop
            if new_offset != offset:
                offset = new_offset
                retries += 5  # add more retries if we got a page with offset
            else:
                offset = ''
        else:
            offset = ''

    if (len(imagenames) == 1):
        sys.stderr.write(' Found 1 image')
    else:
        sys.stderr.write(' Found %d images' % (len(imagenames)))

    imagenames.sort()
    return imagenames


def mwGetIndex(config={}):
    """ Returns Index.php for a MediaWiki wiki, if available """

    if config['mwapi']:
        mwapi = config['mwapi']
    else:
        mwapi = mwGetAPI(config=config)
    index = ''
    html = wikiteam.getURL(url=config['wiki'])
    m = re.findall(r'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
    if m:
        index = m[0]
    else:
        m = re.findall(r'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
        if m:
            index = m[0]
    if index:
        if index.startswith('/'):
            index = '/'.join(mwapi.split('/')[:-1]) + '/' + index.split('/')[-1]
    else:
        index = '/'.join(mwapi.split('/')[:-1]) + '/index.php'
    return index
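# Illustrative sketch (not part of the original module; the URLs are
# hypothetical): mwGetAPI() looks for the RSD autodiscovery link that MediaWiki
# emits in every page head, e.g.
#
#   <link rel="EditURI" type="application/rsd+xml"
#         href="//wiki.example.org/w/api.php?action=rsd"/>
#
#   mwGetAPI(config={'wiki': 'http://wiki.example.org/wiki/Main_Page'})
#   # -> 'http://wiki.example.org/w/api.php'
#
# mwGetIndex() then derives index.php from the view-source/history tab links,
# falling back to the sibling path next to api.php when no usable link is found.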
def mwGetNamespacesScraper(config={}):
    """ Hackishly gets the list of namespace names and ids from the dropdown in the HTML of Special:Allpages """
    """ Function called if no API is available """

    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        raw = wikiteam.getURL(url=config['index'], data={'title': 'Special:Allpages'})
        wikiteam.delay(config=config)

        # [^>]*? to include selected="selected"
        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames


def mwGetPageTitles(config={}):
    """ Get list of page titles """
    # http://en.wikipedia.org/wiki/Special:AllPages
    # http://archiveteam.org/index.php?title=Special:AllPages
    # http://www.wikanda.es/wiki/Especial:Todas

    sys.stderr.write('Loading page titles from namespaces = %s\n' % (','.join([str(i) for i in config['namespaces']]) or 'None'))
    sys.stderr.write('Excluding titles from namespaces = %s\n' % (','.join([str(i) for i in config['exnamespaces']]) or 'None'))

    if 'mwapi' in config and config['mwapi']:
        for pagetitle in mwGetPageTitlesAPI(config=config):
            yield pagetitle
    elif 'mwindex' in config and config['mwindex']:
        for pagetitle in mwGetPageTitlesScraper(config=config):
            yield pagetitle


def mwGetPageTitlesAPI(config={}):
    """ Uses the API to get the list of page titles """

    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesAPI(config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            sys.stderr.write(' Skipping namespace = %d\n' % (namespace))
            continue

        c = 0
        sys.stderr.write(' Retrieving page titles in namespace %d\n' % (namespace))
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            data = {
                'action': 'query',
                'list': 'allpages',
                'apnamespace': namespace,
                'apfrom': apfrom.encode('utf-8'),
                'format': 'json',
                'aplimit': 500}

            retryCount = 0
            while retryCount < config["retries"]:
                try:
                    r = wikiteam.getURL(url=config['mwapi'], data=data)
                    break
                except ConnectionError as err:
                    sys.stderr.write("Connection error: %s\n" % (str(err),))
                    retryCount += 1
                    time.sleep(20)
            # wikiteam.handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = wikiteam.getJSON(r)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            elif 'continue' in jsontitles:
                if 'apcontinue' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apcontinue']
                elif 'apfrom' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apfrom']
            # sys.stderr.write(apfrom)
            # sys.stderr.write(jsontitles)
            allpages = jsontitles['query']['allpages']
            # Hack for old versions of MediaWiki API where result is dict
            if isinstance(allpages, dict):
                allpages = allpages.values()
            for page in allpages:
                yield page['title']
            c += len(allpages)

            if len(pagetitles) != len(set(pagetitles)):
                # Are we in a loop? Server returning dupes, stop it
                sys.stderr.write('Probably a loop, finishing\n')
                apfrom = ''

        wikiteam.delay(config=config)
        sys.stderr.write(' %d titles retrieved in namespace %d\n' % (c, namespace))
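# Illustrative sketch (not part of the original module): mwGetPageTitles() is a
# generator, so titles can be streamed straight to a titles file without holding
# the whole list in memory. 'config' is assumed to be a dict prepared by the
# caller; the file name below is hypothetical. The '--END--' sentinel is what
# mwReadPageTitles()/mwResumePreviousDump() later use to detect a complete list.
#
#   with open('example.org-20160101-titles.txt', 'w') as titlesfile:
#       for pagetitle in mwGetPageTitles(config=config):
#           titlesfile.write(pagetitle + '\n')
#       titlesfile.write('--END--\n')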
def mwGetPageTitlesScraper(config={}):
    """ Scrape list of page titles from Special:Allpages """

    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesScraper(config=config)
    for namespace in namespaces:
        sys.stderr.write(' Retrieving titles in namespace %s\n' % (namespace))
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        raw = wikiteam.getURL(url=url)
        raw = mwCleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        r_suballpages3 = r'&from=(?P<from>[^>]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        elif re.search(r_suballpages3, raw):
            r_suballpages = r_suballpages3
        else:
            pass  # perhaps no subpages

        # 3 is the current depth of English Wikipedia for Special:Allpages
        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            # load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')
                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['index'], namespace, fr, to)  # do not put urllib.quote in fr or to
                # fix: does this regexp fail to load all subpages, or does
                # r_title fail on this kind of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &namespace=\d, sometimes happens
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
                        config['index'], name, namespace)
                elif r_suballpages == r_suballpages3:
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                        config['index'], name, namespace)

                if name not in checked_suballpages:
                    # to avoid reloading dupe subpage links
                    checked_suballpages.append(name)
                    wikiteam.delay(config=config)
                    raw2 = wikiteam.getURL(url=url)
                    raw2 = mwCleanHTML(raw2)
                    rawacum += raw2  # merge it after junk has been removed
                    sys.stderr.write(' Reading %s, %s bytes, %d subpages, %d pages' % (name, len(raw2),
                        len(re.findall(r_suballpages, raw2)),
                        len(re.findall(r_title, raw2))))
                wikiteam.delay(config=config)
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = wikiteam.undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if t not in pagetitles:
                    pagetitles.append(t)
                    c += 1
        sys.stderr.write(' %d titles retrieved in the namespace %d\n' % (c, namespace))
    return pagetitles
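# Illustrative sketch (not part of the original module; the titles are made up):
# the three Special:Allpages pagination link shapes that r_suballpages1/2/3
# above try to match, roughly newest layout first:
#
#   ...&from=Aardvark&to=Budget">Aardvark to Budget</a>
#   ...Special:Allpages/Aardvark">Aardvark</a>
#   ...&from=Aardvark" title="Special:AllPages">...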
def mwGetXMLHeader(config={}):
    """ Retrieve a random page to extract XML header (namespace info, etc) """

    pagetitle = 'Main_Page'
    try:
        xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
    except PageMissingError as pme:
        # The <page> does not exist. Not a problem, if we get the <siteinfo>.
        xml = pme.xml
    except ExportAbortedError:
        # Issue 26: Account for missing "Special" namespace.
        # Hope the canonical special name has not been removed.
        # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
        try:
            if config['mwapi']:
                sys.stderr.write("Trying the local name for the Special namespace instead\n")
                xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
        except PageMissingError as pme:
            xml = pme.xml
        except ExportAbortedError:
            pass

    header = xml.split('</mediawiki>')[0]
    if not re.match(r"\s*<mediawiki", xml):
        sys.stderr.write('XML export on this wiki is broken, quitting.\n')
        logerror('XML export on this wiki is broken, quitting.')
        sys.exit()
    return header
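# Illustrative sketch (not part of the original module): the Special:Export POST
# data that mwGetXMLPage() below builds for a full-history dump versus a
# current-only dump. The page title is hypothetical, and config['mwexport'] is
# assumed to hold the local name of the Special:Export page.
#
#   full history:  {'title': 'Special:Export', 'pages': 'Main_Page',
#                   'action': 'submit', 'offset': '1', 'limit': 1000}
#   current only:  {'title': 'Special:Export', 'pages': 'Main_Page',
#                   'action': 'submit', 'curonly': 1, 'limit': 1}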
def mwGetXMLPage(config={}, pagetitle='', verbose=True):
    """ Get the full history (or current only) of a page """
    # if server errors occur while retrieving the full page history, it may
    # return [oldest OK versions] + last version, excluding middle revisions,
    # so it would be partially truncated
    # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F

    limit = 1000
    truncated = False
    pagetitle_ = re.sub(' ', '_', pagetitle)
    # do not convert & into %26, pagetitle_ = re.sub('&', '%26', pagetitle_)
    data = {'title': config['mwexport'], 'pages': pagetitle_, 'action': 'submit'}
    if config['curonly']:
        data['curonly'] = 1
        data['limit'] = 1
    else:
        data['offset'] = '1'  # 1 always < 2000s
        data['limit'] = limit
    # in other case, do not set data['templates']
    if 'templates' in config and config['templates']:
        # fix, what is this option for?
        data['templates'] = 1

    xml = mwGetXMLPageCore(config=config, data=data)
    if not xml:
        raise ExportAbortedError(config['index'])
    if not "</page>" in xml:
        raise PageMissingError(data['title'], xml)
    else:
        # strip these sha1 sums which keep showing up in the export and
        # which are invalid for the XML schema (they only apply to
        # revisions)
        xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
        xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)

    yield xml.split("</page>")[0]

    # if complete history, check if this page history has > limit edits,
    # if so, retrieve all revisions using offset if available
    # else, warning about Special:Export truncating large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
    numedits = 0
    numedits += len(re.findall(r_timestamp, xml))

    # search for timestamps in xml to avoid analysing empty pages like
    # Special:Allpages and the random one
    if not config['curonly'] and re.search(r_timestamp, xml):
        while not truncated and data['offset']:  # next chunk
            # get the last timestamp from the acum XML
            # assuming history is sorted chronologically
            data['offset'] = re.findall(r_timestamp, xml)[-1]
            try:
                xml2 = mwGetXMLPageCore(config=config, data=data)
            except MemoryError:
                sys.stderr.write("Page history exceeds our memory, halving limit.\n")
                data['limit'] = data['limit'] // 2
                continue

            # are there more edits in this next XML chunk or no <page></page>?
            if re.findall(r_timestamp, xml2):
                if re.findall(r_timestamp, xml2)[-1] == data['offset']:
                    # again the same XML, this wiki does not support params in
                    # Special:Export, offer complete XML up to X edits (usually
                    # 1000)
                    sys.stderr.write('ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated\n')
                    truncated = True
                    break
                else:
                    """    </namespaces>
                        </siteinfo>
                        <page>
                            <title>Main Page</title>
                            <id>15580374</id>
                            <restrictions>edit=sysop:move=sysop</restrictions> (?)
                            <revision>
                                <id>418009832</id>
                                <timestamp>2011-03-09T19:57:06Z</timestamp>
                                ...
                    """
                    # offset is OK in this wiki, merge with the previous chunk
                    # of this page history and continue
                    try:
                        xml2 = xml2.split("</page>")[0]
                        yield '  <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
                    except MemoryError:
                        sys.stderr.write("Page's history exceeds our memory, halving limit.\n")
                        data['limit'] = data['limit'] // 2
                        continue
                    xml = xml2
                    numedits += len(re.findall(r_timestamp, xml))
            else:
                data['offset'] = ''  # no more edits in this page history

    yield "</page>\n"

    if verbose:
        if numedits == 1:
            sys.stderr.write(' %s, 1 edit\n' % (pagetitle))
        else:
            sys.stderr.write(' %s, %d edits\n' % (pagetitle, numedits))
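# Illustrative sketch (not part of the original module; 'config' and the title
# are assumed): mwGetXMLPage() is a generator. It yields the first
# Special:Export chunk (everything up to, but not including, </page>), then one
# extra block of <revision> elements per follow-up request for long histories,
# and finally the closing "</page>\n", so the page element is simply the
# concatenation of the yielded pieces, mirroring what mwGeneratePageDump() does.
#
#   pagexml = ''
#   for chunk in mwGetXMLPage(config=config, pagetitle='Main Page'):
#       pagexml += mwCleanXML(xml=chunk)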
def mwGetXMLPageCore(config={}, data={}):
    """ Returns an XML string containing data['limit'] revisions (or current only), ending in </mediawiki>
        if retrieving data['limit'] revisions fails, returns current only version
        if all fail, returns empty string """

    xml = ''
    cretries = 0
    maxseconds = 100  # max seconds to wait in a single sleeping
    maxretries = config['retries']  # x retries and exit
    increment = 20  # increment seconds every retry

    while not re.search(r'</mediawiki>', xml):
        if cretries > 0 and cretries < maxretries:
            wait = increment * cretries < maxseconds and increment * cretries or maxseconds
            # incremental until maxseconds
            sys.stderr.write(' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...\n' % (cretries, data['pages'], wait))
            time.sleep(wait)
            # reducing server load requesting smallest chunks (if curonly then
            # limit = 1 from mother function)
            if data['limit'] > 1:
                data['limit'] = data['limit'] // 2  # half
        if cretries >= maxretries:
            sys.stderr.write(' We have retried %d times\n' % (cretries))
            sys.stderr.write(' MediaWiki error for "%s", probably network error...' % (data['pages']))
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last,
            # data['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # mwGetXMLPageCore
            if not config['curonly'] and not 'curonly' in data:
                sys.stderr.write(' Trying to save only the last revision for this page...\n')
                data['curonly'] = 1
                logerror(
                    config=config,
                    text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (data['pages']))
                return mwGetXMLPageCore(config=config, data=data)
            else:
                sys.stderr.write(' Saving in error log, skipping...\n')
                logerror(
                    config=config,
                    text='Error while retrieving last revision of "%s". Skipping.\n' % (data['pages']))
                raise ExportAbortedError(config['index'])
                return ''  # empty xml

        # FIXME HANDLE HTTP Errors HERE
        try:
            r = wikiteam.getURL(url=config['index'], data=data)
            # handleStatusCode(r)
            # r = fixBOM(r)
            xml = r
        except:
            sys.stderr.write(' Connection error\n')
            xml = ''
        cretries += 1

    return xml


def mwReadPageTitles(config={}, start=None):
    """ Read title list from a file, from the title "start" """

    titlesfilename = '%s-%s-titles.txt' % (wikiteam.domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')

    seeking = False
    if start:
        seeking = True

    with titlesfile as f:
        for line in f:
            if line.strip() == '--END--':
                break
            elif seeking and line.strip() != start:
                continue
            elif seeking and line.strip() == start:
                seeking = False
                yield line.strip()
            else:
                yield line.strip()


def mwRemoveIP(raw=''):
    """ Remove IP from HTML comments """

    raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
    # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
    # weird cases as :: are not included
    raw = re.sub(
        r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}',
        '0:0:0:0:0:0:0:0',
        raw)
    return raw
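# Illustrative sketch (not part of the original module; the input string is made
# up): mwRemoveIP() scrubs contributor IP addresses before index.html and
# Special:Version.html are written to disk.
#
#   mwRemoveIP(raw='Served by 10.0.0.1 in 0.23 secs')
#   # -> 'Served by 0.0.0.0 in 0.23 secs'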
def mwResumePreviousDump(config={}):
    imagenames = []
    sys.stderr.write('Resuming previous dump process...')
    if config['xml']:
        pagetitles = mwReadPageTitles(config=config)
        lasttitle = ''
        try:
            lasttitles = wikiteam.reverseReadline('%s/%s-%s-titles.txt' % (config['path'], wikiteam.domain2prefix(config=config), config['date']))
            lasttitle = next(lasttitles)
            if lasttitle == '':
                lasttitle = next(lasttitles)
        except:
            pass  # probably file does not exist

        if lasttitle == '--END--':
            # titles list is complete
            sys.stderr.write('Title list was completed in the previous session')
        else:
            sys.stderr.write('Title list is incomplete. Reloading...')
            # do not resume, reload, to avoid inconsistencies, deleted pages or
            # so
            pagetitles = mwGetPageTitles(config=config)
            wikiteam.savePageTitles(config=config, pagetitles=pagetitles)

        # checking xml dump
        xmliscomplete = False
        lastxmltitle = None
        try:
            f = wikiteam.reverseReadline(
                '%s/%s-%s-%s.xml' % (config['path'],
                                     wikiteam.domain2prefix(config=config),
                                     config['date'],
                                     config['curonly'] and 'current' or 'history'),
                )
            for l in f:
                if l == '</mediawiki>':
                    # xml dump is complete
                    xmliscomplete = True
                    break
                xmltitle = re.search(r'<title>([^<]+)</title>', l)
                if xmltitle:
                    lastxmltitle = wikiteam.undoHTMLEntities(text=xmltitle.group(1))
                    break
        except:
            pass  # probably file does not exist

        if xmliscomplete:
            sys.stderr.write('XML dump was completed in the previous session')
        elif lastxmltitle:
            # resuming...
            sys.stderr.write('Resuming XML dump from "%s"' % (lastxmltitle))
            pagetitles = mwReadPageTitles(config=config, start=lastxmltitle)
            mwGeneratePageDump(config=config, pagetitles=pagetitles, start=lastxmltitle)
        else:
            # corrupt? only has XML header?
            sys.stderr.write('XML is corrupt? Regenerating...')
            pagetitles = mwReadPageTitles(config=config)
            mwGeneratePageDump(config=config, pagetitles=pagetitles)

    if config['images']:
        # load images
        lastimage = ''
        try:
            f = open('%s/%s-%s-images.txt' % (config['path'], wikiteam.domain2prefix(config=config), config['date']), 'r')
            raw = f.read().strip()
            lines = raw.split('\n')
            for l in lines:
                if re.search(r'\t', l):
                    imagenames.append(l.split('\t'))
            lastimage = lines[-1]
            f.close()
        except:
            pass  # probably file does not exist

        if lastimage == '--END--':
            sys.stderr.write('Image list was completed in the previous session')
        else:
            sys.stderr.write('Image list is incomplete. Reloading...')
            # do not resume, reload, to avoid inconsistencies, deleted images or
            # so
            imagenames = mwGetImageNames(config=config)
            mwSaveImageNames(config=config, imagenames=imagenames)

        # checking images directory
        listdir = []
        try:
            listdir = os.listdir('%s/images' % (config['path']))
        except:
            pass  # probably directory does not exist
        listdir.sort()
        complete = True
        lastfilename = ''
        lastfilename2 = ''
        c = 0
        for filename, url, uploader in imagenames:
            lastfilename2 = lastfilename
            # return always the complete filename, not the truncated
            lastfilename = filename
            filename2 = filename
            if len(filename2) > other['filenamelimit']:
                filename2 = truncateFilename(other=other, filename=filename2)
            if filename2 not in listdir:
                complete = False
                break
            c += 1
        sys.stderr.write('%d images were found in the directory from a previous session' % (c))
        if complete:
            # image dump is complete
            sys.stderr.write('Image dump was completed in the previous session')
        else:
            # we resume from previous image, which may be corrupted (or missing
            # .desc) by the previous session ctrl-c or abort
            mwGenerateImageDump(config=config, imagenames=imagenames, start=lastfilename2)

    if config['logs']:
        # fix
        pass

    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)


def mwSaveIndexPHP(config={}):
    """ Save index.php as .html, to preserve license details available at the bottom of the page """

    if os.path.exists('%s/index.html' % (config['path'])):
        sys.stderr.write('index.html exists, do not overwrite')
    else:
        sys.stderr.write('Downloading index.php (Main Page) as index.html')
        raw = wikiteam.getURL(url=config['index'], data={})
        wikiteam.delay(config=config)
        raw = mwRemoveIP(raw=raw)
        with open('%s/index.html' % (config['path']), 'w') as outfile:
            outfile.write(raw)


def mwSaveSiteInfo(config={}):
    """ Save a file with site info """

    if config['api']:
        if os.path.exists('%s/siteinfo.json' % (config['path'])):
            sys.stderr.write('siteinfo.json exists, do not overwrite')
        else:
            sys.stderr.write('Downloading site info as siteinfo.json')

            # MediaWiki 1.13+
            raw = wikiteam.getURL(url=config['api'], data={
                'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
                'sinumberingroup': 1,
                'format': 'json'})
            wikiteam.delay(config=config)
            # MediaWiki 1.11-1.12
            if not 'query' in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['api'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
                    'format': 'json'})
            # MediaWiki 1.8-1.10
            if not 'query' in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['api'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces',
                    'format': 'json'})
            result = wikiteam.getJSON(raw)
            wikiteam.delay(config=config)
            with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                outfile.write(json.dumps(result, indent=4, sort_keys=True))
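# Illustrative sketch (not part of the original module; the path is
# hypothetical): once mwSaveSiteInfo() has written siteinfo.json, the dump
# metadata can be inspected with the standard json module.
#
#   with open('dumps/wiki.example.org/siteinfo.json') as f:
#       siteinfo = json.load(f)
#   print(siteinfo['query']['general']['sitename'])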
def mwSaveSpecialVersion(config={}):
    """ Save Special:Version as .html, to preserve extensions details """

    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        sys.stderr.write('Special:Version.html exists, do not overwrite')
    else:
        sys.stderr.write('Downloading Special:Version with extensions and other related info')
        raw = wikiteam.getURL(url=config['index'], data={'title': 'Special:Version'})
        wikiteam.delay(config=config)
        raw = mwRemoveIP(raw=raw)
        with open('%s/Special:Version.html' % (config['path']), 'w') as outfile:
            outfile.write(raw)


def main():
    pass


if __name__ == "__main__":
    main()