From 421df6d4bb3d8bdde9a864edbd0dbb709ebdce33 Mon Sep 17 00:00:00 2001 From: Hydriz Scholz Date: Fri, 4 Jul 2014 22:16:13 +0800 Subject: [PATCH 1/2] Spliting rewrite files into a separate branch --- rewrite/README.md | 4 - rewrite/dumpgenerator.py | 1292 -------------------------------------- 2 files changed, 1296 deletions(-) delete mode 100644 rewrite/README.md delete mode 100644 rewrite/dumpgenerator.py diff --git a/rewrite/README.md b/rewrite/README.md deleted file mode 100644 index ff4a469..0000000 --- a/rewrite/README.md +++ /dev/null @@ -1,4 +0,0 @@ -## WikiTeam dumpgenerator.py rewrite -This is the rewrite of WikiTeam's dumpgenerator.py. It is aimed towards getting native API support when downloading wikis and to avoid the use of screen scraping when doing so (which is quite hacky and not ideal). - -Note: THIS IS NOT A RELEASE YET, patches welcome. diff --git a/rewrite/dumpgenerator.py b/rewrite/dumpgenerator.py deleted file mode 100644 index ef3ba66..0000000 --- a/rewrite/dumpgenerator.py +++ /dev/null @@ -1,1292 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (C) 2013 Hydriz Scholz -# Copyright (C) 2014 WikiTeam -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit -# - -####################################################################### -# dumpgenerator.py is a script to generate backups of MediaWiki wikis # -# To learn more, read the documentation: # -# http://code.google.com/p/wikiteam/wiki/NewTutorial # -####################################################################### - -# For developers: -# * All functions and classes are displayed in alphabetical order for easier accessibility. -# * Script exit codes reference: -# * 0 - Script ran well without problems -# * 1 - Script failed due to user's incorrect use -# * 2 - Script failed due to destination server issue -# * For testing purposes, add the --debug parameter and edit DumpGenerator.debug() accordingly. - -###### -# TODO LIST -# 0. Download index.html and Special:Version.html -# 1. Index.php support. -# 2. Special:Log pages support -# 3. GUI (Question and Answer if no parameters are given) -# 4. Resuming of dump -# 5. Place the images in various folders so as to avoid hitting the limit of number of files in a directory -# 6. Speed up the script. A run with --xml --images on test.wikidata.org came up with 9 min 23 sec on 2.0 and 3 min 58 sec on 1.0 - -# WHAT IS WORKING -# 1. XML dumping -# 2. Complete dumping using API (except for --logs) -# 3. Automatic updating -# 4. Dumping of XML based on a list of titles -# 5. 
Integrity check for XML dump - -import datetime -import getopt -import hashlib -import json -import os -import re -import shutil -import sys -import time -import urllib -import urllib2 -import xml.etree.ElementTree as ElementTree - -class DumpGenerator: - """ - The main class that powers and operates everything else - """ - def __init__(self): - """ - Main constructor class for DumpGenerator, registers important variables too. - """ - self.Version = "2.0" - self.revision = "1" - # Provide a cool user-agent to hide the fact that this is a script - self.UserAgent = "Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0" - self.useAPI = False - self.useIndex = False - self.prefix = "" - self.domain = "" - self.tasklist = [] - self.configfile = "config.json" - self.configoptions = { - "date": "", - "useAPI": False, - "useIndex": False, - "urltoapi": "", - "urltoindex": "", - "images": False, - "logs": False, - "xml": False, - "curonly": False, - "exnamespaces": "", - "titlesonly": False - } - - # Basic metadata - self.date = datetime.datetime.now().strftime('%Y%m%d') - - # Important URLs - self.urltoapi = "" - self.urltoindex = "" - - # Type of dump to generate - self.images = False - self.logs = False - self.xml = False - - # Resuming of previous dump - self.resume = False - self.path = "" - - # Additional information for XML - self.curonly = False - self.exnamespaces = "" - self.titlesonly = False - self.titles = "" - - # Others - self.cookies = "" - self.delay = 0 - self.debugmode = False - self.nolog = False - self.autonomous = False - - # Short options: string (no commas), long options: array - # More information about these options are at self.help() - self.shortoptions = "hv" - self.longoptions = [ "help", "api=", "index=", "curonly", "images", "logs", "xml", "auto", "delay=", "cookies=", "exnamespaces=", "resume", "path=", "debug", "nolog", "titlesonly", "titles=" ] - - def bye(self): - """ - Bid farewell to the user at the very end of the script when everything - has been successful. - - Returns: Goodbye message. - """ - message = """---> Congratulations! Your dump is complete <--- -If you have suggestions, file a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list -If this is a public wiki, do consider publishing this dump so others can benefit from it. Follow the steps as explained in http://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump or contact us at http://code.google.com/p/wikiteam. -Thank you for using DumpGenerator %s by WikiTeam, good bye!""" % ( self.Version ) - return message - - def checkAPI(self): - """ - Checks the validity of the api.php. - """ - query = { - "meta": "siteinfo", - "siprop": "general" } - sitestats = json.loads( RequestAPI.query( query ) ) - try: - if ( sitestats[ "query" ][ "general" ][ "server" ] in self.urltoapi ): - return True - except: - try: - if ( sitestats[ "error" ][ "code" ] == "readapidenied" ) and ( self.cookies == "" ): - Output.warn( "The wiki is private and we do not have proper authentication information!" ) - return False - except: - Output.warn( "This api.php seems weird or is not valid." ) - return False - - def checkIndex(self): - """ - Checks the validity of the index.php. - """ - # TODO: Screen scraping is involved here, need backward compact for older version of MediaWiki. 
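# Aside (sketch only, not part of this patch): for comparison with the
# screen-scraping fallback below, the api.php check above boils down to a
# single siteinfo query. A minimal standalone version, using the urllib,
# urllib2 and json modules this script already imports; the function name
# and User-Agent string are made up for illustration.
def isApiWorking( urltoapi ):
    """ Return True if urltoapi answers a siteinfo query with its own server URL. """
    params = urllib.urlencode( { "action": "query", "meta": "siteinfo", "siprop": "general", "format": "json" } )
    headers = { "User-Agent": "DumpGenerator sketch" }
    request = urllib2.Request( "%s?%s" % ( urltoapi, params ), headers=headers )
    data = json.loads( urllib2.urlopen( request ).read() )
    general = data.get( "query", {} ).get( "general", {} )
    # Mirror the check above: the reported server URL should be part of the api.php URL
    return ( "server" in general ) and ( general[ "server" ] in urltoapi )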
- parameters = { "title": "Special:Version" } - request = RequestIndex.query( parameters ) - # Since we are at Special:Version, we should not be getting Special:BadTitle unless we are not logged in - if ( re.search( r'(Special:Badtitle)', request ) ) and ( self.cookies == "" ): - Output.error( "The wiki is private and we do not have proper authentication information!" ) - sys.exit(1) - - # Check for some tags within the Special:Version page, must be language-independent - if ( re.search( r'(
|meta name="generator" content="MediaWiki)', request ) ): - return True - - def debug(self): - """ - A temporary debug mode for testing purposes. - REMOVE WHEN COMPLETE! - """ - print "DEBUG MODE ON" - print "Date: %s" % (self.date) - print "URL to api.php: %s" % (self.urltoapi) - print "URL to index.php: %s" % (self.urltoindex) - print "Current revision only: %s" % (self.curonly) - print "Image dump: %s" % (self.images) - print "Log dump: %s" % (self.logs) - print "XML dump: %s" % (self.xml) - print "Resume: %s" % (self.resume) - print "Path for resuming: %s" % (self.path) - print "Delay: %s" % (self.delay) - print "Cookies file: %s" % (self.cookies) - print "Excluded namespaces: %s" % (self.exnamespaces) - print "Debug mode on: %s" % (self.debugmode) - self.tasklist = sorted( self.tasklist ) - for task in self.tasklist: - if ( task == "axml" ): - DumpXML.run() - elif ( task == "bimages" ): - DumpImages.run() - elif ( task == "clogs" ): - DumpLogs.run() - sys.exit(0) - - def downloadHtmlPages(self): - """ - Downloads the HTML pages such as the main page and Special:Version. - """ - # Download the main page - Output.message( "Downloading index.php (Main Page) as index.html." ) - query = {} - index = RequestIndex.query( query ) - index = RequestIndex.removeIP( index ) - if ( os.path.exists( "Special:Version.html" ) ): - os.remove( "index.html" ) - else: - pass - for line in index: - Output.appendToFile( "index.html", line ) - - # Download Special:Version or its respective localized version - Output.message( "Downloading Special:Version with extensions and other related info." ) - query = { "title": "Special:Version" } - SpecialVersion = RequestIndex.query( query ) - SpecialVersion = RequestIndex.removeIP( SpecialVersion ) - if ( os.path.exists( "Special:Version.html" ) ): - os.remove( "Special:Version.html" ) - else: - pass - for line in SpecialVersion: - Output.appendToFile( "Special:Version.html", line ) - - def fixHTMLEntities(self, text): - """ - Convert some HTML entities to their regular characters. - """ - text = re.sub('<', '<', text) - text = re.sub('>', '>', text) - text = re.sub('&', '&', text) - text = re.sub('"', '"', text) - text = re.sub(''', '\'', text) - return text - - def help(self): - """ - Provides vital help information to the user. This function - directly uses the "print" function because it is harmless and - what needs to be logged has already been done so. - - Returns: Help message text - """ - message = """DumpGenerator %s, a script to generate backups of MediaWiki wikis. -For more information, please see: http://code.google.com/p/wikiteam/wiki/NewTutorial - -Startup: - -h, --help Displays this help information and exits. - -v, --version Displays the version of this script, with additional credits. - -Wiki information: - --api=URL The URL to the wiki's api.php, not to be used with --index. - --index=URL The URL to the wiki's index.php, not to be used with --api. - -Options: - --xml Creates an XML dump. - --images Creates an image dump. - --logs Creates a dump of all log pages (not yet supported). - -XML dump (only if --xml is used): - --curonly Download only the current revision. - --exnamespaces The unique system number(s) for namespaces to exclude, separated by commas. - --titlesonly Download only the page titles without the actual content. - --titles Path to a file containing list of titles, requires "--END--" to be on the last line. - -Other: - --auto Enable auto pilot mode (select options that ensures that the script creates a new dump). 
- --resume Resume an incomplete dump (requires --path to be given). - --path=PATH Path to the incomplete dump. - --delay=SECONDS Adds a delay (in seconds) between requests. - --cookies=PATH Path to a Mozilla cookies.txt file for authentication cookies. - --nolog Disable logging to dumpgenerator.log (does not affect output in terminal). - -Report any issues to our issue tracker: https://code.google.com/p/wikiteam.""" % (self.Version) - return message - - def loadConfig(self): - """ - Load a config file from a partially-made dump. - """ - config = json.loads( self.configfile ) - self.date = config[ "date" ] - self.useAPI = config[ "useAPI" ] - self.useIndex = config[ "useIndex" ] - self.urltoapi = config[ "urltoapi" ] - self.urltoindex = config[ "urltoindex" ] - self.images = config[ "images" ] - self.logs = config[ "logs" ] - self.xml = config[ "xml" ] - self.curonly = config[ "curonly" ] - self.exnamespaces = config[ "exnamespaces" ] - self.titlesonly = config[ "titlesonly" ] - - if ( self.images == True ): - self.tasklist.append( "bimage" ) - if ( self.logs == True ): - self.tasklist.append( "clogs" ) - if ( self.xml == True ): - self.tasklist.append( "axml" ) - - if ( self.useAPI == True ): - domain = self.urltoapi - elif ( self.useIndex == True ): - domain = self.urltoindex - - def makePrefix(self, domain): - """ - Converts a domain to a prefix. - - Inputs: - - domain: The domain to change, may contain api.php or index.php as suffix. - - Returns: - - string with slashes and stray characters changed to underscores, suffix - removed and URL protocol removed. - """ - domain = domain.lower() - # Remove unnecessary prefixes and suffixes - domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain) - # Substitute directory slashes with underscores - domain = re.sub(r'/', '_', domain) - # Convert any stray character that is not in the alphabet to underscores - domain = re.sub(r'[^-.A-Za-z0-9]', '_', domain) - return domain - - def makeNiceURL(self, domain): - """ - Converts a domain to a more human-readable format (used for uploading). - - Inputs: - - domain: The domain to change, may contain api.php or index.php as suffix. - - Returns: - - string with suffix removed. - """ - domain = domain.lower() - # Remove the suffixes - domain = re.sub(r'(/index\.php|/api\.php)', '', domain) - return domain - - def processargs(self): - """ - Processing arguments and options provided by the user. - """ - try: - options, answers = getopt.getopt( sys.argv[1:], self.shortoptions, self.longoptions ) - except getopt.GetoptError: - Output.error( "An unknown option has been specified, please check your arguments before re-running!" 
) - sys.exit(1) - - # First accept all arguments and store them in a variable - for option, answer in options: - # Startup - if ( option in ( "-h", "--help" ) ): - # Display the help guide and exit - print self.help() - os.remove( Output.logfile ) - sys.exit(0) - elif ( option in ( "-v", "--version" ) ): - # Display the version of this script - print self.version() - os.remove( Output.logfile ) - sys.exit(0) - - # Wiki information - elif ( option in "--api" ): - self.urltoapi = answer - self.configoptions[ "urltoapi" ] = self.urltoapi - elif ( option in "--index" ): - self.urltoindex = answer - self.configoptions[ "urltoindex" ] = self.urltoindex - - # Dump options - elif ( option == "--images" ): - self.images = True - self.configoptions[ "images" ] = True - self.tasklist.append( "bimages" ) - elif ( option == "--logs" ): - self.logs = True - self.configoptions[ "logs" ] = True - self.tasklist.append( "clogs" ) - elif ( option == "--xml" ): - self.xml = True - self.configoptions[ "xml" ] = True - self.tasklist.append( "axml" ) - - # XML dump options - elif ( option == "--curonly" ): - self.curonly = True - self.configoptions[ "curonly" ] = True - elif ( option in "--exnamespaces" ): - self.exnamespaces = answer - self.configoptions[ "exnamespaces" ] = self.exnamespaces - elif ( option == "--titlesonly" ): - self.titlesonly = True - self.configoptions[ "titlesonly" ] = True - elif ( option in "--titles" ): - self.titles = os.path.abspath( answer ) - - # Other options - elif ( option == "--auto" ): - self.autonomous = True - elif ( option in "--cookies" ): - self.cookies = answer - elif ( option in "--delay" ): - self.delay = answer - elif ( option == "--nolog" ): - self.nolog = True - elif ( option in "--path" ): - self.path = answer - elif ( option == "--resume" ): - self.resume = True - - # Private options (i.e. usable but not documented in --help) - elif ( option == "--debug" ): - self.debugmode = True - else: - Output.error( "An unknown option has been specified, please check your arguments before re-running!" ) - sys.exit(1) - - # Now to verify that the user is not messing around - if ( self.urltoapi == "" and self.urltoindex == "" ): - # User did not specify either --api= or --index= - if ( self.resume == True and self.path != "" ): - # ...but specified --resume and --path= accordingly - self.resumeDump() - elif ( self.resume == True and self.path == "" ): - # ...and specified --resume without --path= - Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" ) - sys.exit(1) - else: - Output.error( "You need to tell me the URL to either the api.php or to index.php!" ) - sys.exit(1) - elif ( self.resume == True ) and ( self.path == "" ): - # User specified --resume, but no --path= was given - Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" ) - sys.exit(1) - elif ( self.urltoapi != "" and self.urltoindex != "" ): - # User specified both --api= and --index= - self.useAPI = True - elif ( self.xml == False and ( self.curonly == True or self.exnamespaces != "" ) ): - # User specified --curonly and --exnamespaces without --xml - Output.error( "You did not specify to make an XML dump using --xml, so why write --curonly or --exnamespaces? Remove them before re-running!" ) - sys.exit(1) - - if ( self.urltoapi != "" ): - self.useAPI = True - elif ( self.urltoindex != "" ): - self.useIndex = True - - if ( self.useAPI == True ): - Output.message( "Checking api.php..." 
) - if not ( self.urltoapi.startswith( "http://" ) ) and not ( self.urltoapi.startswith( "https://" ) ): - Output.error( "The URL to api.php must start with either http:// or https://!" ) - sys.exit(1) - elif ( self.checkAPI() ): - Output.message( "api.php is okay" ) - else: - Output.error( "There is an error with api.php, please provide a correct path to it." ) - sys.exit(1) - elif ( self.useIndex == True ): - Output.message( "Checking index.php..." ) - if not ( self.urltoindex.startswith( "http://" ) ) and not ( self.urltoindex.startswith( "https://" ) ): - Output.error( "The URL to index.php must start with either http:// or https://!" ) - sys.exit(1) - elif ( self.checkIndex() ): - Output.message( "index.php is okay" ) - else: - Output.error( "There is an error with index.php, please provide a correct path to it." ) - sys.exit(1) - - def resumeDump(self): - """ - Resume an incomplete dump defined in self.path. - """ - # TODO: Add support for resuming dumps. - os.chdir( self.path ) - self.loadConfig() - self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date ) - self.domain = self.makeNiceURL( domain ) - if ( self.useAPI == True ): - self.urltoindex = "%s/index.php" % ( self.domain ) - self.tasklist = sorted( self.tasklist ) - for task in self.tasklist: - if ( task == "axml" ): - DumpXML.run() - elif ( task == "bimages" ): - DumpImages.run() - elif ( task == "clogs" ): - DumpLogs.run() - - def run(self): - """ - Run the whole script itself and excute important functions. - """ - print self.welcome() - Updater.checkRevision() - # Check if previously there was a log file in the working directory and remove it if exists - # This is followed by the equivalent of "touch" in Unix to create an empty file - if ( os.path.exists( Output.logfile ) ): - os.remove( Output.logfile ) - open( Output.logfile, "a" ).close() - else: - open( Output.logfile, "a" ).close() - self.processargs() - if ( DumpGenerator.nolog or DumpGenerator.debugmode): - # Remove the dumpgenerator.log file - os.remove( Output.logfile ) - if ( self.useAPI == True ): - domain = self.urltoapi - elif ( self.useIndex == True ): - domain = self.urltoindex - directories = os.walk( "." ).next()[1] - for directory in directories: - # Check if there is a dump that already exists in the current working directory - if ( directory.startswith( self.makePrefix( domain ) ) and directory.endswith( "-wikidump" ) ): - print "" # Create a blank line - Output.warn( "There seems to be a similar dump at %s which might be incomplete." % ( directory ) ) - if ( self.autonomous == True ): - Output.message( "Since auto pilot mode is enabled, that dump will not be resumed." ) - self.resume = False - else: - Output.warn( "Do you wish to resume using configuration from that dump? [yes, y], [no, n]" ) - reply = "" - while reply.lower() not in [ "yes", "y", "no", "n" ]: - reply = raw_input( "Answer: " ) - if ( reply.lower() in [ "yes", "y" ] ): - if not ( os.path.isfile( "%s/%s" % ( directory, self.configfile ) ) ): - Output.error( "I cannot find a %s in the directory! Please delete that directory before re-running!" % ( self.configfile ) ) - sys.exit(1) - else: - Output.warn( "Resuming dump and ignoring configuration given in this session..." ) - self.resume = True - self.path = directory - break - elif ( reply.lower() in [ "no", "n" ] ): - Output.message( "Not resuming..." 
) - self.resume = False - else: - continue - if ( self.resume == True ): - self.resumeDump() - else: - self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date ) - self.domain = self.makeNiceURL( domain ) - workingdir = "%s-wikidump" % ( self.prefix ) - if ( os.path.exists( workingdir ) ): - if ( self.autonomous == True ): - Output.message( "Since auto pilot mode is enabled, the directory with the same name will be deleted." ) - reply = "yes" - else: - Output.warn( "\nThere seems to be a directory with the same name, delete the old one? [yes, y], [no, n]" ) - reply = "" - while reply.lower() not in [ "yes", "y", "no", "n" ]: - reply = raw_input( "Answer: " ) - if ( reply.lower() in [ "yes", "y" ] ): - try: - shutil.rmtree( workingdir ) - except: - Output.error( "There was a problem deleting the directory, please manually delete it before re-running!" ) - sys.exit(1) - print "" # Create a blank line - elif ( reply.lower() in [ "no", "n" ] ): - Output.error( "Existing directory exists, either delete that directory or rename it before re-running!" ) - sys.exit(1) - else: - pass - Output.message( "Generating a new dump into a new directory..." ) - os.mkdir( workingdir ) - os.rename( Output.logfile, "%s/%s" % ( workingdir, Output.logfile ) ) - os.chdir( workingdir ) - self.saveConfig() - # Guess the URL to index.php - if ( self.useAPI == True ): - self.urltoindex = "%s/index.php" % ( self.domain ) - if ( self.debugmode == True ): - self.debug() - else: - # Run every single task that we are assigned to do in order: xml, images, logs - # The "a", "b" and "c" prefix is just to force the order. - self.tasklist = sorted( self.tasklist ) - if ( self.tasklist == [] ): - Output.error( "You did not tell me what dump to create!" ) - else: - for task in self.tasklist: - if ( task == "axml" ): - DumpXML.run() - elif ( task == "bimages" ): - DumpImages.run() - elif ( task == "clogs" ): - DumpLogs.run() - self.downloadHtmlPages() - print self.bye() - - def saveConfig(self): - """ - Save the configuration settings provided. - """ - self.configoptions[ "date" ] = self.date - output = open( self.configfile, "w" ) - json.dump( self.configoptions, output, indent=4 ) - - def version(self): - """ - Displays the version information and credits of the script. - - Returns: Version information and credits - """ - message = """DumpGenerator %s by WikiTeam - -Copyright (C) 2013 Hydriz Scholz -Copyright (C) 2014 WikiTeam - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program. If not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit - -""" % (self.Version) - return message - - def welcome(self): - """ - Welcomes the user at the very beginning of the script running process. - - Returns: Welcome message. - """ - message = """########## Welcome to DumpGenerator %s by WikiTeam ##########\n""" % (self.Version) - return message - -class DumpImages: - """ - The class for generating an image dump. 
- """ - def __init__(self): - """ - The constructor function. - """ - self.files = [] - - def dumpImages(self): - """ - Download all the images on the wiki with their corresponding XML. - """ - if ( DumpGenerator.useAPI == True ): - self.getFileListAPI() - else: - self.getFileListIndex() - filecount = 0 - if ( self.files == [] ): - pass - else: - Output.message( "Downloading files and their descriptions into \"images\" directory..." ) - for media in self.files: - time.sleep( DumpGenerator.delay ) # Delay between requests - urllib.urlretrieve( media[ "url" ], "images/%s" % (media[ "name" ] ) ) - title = DumpGenerator.fixHTMLEntities( media[ "title" ].encode( "utf-8" ) ) - contentsfile = DumpXML.getXMLPage( title, siteinfo=True ) - destfile = "images/%s.xml" % ( media[ "name" ] ) - shutil.move( contentsfile, destfile ) - Output.appendToFile( destfile, "\n" ) - filecount += 1 - if ( filecount % 10 == 0 ): - # Give the user a regular status report so that it does not look stuck - Output.message( " Downloaded %d files." % ( filecount ) ) - if ( filecount == 1 ): - Output.message( "Downloaded 1 file." % ( filecount ) ) - else: - Output.message( "Downloaded %d files." % ( filecount ) ) - - def getFileListAPI(self): - """ - Download the list of files on the wiki via the API. - """ - files = [] - dumpfile = "%s-images.txt" % ( DumpGenerator.prefix ) - filecount = 0 - Output.message( "Getting list of files on the wiki..." ) - aifrom = "!" # Very first page of a wiki - while aifrom: - sys.stderr.write('.') # Tell the user that downloading is in progress - query = { - "list": "allimages", - "aifrom": aifrom, - "ailimit": 500 } # The default limit for anonymous users of the API is 500 pages per request - time.sleep( DumpGenerator.delay ) # Delay between requests - filesmeta = json.loads( RequestAPI.query( query ) ) - # Store what the server tells us to continue from - try: - serveraifrom = filesmeta[ "query-continue" ][ "allimages" ][ "aicontinue" ] - aifrom = DumpGenerator.fixHTMLEntities( serveraifrom ) - except: - # Reached the end of having to keep continuing, exit the while condition - aifrom = "" - # TODO: On a wiki with a lot of files, this can cause huge memory problems - files.extend( filesmeta[ "query" ][ "allimages" ] ) - for media in filesmeta[ "query" ][ "allimages" ]: - outputline = "%s\t%s\n" % ( media[ "title" ], media[ "url" ] ) - Output.appendToFile( dumpfile, outputline ) - # Add to namespace page count - filecount += len( files ) - Output.appendToFile( dumpfile, "--END--" ) - if ( filecount == 1 ): - Output.message( " Got 1 file" ) - else: - Output.message( " Got %d files" % ( filecount ) ) - - if ( filecount == 0 ): - Output.warn( "There are no files on the wiki to download!" ) - else: - Output.message( "File names and URLs saved at %s." % ( dumpfile ) ) - self.files = files - - def getFileListIndex(self): - """ - Download the list of files on the wiki via index.php. - """ - # TODO: Add code here - - def run(self): - """ - Execute the process of producing an image dump. - """ - if ( os.path.isdir( "images" ) ): - time.sleep(0) - else: - os.mkdir( "images" ) - self.dumpImages() - -class DumpLogs: - """ - The class for generating a log pages dump (pages in Special:Log). - """ - def __init__(self): - """ - The constructor function. - """ - - def run(self): - """ - Execute the process of producing a log pages dump. - """ - # TODO: Support downloading of log pages - Output.warn( "Sorry, downloading of log pages are not yet supported!" 
) - -class DumpXML: - """ - The class for generating an XML dump. - """ - def __init__(self): - """ - The constructor function. - """ - self.lennamespaces = 0 - self.namespaces = {} - self.pagetitles = [] - self.titlesdumpfile = "" - self.dumpretrycount = 0 - - def dumpPageTitlesAPI(self): - """ - Get a list of page titles and outputs it to a file. - """ - self.getNamespacesAPI() - self.getPageTitlesAPI() - Output.message( "Saving list of page titles..." ) - Output.appendToFile( self.titlesdumpfile, "--END--" ) - Output.message( "List of page titles saved at %s." % ( self.titlesdumpfile ) ) - - def dumpXML(self): - """ - Get the whole wiki in an XML file. - """ - Output.message( "Downloading the XML of every page..." ) - if ( DumpGenerator.curonly == True ): - dumpfile = "%s-curonly.xml" % ( DumpGenerator.prefix ) - else: - dumpfile = "%s-history.xml" % ( DumpGenerator.prefix ) - pagecount = 0 - # To reduce memory usage, we are storing the title into memory only when we need it - for title in file( self.titlesdumpfile, "r" ).read().splitlines(): - pagecount += 1 - numberofedits = 0 - # Add the initial siteinfo and header tags for the first page - if ( pagecount == 1 ): - contentsfile = self.getXMLPage( title, siteinfo=True ) - contents = file( contentsfile, "r" ).readlines() - open( dumpfile, "a" ).close() # "touch" the file - os.remove( contentsfile ) - elif ( title == "--END--" ): - contents = [ "\n" ] - else: - contentsfile = self.getXMLPage( title ) - contents = file( contentsfile, "r" ).readlines() - os.remove( contentsfile ) - - for content in contents: - # Count the number of occurrences of "" to determine number of revisions - if ( "" in content ): - numberofedits += 1 - Output.appendToFile( dumpfile, content ) - if ( title == "--END--" ): - pass - else: - if ( numberofedits == 1 ): - Output.message( " %s, 1 edit" % ( title ) ) - else: - Output.message( " %s, %s edits" % ( title, numberofedits ) ) - if ( pagecount % 10 == 0 ): - Output.message( "Downloaded %d pages" % ( pagecount ) ) - Output.message( "XML dump saved at %s." % ( dumpfile ) ) - self.integrityCheck( dumpfile ) - - def getNamespacesAPI(self): - """ - Download the list of namespaces with their names and IDs - via the API. - """ - query = { - "meta": "siteinfo", - "siprop": "namespaces" } - namespacedetails = json.loads( RequestAPI.query( query ) ) - namespacenums = namespacedetails[ "query" ][ "namespaces" ].keys() - # Remove the system namespaces ("Media" and "Special") - namespacenums.remove( "-2" ) - namespacenums.remove( "-1" ) - namespaces = {} - for namespacenum in namespacenums: - namespacename = namespacedetails[ "query" ][ "namespaces" ][ namespacenum ][ "*" ] - namespaces[ namespacenum ] = namespacename - self.lennamespaces = len( list( namespacenums ) ) - Output.message( "%d namespaces found." % ( self.lennamespaces ) ) - self.namespaces = namespaces - - def getPageTitlesAPI(self): - """ - Grab a list of page titles in each namespace via the API. - - There are leading spaces in the outputs so as to make things neater on the terminal. - """ - titles = [] - self.titlesdumpfile = "%s-titles.txt" % ( DumpGenerator.prefix ) - totalpagecount = 0 - for namespace in self.namespaces: - if namespace in DumpGenerator.exnamespaces: - Output.warn( " Skipping namespace %s" % (namespace) ) - else: - pagecount = 0 - Output.message( " Getting titles in namespace %s" % (namespace) ) - apfrom = "!" # Very first page of a wiki - while apfrom: - sys.stderr.write( "." 
) # Tell the user that downloading is in progress - query = { - "list": "allpages", - "apnamespace": namespace, - "apfrom": apfrom, - "aplimit": 500 } # The default limit for anonymous users of the API is 500 pages per request - time.sleep( DumpGenerator.delay ) # Delay between requests - pagetitles = json.loads( RequestAPI.query( query ) ) - # Store what the server tells us to continue from - try: - serverapfrom = pagetitles[ "query-continue" ][ "allpages" ][ "apcontinue" ] - apfrom = DumpGenerator.fixHTMLEntities( serverapfrom ) - except: - try: - serverapfrom = pagetitles[ "query-continue" ][ "allpages" ][ "apfrom" ] - apfrom = DumpGenerator.fixHTMLEntities( serverapfrom ) - except: - # Reached the end of having to keep continuing, exit the while condition - apfrom = "" - pages = pagetitles[ "query" ][ "allpages" ] - # Add to namespace page count - pagecount += len( pages ) - for page in pages: - title = "%s\n" % ( page[ "title" ] ) - Output.appendToFile( self.titlesdumpfile, title ) - if ( pagecount == 1 ): - Output.message( " Got 1 page title in namespace %s" % ( namespace ) ) - else: - Output.message( " Got %d page titles in namespace %s" % ( pagecount, namespace ) ) - # Add to total page count - totalpagecount += pagecount - if ( totalpagecount == 1 ): - Output.message( "Got 1 page title in total." % ( totalpagecount ) ) - else: - Output.message( "Got %d page titles in total." % ( totalpagecount ) ) - - def getXMLPage(self, page, siteinfo=False): - """ - Get the XML of one page. - - Input: - - page: The title of the page to download. - - siteinfo: Whether to include the siteinfo header in the XML. - """ - parameters = { - "title": "Special:Export", - "pages": page, - "action": "submit" } - if ( DumpGenerator.curonly == True ): - parameters[ "curonly" ] = 1 - parameters[ "limit" ] = 1 - else: - # Make the wiki download the actual full history - parameters["history"] = "1" - # TODO: Can cause memory problems if the page has a huge history - result = RequestIndex.query( parameters ) - pagehash = hashlib.sha256( page ).hexdigest()[:8] - tempfile = "%s.xml.tmp" % ( pagehash ) - tempfile2 = "%s.xml" % ( pagehash ) - Output.appendToFile( tempfile, result ) - result = "" # Free up memory - # Warning: The following is NOT compatible with MediaWiki XML Schema Description version 0.3 and below! - # See http://wikiteam.googlecode.com/svn/trunk/schema/README.md for more information about MediaWiki versions - # this will affect and ways to overcome it. - if ( siteinfo == False ): - linecount = 0 - # The 11 comes from lines like , "special" namespaces and the very first line - # TODO: Hacky way of removing the siteinfo, check for backward compatibility! - linestoskip = 11 + self.lennamespaces - for line in open( tempfile, "r" ).read().splitlines(): - linecount += 1 - if linecount > linestoskip: - if ( "" in line ): - pass - else: - line = "%s\n" % ( line ) - Output.appendToFile( tempfile2, line ) - else: - continue - else: - for line in open( tempfile, "r" ).read().splitlines(): - if ( "" in line ): - pass - else: - line = "%s\n" % ( line ) - Output.appendToFile( tempfile2, line ) - os.remove( tempfile ) - return tempfile2 - - def integrityCheck(self, dumpfile): - """ - Checks the integrity of the XML dump and ensures that it is not corrupted. - """ - Output.message( "Checking the integrity of the XML dump..." 
) - checktitles = 0 - checkpageopen = 0 - checkpageclose = 0 - checkrevisionopen = 0 - checkrevisionclose = 0 - # Check the number of instances of the following tags - # By logic they should be the same number - for line in file( dumpfile, "r" ).read().splitlines(): - if "" in line: - checktitles += 1 - elif "<page>" in line: - checkpageopen += 1 - elif "</page>" in line: - checkpageclose += 1 - elif "<revision>" in line: - checkrevisionopen += 1 - elif "</revision>" in line: - checkrevisionclose += 1 - else: - continue - - if ( checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose ): - Output.message( "Excellent, the XML dump is not corrupted." ) - else: - Output.warn( "WARNING: XML dump seems to be corrupted." ) - if ( DumpGenerator.autonomous == True ): - reply = "yes" - else: - reply = "" - while reply.lower() not in [ "yes", "y", "no", "n" ]: - reply = raw_input( 'Regenerate a new dump ([yes, y], [no, n])? ' ) - if reply.lower() in [ "yes", "y" ]: - self.dumpretrycount += 1 - if ( self.dumpretrycount < 3 ): - Output.warn( "Generating a new dump..." ) - os.remove( dumpfile ) - self.dumpXML() - else: - Output.warn( "We have tried dumping the wiki 3 times, but the dump is still corrupted. Not going to carry on since it is probably a problem on the wiki." ) - # Encourage the user to tell us about this faulty wiki - print "Please tell us about this by reporting an issue here: https://code.google.com/p/wikiteam/issues/list. Thank you!" - print "Giving you a little time to see this message..." - time.sleep(3) # Give time for the user to see the message - elif reply.lower() in [ "no", "n" ]: - Output.warn( "Not generating a new dump. Note: Your dump is corrupted and might not work with MediaWiki!" ) - - def run(self): - """ - Execute the process of producing an XML dump. - """ - if ( DumpGenerator.useAPI == True ): - if ( DumpGenerator.titlesonly == True ): - self.dumpPageTitlesAPI() - else: - if ( DumpGenerator.titles != "" ): - Output.message( "Using the list of page titles provided at %s." % ( DumpGenerator.titles ) ) - self.titlesdumpfile = DumpGenerator.titles - else: - self.dumpPageTitlesAPI() - self.dumpXML() - else: - if ( DumpGenerator.titlesonly == True ): - self.dumpPageTitlesIndex() - else: - if ( DumpGenerator.titles != "" ): - self.titlesdumpfile = DumpGenerator.titles - else: - self.dumpPageTitlesIndex() - self.dumpXML() - -class Output: - """ - The class to output anything to the user or to a place not within the script. - - For doing outputs to user: - This is used instead of directly using the "print" function is because - this is intended to log everything that is told to the user, so that it - is possible to check when and where things went wrong. - - For doing outputs to elsewhere: - This is to reduce memory usage by storing large chunks of data into disk - and reducing the risk of getting a MemoryError. - """ - def __init__(self): - self.logfile = "dumpgenerator.log" - - # Output to disk - def appendToFile(self, outputfile, contents): - """ - Output contents to file. - - Inputs: - - outputfile: The file to output to. - - contents: The content to add for each line. 
- """ - if ( os.path.exists( outputfile ) == False ): - open( outputfile, "a" ).close() # "touch" the file - else: - pass - thefile = open( outputfile, "a" ) - try: - contents = contents.encode( "utf-8", "ignore" ) - # TODO: During a test phase, this error kept coming up, though the final output was no different from - # what was produced using dumpBackup.php and using Special:Export itself. - except UnicodeDecodeError: - pass - thefile.write( contents ) - thefile.close() - - # Output to user - def error(self, message): - print message - print "Write --help for more information." - self.log( "An error occurred: %s" % (message) ) - - def log(self, message): - if ( DumpGenerator.nolog or DumpGenerator.debugmode): - # Skip logging - time.sleep(0) - else: - timestamp = datetime.datetime.fromtimestamp( time.time() ).strftime( "%Y-%m-%d %H:%M:%S" ) - logline = "%s: %s\n" % (timestamp, message) - self.appendToFile( self.logfile, logline ) - - def message(self, message): - print message - self.log( "Told the user: %s" % (message) ) - - def warn(self, message): - print message - self.log( "Warned the user: %s" % (message) ) - -class RequestAPI: - """ - The RequestAPI class, to submit APi request calls to the server. - """ - def __init__(self): - """ - The constructor function. - """ - - def query(self, params, url=""): - """ - The function to send an API call to the server given in the "url" - parameter using the parameters found in params. If url is empty, - DumpGenerator.urltoapi is used instead. - - Note: This function will assume action=query, other functions provides - the other query forms, but not this one. - - Input: - - params: Parameters to API call as an array (excluding action=query and format=json) - - Returns - - Result of API call in JSON format. - """ - if ( url == "" ): - url = DumpGenerator.urltoapi - else: - url = url - queryurl = "%s?action=query&format=json" % ( url ) - headers = { "User-Agent": DumpGenerator.UserAgent } - # Convert the array to a proper URL - paras = urllib.urlencode( params ) - # POST the parameters to the server - request = urllib2.Request( queryurl, paras, headers ) - try: - result = urllib2.urlopen( request ) - except: - try: - # Add a little delay between requests if server is slow - sleeptime = DumpGenerator.delay + 10 - Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) ) - time.sleep( sleeptime ) - result = urllib2.urlopen( request ) - except: - Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." ) - sys.exit(2) - output = result.read() - result.close() - return output - -class RequestIndex: - def __init__(self): - """ - The constructor function. - """ - - def query(self, params, url=""): - """ - The function to send an request to the server given in the "url" - parameter using the parameters found in params. If url is empty, - DumpGenerator.urltoindex is used instead. - - Input: - - params: Parameters to the request to send, appended to url as - a GET request. - - Returns - - Result of GET request. 
- """ - if ( url == "" ): - url = DumpGenerator.urltoindex - else: - url = url - headers = { "User-Agent": DumpGenerator.UserAgent } - paras = urllib.urlencode( params ) - # index.php does not support POST request, formulating a correct GET URL here - queryurl = "%s?%s" % ( url, paras ) - request = urllib2.Request( queryurl, headers=headers ) - # TODO: Make urlopen follow redirects - try: - result = urllib2.urlopen( request ) - except: - try: - # Add a little delay between requests if server is slow - sleeptime = DumpGenerator.delay + 10 - Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) ) - time.sleep( sleeptime ) - result = urllib2.urlopen( request ) - except: - Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." ) - sys.exit(2) - output = result.read() - result.close() - return output - - def removeIP(self, content): - """ - Remove the user's IP address while fetching HTML pages. - """ - # Remove IPv4 addresses - content = re.sub( r"\d+\.\d+\.\d+\.\d+", "0.0.0.0", content ) - # Remove IPv6 addresses - content = re.sub( r"(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}", "0:0:0:0:0:0:0:0", content ) - return content - -class Updater: - """ - The class to auto-update the user's script to the latest version of DumpGenerator. - """ - # TODO: Get the script to check only occasionally, this is a performance concern - def __init__(self): - """ - The constructor function. - """ - self.controlUrl = "http://wikiteam.googlecode.com/svn/trunk/revnum.json" - self.controlUrl2 = "https://raw.github.com/dumps/DumpGenerator/master/revnum.json" - self.result = {} - - def checkRevision(self): - """ - Check the current revision and ensure that it is up-to-date. - """ - jsonresult = self.getRevisionJson() - if ( jsonresult == False ): - pass - else: - result = json.loads( jsonresult ) - self.result = result - if ( result[ "latest" ] == DumpGenerator.Version ): - if ( result[ "releases" ][ DumpGenerator.Version ][ "revision" ] == DumpGenerator.revision ): - pass - else: - self.update() - else: - self.update() - - def getRevisionJson(self): - """ - Download the controlling JSON file. - """ - headers = {'User-Agent': DumpGenerator.UserAgent} - skip = False - # TODO: Handle 404 errors - try: - revjson = urllib2.urlopen( urllib2.Request( self.controlUrl, headers=headers ) ) - except: - try: - revjson = urllib2.urlopen( urllib2.Request( self.controlUrl2, headers=headers ) ) - except: - Output.warn( "Unable to check if a new version of dumpgenerator.py is available, continuing..." ) - skip = True - if ( skip == False ): - output = revjson.read() - revjson.close() - return output - else: - return False - - def update(self): - """ - Update DumpGenerator.py to the current latest version - """ - currentfile = sys.argv[0] - latestver = self.result[ "latest" ] - latestrev = self.result[ "releases" ][ latestver ][ "revision" ] - latesturl = self.result[ "releases" ][ latestver ][ "downloadurl" ] - latesturl2 = self.result[ "releases" ][ latestver ][ "downloadurl2" ] - updated = True - # TODO: Handle 404 errors - try: - urllib.urlretrieve( latesturl, currentfile ) - except: - try: - urllib.urlretrieve( latesturl2, currentfile ) - except: - updated = False - if ( updated == False ): - Output.warn( "Unable to update DumpGenerator, skipping update for now..." ) - else: - Output.message( "DumpGenerator was updated to %s (revision %s)! 
Changes will take effect on next run." % ( latestver, latestrev ) ) - -if __name__ == "__main__": - # Class registry, for use throughout the whole script - RequestAPI = RequestAPI() - RequestIndex = RequestIndex() - DumpGenerator = DumpGenerator() - DumpImages = DumpImages() - DumpLogs = DumpLogs() - DumpXML = DumpXML() - Output = Output() - Updater = Updater() - - # Start everything up - DumpGenerator.run() From 3929e4eb9c55da11e4f589b6fa0cc17c8a550274 Mon Sep 17 00:00:00 2001 From: balr0g <balrog032@gmail.com> Date: Thu, 3 Jul 2014 14:23:21 -0400 Subject: [PATCH 2/2] Cleanups and error fixes suggested by flake8 (pep8 + pyflakes) --- dumpgenerator.py | 809 +++++++++++++++++++++++++++++------------------ 1 file changed, 495 insertions(+), 314 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index cd6a58e..774bac9 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -7,12 +7,12 @@ # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. @@ -40,11 +40,11 @@ try: except ImportError: print "Please install or update the Requests module." sys.exit(1) -import subprocess import time import urllib -__VERSION__ = '0.2.2' #major, minor, micro +__VERSION__ = '0.2.2' # major, minor, micro + def getVersion(): return(__VERSION__) @@ -54,23 +54,28 @@ def truncateFilename(other={}, filename=''): """ Truncate filenames when downloading images with large filenames """ return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1] + def delay(config={}, session=None): """ Add a delay if configured for that """ if config['delay'] > 0: print 'Sleeping... %d seconds...' 
% (config['delay']) time.sleep(config['delay']) + def cleanHTML(raw=''): """ Extract only the real wiki content and remove rubbish """ """ This function is ONLY used to retrieve page titles and file names when no API is available """ """ DO NOT use this function to extract page content """ - #different "tags" used by different MediaWiki versions to mark where starts and ends content + # different "tags" used by different MediaWiki versions to mark where + # starts and ends content if re.search('<!-- bodytext -->', raw): raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0] elif re.search('<!-- start content -->', raw): - raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0] + raw = raw.split( + '<!-- start content -->')[1].split('<!-- end content -->')[0] elif re.search('<!-- Begin Content Area -->', raw): - raw = raw.split('<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0] + raw = raw.split( + '<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0] elif re.search('<!-- content -->', raw): raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0] elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw): @@ -81,6 +86,7 @@ def cleanHTML(raw=''): sys.exit() return raw + def handleStatusCode(response): statuscode = response.status_code if statuscode >= 200 and statuscode < 300: @@ -113,58 +119,66 @@ def handleStatusCode(response): print response.url sys.exit(1) + def getNamespacesScraper(config={}, session=None): """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """ """ Function called if no API is available """ namespaces = config['namespaces'] - namespacenames = {0:''} # main is 0, no prefix + namespacenames = {0: ''} # main is 0, no prefix if namespaces: - r = session.post(url=config['index'], data={'title': 'Special:Allpages'}) + r = session.post( + url=config['index'], data={'title': 'Special:Allpages'}) raw = r.text delay(config=config, session=session) - m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? to include selected="selected" + # [^>]*? 
to include selected="selected" + m = re.compile( + r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) if 'all' in namespaces: namespaces = [] for i in m: namespaces.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") + namespacenames[int(i.group("namespaceid"))] = i.group( + "namespacename") else: - #check if those namespaces really exist in this wiki + # check if those namespaces really exist in this wiki namespaces2 = [] for i in m: if int(i.group("namespaceid")) in namespaces: namespaces2.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") + namespacenames[int(i.group("namespaceid"))] = i.group( + "namespacename") namespaces = namespaces2 else: namespaces = [0] - - namespaces = list(set(namespaces)) #uniques + + namespaces = list(set(namespaces)) # uniques print '%d namespaces found' % (len(namespaces)) return namespaces, namespacenames - + + def getNamespacesAPI(config={}, session=None): """ Uses the API to get the list of namespaces names and ids """ namespaces = config['namespaces'] - namespacenames = {0:''} # main is 0, no prefix + namespacenames = {0: ''} # main is 0, no prefix if namespaces: - r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}) + r = session.post(url=config['api'], data={ + 'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}) result = json.loads(r.text) delay(config=config, session=session) if 'all' in namespaces: namespaces = [] for i in result['query']['namespaces'].keys(): - if int(i) < 0: # -1: Special, -2: Media, excluding + if int(i) < 0: # -1: Special, -2: Media, excluding continue namespaces.append(int(i)) namespacenames[int(i)] = result['query']['namespaces'][i]['*'] else: - #check if those namespaces really exist in this wiki + # check if those namespaces really exist in this wiki namespaces2 = [] for i in result['query']['namespaces'].keys(): - if int(i) < 0: # -1: Special, -2: Media, excluding + if int(i) < 0: # -1: Special, -2: Media, excluding continue if int(i) in namespaces: namespaces2.append(int(i)) @@ -172,41 +186,46 @@ def getNamespacesAPI(config={}, session=None): namespaces = namespaces2 else: namespaces = [0] - - namespaces = list(set(namespaces)) #uniques + + namespaces = list(set(namespaces)) # uniques print '%d namespaces found' % (len(namespaces)) return namespaces, namespacenames + def getPageTitlesAPI(config={}, session=None): """ Uses the API to get the list of page titles """ titles = [] - namespaces, namespacenames = getNamespacesAPI(config=config, session=session) + namespaces, namespacenames = getNamespacesAPI( + config=config, session=session) for namespace in namespaces: if namespace in config['exnamespaces']: print ' Skipping namespace = %d' % (namespace) continue - + c = 0 print ' Retrieving titles in the namespace %d' % (namespace) apfrom = '!' while apfrom: - sys.stderr.write('.') #progress - params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500} + sys.stderr.write('.') # progress + params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, + 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500} r = session.post(url=config['api'], data=params) handleStatusCode(r) - #FIXME Handle HTTP errors here! + # FIXME Handle HTTP errors here! 
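# Sketch only, not part of this patch: one way to address the FIXME above.
# It assumes the same `requests` session used throughout this file; the
# helper name and the retry/wait values are made up for illustration.
def postWithRetry(session, url, data, retries=3, wait=10):
    """ POST `data` to `url`, backing off and retrying on HTTP or network errors. """
    for attempt in range(1, retries + 1):
        try:
            r = session.post(url=url, data=data)
            r.raise_for_status()  # turn 4xx/5xx answers into exceptions
            return r
        except requests.exceptions.RequestException:
            print ' HTTP error, waiting %d seconds and retrying...' % (wait * attempt)
            time.sleep(wait * attempt)  # incremental wait, as getXMLPageCore already does
    raise RuntimeError('%d consecutive requests to %s failed, giving up' % (retries, url))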
jsontitles = json.loads(r.text) apfrom = '' - if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'): - if jsontitles['query-continue']['allpages'].has_key('apcontinue'): - apfrom = jsontitles['query-continue']['allpages']['apcontinue'] - elif jsontitles['query-continue']['allpages'].has_key('apfrom'): + if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']: + if 'apcontinue' in jsontitles['query-continue']['allpages']: + apfrom = jsontitles['query-continue']['allpages']['apcontinue'] + elif 'apfrom' in jsontitles['query-continue']['allpages']: apfrom = jsontitles['query-continue']['allpages']['apfrom'] - #print apfrom - #print jsontitles - titles += [page['title'] for page in jsontitles['query']['allpages']] + # print apfrom + # print jsontitles + titles += [page['title'] + for page in jsontitles['query']['allpages']] if len(titles) != len(set(titles)): - #probably we are in a loop, server returning dupe titles, stop it + # probably we are in a loop, server returning dupe titles, stop + # it print 'Probably a loop, finishing' titles = list(set(titles)) apfrom = '' @@ -215,17 +234,20 @@ def getPageTitlesAPI(config={}, session=None): print ' %d titles retrieved in the namespace %d' % (c, namespace) return titles + def getPageTitlesScraper(config={}, session=None): """ """ titles = [] - namespaces, namespacenames = getNamespacesScraper(config=config, session=session) + namespaces, namespacenames = getNamespacesScraper( + config=config, session=session) for namespace in namespaces: print ' Retrieving titles in the namespace', namespace - url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace) + url = '%s?title=Special:Allpages&namespace=%s' % ( + config['index'], namespace) r = session.get(url=url) raw = r.text raw = cleanHTML(raw) - + r_title = r'title="(?P<title>[^>]+)">' r_suballpages = '' r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">' @@ -235,177 +257,212 @@ def getPageTitlesScraper(config={}, session=None): elif re.search(r_suballpages2, raw): r_suballpages = r_suballpages2 else: - pass #perhaps no subpages - - deep = 3 # 3 is the current deep of English Wikipedia for Special:Allpages, 3 levels + pass # perhaps no subpages + + # 3 is the current deep of English Wikipedia for Special:Allpages, 3 + # levels + deep = 3 c = 0 checked_suballpages = [] rawacum = raw while r_suballpages and re.search(r_suballpages, raw) and c < deep: - #load sub-Allpages + # load sub-Allpages m = re.compile(r_suballpages).finditer(raw) for i in m: fr = i.group('from') - + if r_suballpages == r_suballpages1: to = i.group('to') name = '%s-%s' % (fr, to) - url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['index'], namespace, fr, to) #do not put urllib.quote in fr or to - elif r_suballpages == r_suballpages2: #fix, esta regexp no carga bien todas? o falla el r_title en este tipo de subpag? (wikiindex) - fr = fr.split('&namespace=')[0] #clean &namespace=\d, sometimes happens + url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % ( + config['index'], namespace, fr, to) # do not put urllib.quote in fr or to + # fix, esta regexp no carga bien todas? o falla el r_title en + # este tipo de subpag? 
(wikiindex) + elif r_suballpages == r_suballpages2: + # clean &namespace=\d, sometimes happens + fr = fr.split('&namespace=')[0] name = fr - url = '%s?title=Special:Allpages/%s&namespace=%s' % (config['index'], name, namespace) - - if not name in checked_suballpages: - checked_suballpages.append(name) #to avoid reload dupe subpages links + url = '%s?title=Special:Allpages/%s&namespace=%s' % ( + config['index'], name, namespace) + + if name not in checked_suballpages: + # to avoid reload dupe subpages links + checked_suballpages.append(name) delay(config=config, session=session) r2 = session.get(url=url) raw2 = r2.text raw2 = cleanHTML(raw2) - rawacum += raw2 #merge it after removed junk + rawacum += raw2 # merge it after removed junk print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages' delay(config=config, session=session) c += 1 - + c = 0 m = re.compile(r_title).finditer(rawacum) for i in m: t = undoHTMLEntities(text=i.group('title')) if not t.startswith('Special:'): - if not t in titles: + if t not in titles: titles.append(t) c += 1 print ' %d titles retrieved in the namespace %d' % (c, namespace) return titles + def getPageTitles(config={}, session=None): """ Get list of page titles """ - #http://en.wikipedia.org/wiki/Special:AllPages - #http://archiveteam.org/index.php?title=Special:AllPages - #http://www.wikanda.es/wiki/Especial:Todas + # http://en.wikipedia.org/wiki/Special:AllPages + # http://archiveteam.org/index.php?title=Special:AllPages + # http://www.wikanda.es/wiki/Especial:Todas print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None') print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None') - + titles = [] if config['api']: titles = getPageTitlesAPI(config=config, session=session) elif config['index']: titles = getPageTitlesScraper(config=config, session=session) - - titles = list(set(titles)) #removing dupes (e.g. in CZ appears Widget:AddThis two times (main namespace and widget namespace)) - titles.sort() #sorting - + + # removing dupes (e.g. in CZ appears Widget:AddThis two times (main + # namespace and widget namespace)) + titles = list(set(titles)) + titles.sort() # sorting + print '%d page titles loaded' % (len(titles)) return titles + def getXMLHeader(config={}, session=None): """ Retrieve a random page to extract XML headers (namespace info, etc) """ - #get the header of a random page, to attach it in the complete XML backup - #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x.... - randomtitle = 'Main_Page' #previously AMF5LKE43MNFGHKSDMRTJ - xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session) + # get the header of a random page, to attach it in the complete XML backup + # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" + # xmlns:x.... + randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ + xml = getXMLPage( + config=config, title=randomtitle, verbose=False, session=session) header = xml.split('</mediawiki>')[0] if not xml: print 'XML export on this wiki is broken, quitting.' 
sys.exit() return header + def getXMLFileDesc(config={}, title='', session=None): """ Get XML for image description page """ - config['curonly'] = 1 #tricky to get only the most recent desc + config['curonly'] = 1 # tricky to get only the most recent desc return getXMLPage(config=config, title=title, verbose=False, session=session) + def getUserAgent(): """ Return a cool user-agent to hide Python user-agent """ useragents = [ - #firefox - 'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0', + # firefox + 'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0', 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0', - ] + ] return useragents[0] + def logerror(config={}, text=''): """ Log error in file """ if text: with open('%s/errors.log' % (config['path']), 'a') as outfile: - output = u'%s: %s\n' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text) + output = u'%s: %s\n' % ( + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text) outfile.write(output.encode('utf-8')) + def getXMLPageCore(headers={}, params={}, config={}, session=None): """ """ - #returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki> - #if retrieving params['limit'] revisions fails, returns a current only version - #if all fail, returns the empty string + # returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki> + # if retrieving params['limit'] revisions fails, returns a current only version + # if all fail, returns the empty string xml = '' c = 0 - maxseconds = 100 #max seconds to wait in a single sleeping - maxretries = 5 # x retries and skip - increment = 20 #increment every retry + maxseconds = 100 # max seconds to wait in a single sleeping + maxretries = 5 # x retries and skip + increment = 20 # increment every retry while not re.search(r'</mediawiki>', xml): if c > 0 and c < maxretries: - wait = increment * c < maxseconds and increment * c or maxseconds # incremental until maxseconds + wait = increment * c < maxseconds and increment * \ + c or maxseconds # incremental until maxseconds print ' XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait) time.sleep(wait) - if params['limit'] > 1: # reducing server load requesting smallest chunks (if curonly then limit = 1 from mother function) - params['limit'] = params['limit'] / 2 # half + # reducing server load requesting smallest chunks (if curonly then + # limit = 1 from mother function) + if params['limit'] > 1: + params['limit'] = params['limit'] / 2 # half if c >= maxretries: print ' We have retried %d times' % (c) print ' MediaWiki error for "%s", network error or whatever...' % (params['pages']) # If it's not already what we tried: our last chance, preserve only the last revision... # config['curonly'] means that the whole dump is configured to save nonly the last - # params['curonly'] should mean that we've already tried this fallback, because it's set by the following if and passed to getXMLPageCore - if not config['curonly']: + # params['curonly'] should mean that we've already tried this + # fallback, because it's set by the following if and passed to + # getXMLPageCore + if not config['curonly']: print ' Trying to save only the last revision for this page...' params['curonly'] = 1 - logerror(config=config, text='Error while retrieving the full history of "%s". 
Trying to save only the last revision for this page' % (params['pages'])) + logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % ( + params['pages'])) return getXMLPageCore(headers=headers, params=params, config=config) else: print ' Saving in the errors log, and skipping...' - logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (params['pages'])) - return '' # empty xml - #FIXME HANDLE HTTP Errors HERE + logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % ( + params['pages'])) + return '' # empty xml + # FIXME HANDLE HTTP Errors HERE r = session.post(url=config['index'], data=params, headers=headers) handleStatusCode(r) xml = r.text c += 1 - + return xml + def getXMLPage(config={}, title='', verbose=True, session=None): """ Get the full history (or current only) of a page """ - #if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated - #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F - + # if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated + # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F + limit = 1000 truncated = False title_ = title title_ = re.sub(' ', '_', title_) - #do not convert & into %26, title_ = re.sub('&', '%26', title_) + # do not convert & into %26, title_ = re.sub('&', '%26', title_) params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'} if config['curonly']: params['curonly'] = 1 params['limit'] = 1 else: - params['offset'] = '1' # 1 always < 2000s + params['offset'] = '1' # 1 always < 2000s params['limit'] = limit - if config.has_key('templates') and config['templates']: #in other case, do not set params['templates'] + # in other case, do not set params['templates'] + if 'templates' in config and config['templates']: params['templates'] = 1 - + xml = getXMLPageCore(params=params, config=config, session=session) - #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available - #else, warning about Special:Export truncating large page histories + # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available + # else, warning about Special:Export truncating large page histories r_timestamp = r'<timestamp>([^<]+)</timestamp>' - if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one - while not truncated and params['offset']: #next chunk - params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML - xml2 = getXMLPageCore(params=params, config=config, session=session) - - if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>? 
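# --- Editor's note: sketch only, not part of the original patch -------------
# This hunk reflows getXMLPage(), which pages through one title's full history
# via Special:Export: each POST returns up to params['limit'] revisions, and
# the last <timestamp> in the chunk is fed back as the next 'offset' until the
# server runs out of revisions or starts repeating itself.  A minimal
# standalone version of that loop is sketched below; the index.php URL and
# title are placeholders, and the merging of chunks, curonly handling and
# error handling done by the real code are omitted.

import re
import requests

def export_history_chunks(index, title, limit=1000):
    session = requests.Session()
    params = {'title': 'Special:Export', 'pages': title,
              'action': 'submit', 'offset': '1', 'limit': limit}
    chunks = []
    while True:
        r = session.post(url=index, data=params)
        timestamps = re.findall(r'<timestamp>([^<]+)</timestamp>', r.text)
        chunks.append(r.text)
        # stop when no revisions are left, or when the wiki ignores 'offset'
        # and keeps returning the same chunk
        if not timestamps or timestamps[-1] == params['offset']:
            break
        params['offset'] = timestamps[-1]
    return chunks
# -----------------------------------------------------------------------------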
+ # search for timestamps in xml to avoid analysing empty pages like + # Special:Allpages and the random one + if not config['curonly'] and re.search(r_timestamp, xml): + while not truncated and params['offset']: # next chunk + # get the last timestamp from the acum XML + params['offset'] = re.findall(r_timestamp, xml)[-1] + xml2 = getXMLPageCore( + params=params, config=config, session=session) + + # are there more edits in this next XML chunk or no <page></page>? + if re.findall(r_timestamp, xml2): if re.findall(r_timestamp, xml2)[-1] == params['offset']: - #again the same XML, this wiki does not support params in Special:Export, offer complete XML up to X edits (usually 1000) + # again the same XML, this wiki does not support params in + # Special:Export, offer complete XML up to X edits (usually + # 1000) print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated' truncated = True break @@ -421,47 +478,53 @@ def getXMLPage(config={}, title='', verbose=True, session=None): <timestamp>2011-03-09T19:57:06Z</timestamp> <contributor> """ - #offset is OK in this wiki, merge with the previous chunk of this page history and continue - xml = xml.split('</page>')[0] + ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:])) + # offset is OK in this wiki, merge with the previous chunk + # of this page history and continue + xml = xml.split( + '</page>')[0] + ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:])) else: - params['offset'] = '' #no more edits in this page history - + params['offset'] = '' # no more edits in this page history + if verbose: numberofedits = len(re.findall(r_timestamp, xml)) if (numberofedits == 1): print ' %s, 1 edit' % (title) else: print ' %s, %d edits' % (title, numberofedits) - + return xml + def cleanXML(xml=''): """ Trim redundant info """ - #do not touch XML codification, leave AS IS + # do not touch XML codification, leave AS IS if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml): xml = xml.split('</siteinfo>\n')[1] xml = xml.split('</mediawiki>')[0] return xml + def generateXMLDump(config={}, titles=[], start='', session=None): """ Generates a XML dump for a list of titles """ - + print 'Retrieving the XML for every page from "%s"' % (start and start or 'start') header = getXMLHeader(config=config, session=session) - footer = '</mediawiki>\n' #new line at the end - xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history') + footer = '</mediawiki>\n' # new line at the end + xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), + config['date'], config['curonly'] and 'current' or 'history') xmlfile = '' lock = True if start: - #remove the last chunk of xml dump (it is probably incomplete) + # remove the last chunk of xml dump (it is probably incomplete) xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r') xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w') prev = '' c = 0 for l in xmlfile: - #removing <page>\n until end of file - if c != 0: #lock to avoid write an empty line at the begining of file - if not re.search(r'<title>%s' % (start), l): + # removing \n until end of file + # lock to avoid write an empty line at the begining of file + if c != 0: + if not re.search(r'%s' % (start), l): xmlfile2.write(prev) else: break @@ -469,22 +532,25 @@ def generateXMLDump(config={}, titles=[], start='', session=None): prev = l xmlfile.close() xmlfile2.close() - #subst 
xml with xml2 - os.remove('%s/%s' % (config['path'], xmlfilename)) #remove previous xml dump - os.rename('%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename)) #move correctly truncated dump to its real name + # subst xml with xml2 + # remove previous xml dump + os.remove('%s/%s' % (config['path'], xmlfilename)) + # move correctly truncated dump to its real name + os.rename( + '%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename)) else: - #requested complete xml dump + # requested complete xml dump lock = False xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w') xmlfile.write(header.encode('utf-8')) xmlfile.close() - + xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a') c = 1 for title in titles: if not title.strip(): continue - if title == start: #start downloading from start, included + if title == start: # start downloading from start, included lock = False if lock: continue @@ -494,78 +560,111 @@ def generateXMLDump(config={}, titles=[], start='', session=None): xml = getXMLPage(config=config, title=title, session=session) xml = cleanXML(xml=xml) if not xml: - logerror(config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)) - #here, XML is a correct chunk or - #an empty string due to a deleted page (logged in errors log) or - #an empty string due to an error while retrieving the page from server (logged in errors log) + logerror( + config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)) + # here, XML is a correct chunk or + # an empty string due to a deleted page (logged in errors log) or + # an empty string due to an error while retrieving the page from server + # (logged in errors log) xmlfile.write(xml.encode('utf-8')) c += 1 xmlfile.write(footer) xmlfile.close() print 'XML dump saved at...', xmlfilename + def saveTitles(config={}, titles=[]): """ Save title list in a file """ - titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date']) + titlesfilename = '%s-%s-titles.txt' % ( + domain2prefix(config=config), config['date']) titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w') output = u"%s\n--END--" % ('\n'.join(titles)) titlesfile.write(output.encode('utf-8')) titlesfile.close() - + print 'Titles saved at...', titlesfilename + def saveImageFilenamesURL(config={}, images=[], session=None): """ Save image list in a file, including filename, url and uploader """ - imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date']) + imagesfilename = '%s-%s-images.txt' % ( + domain2prefix(config=config), config['date']) imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w') - imagesfile.write(('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]).encode('utf-8'))) + imagesfile.write(('\n'.join(['%s\t%s\t%s' % ( + filename, url, uploader) for filename, url, uploader in images]).encode('utf-8'))) imagesfile.write('\n--END--') imagesfile.close() - + print 'Image filenames and URLs saved at...', imagesfilename + def getImageFilenamesURL(config={}, session=None): """ Retrieve file list: filename, url, uploader """ - + print 'Retrieving image filenames' - r_next = r'(?\d+)&' # (?\d+)&' images = [] - offset = '29990101000000' #january 1, 2999 + offset = '29990101000000' # january 1, 2999 limit = 5000 retries = 5 while offset: - #5000 overload some servers, but it is needed for sites like this with no next links 
http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset}) + # 5000 overload some servers, but it is needed for sites like this with + # no next links + # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= + r = session.post(url=config['index'], data={ + 'title': 'Special:Imagelist', 'limit': limit, 'offset': offset}) raw = r.text delay(config=config, session=session) - if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki + # delicate wiki + if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): if limit > 10: print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit) - limit = limit/10 + limit = limit / 10 continue - elif retries > 0: # waste retries, then exit + elif retries > 0: # waste retries, then exit retries -= 1 print 'Retrying...' continue else: print 'No more retries, exit...' break - + raw = cleanHTML(raw) - #archiveteam 1.15.1 Yahoovideo.jpg (file) - #wikanda 1.15.5 Fernandocg + # archiveteam 1.15.1 Yahoovideo.jpg (file) + # wikanda 1.15.5 Fernandocg r_images1 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' - #wikijuegos 1.9.5 http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old mediawiki version + # wikijuegos 1.9.5 + # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old + # mediawiki version r_images2 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' - #gentoowiki 1.18 18:15, 3 April 2011Asus eeepc-1201nl.png (file)37 KBYannails 1 + # gentoowiki 1.18 18:15, 3 + # April 2011Asus eeepc-1201nl.png (file)37 KBYannails 1 r_images3 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' - #http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - #(desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
+ # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
+ # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
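# --- Editor's note: sketch only, not part of the original patch -------------
# The r_images1..r_images4 patterns in this hunk each target the
# Special:Imagelist markup of a different MediaWiki generation; the if/elif
# chain just below probes them in order and iterates the first one that
# matches.  A condensed illustration of that dispatch (the pattern list passed
# in is a placeholder):

import re

def first_matching_finditer(raw, patterns):
    # return a match iterator for the first pattern that hits, else an empty list
    for pattern in patterns:
        if re.search(pattern, raw):
            return re.compile(pattern).finditer(raw)
    return []
# -----------------------------------------------------------------------------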
r_images4 = r'(?im)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' m = [] - #different mediawiki versions + # different mediawiki versions if re.search(r_images1, raw): m = re.compile(r_images1).finditer(raw) elif re.search(r_images2, raw): @@ -574,16 +673,22 @@ def getImageFilenamesURL(config={}, session=None): m = re.compile(r_images3).finditer(raw) elif re.search(r_images4, raw): m = re.compile(r_images4).finditer(raw) - + for i in m: url = i.group('url') - if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL? - if url[0] == '/': #slash is added later + # is it a relative URL? + if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): + if url[0] == '/': # slash is added later url = url[1:] - domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain - url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url + # remove from :// (http or https) until the first / after + # domain + domainalone = config['index'].split('://')[1].split('/')[0] + # concat http(s) + domain + relative url + url = u'%s://%s/%s' % (config['index'].split('://') + [0], domainalone, url) url = undoHTMLEntities(text=url) - #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars + # url = urllib.unquote(url) #do not use unquote with url, it break + # some urls with odd chars url = re.sub(' ', '_', url) filename = re.sub('_', ' ', i.group('filename')) filename = undoHTMLEntities(text=filename) @@ -592,54 +697,63 @@ def getImageFilenamesURL(config={}, session=None): uploader = undoHTMLEntities(text=uploader) uploader = urllib.unquote(uploader) images.append([filename, url, uploader]) - #print filename, url - + # print filename, url + if re.search(r_next, raw): offset = re.findall(r_next, raw)[0] - retries += 5 # add more retries if we got a page with offset + retries += 5 # add more retries if we got a page with offset else: offset = '' - + if (len(images) == 1): print ' Found 1 image' else: print ' Found %d images' % (len(images)) - + images.sort() return images + def getImageFilenamesURLAPI(config={}, session=None): """ Retrieve file list: filename, url, uploader """ - + print 'Retrieving image filenames' aifrom = '!' 
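# --- Editor's note: sketch only, not part of the original patch -------------
# The loop in getImageFilenamesURLAPI() below walks list=allimages using the
# older 'query-continue' continuation format: the 'aicontinue' (or 'aifrom')
# value from each reply is fed back until the server stops returning one.  A
# minimal standalone version is sketched here; the api.php URL is a
# placeholder, and the delay() throttle and HTTP error handling of the real
# code are omitted.

import json
import requests

def list_all_image_urls(api):
    session = requests.Session()
    urls = []
    aifrom = '!'
    while aifrom:
        params = {'action': 'query', 'list': 'allimages',
                  'aiprop': 'url|user', 'aifrom': aifrom,
                  'ailimit': 500, 'format': 'json'}
        data = json.loads(session.post(url=api, data=params).text)
        urls += [img['url'] for img in data['query']['allimages']]
        cont = data.get('query-continue', {}).get('allimages', {})
        aifrom = cont.get('aicontinue') or cont.get('aifrom') or ''
    return urls
# -----------------------------------------------------------------------------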
images = [] while aifrom: - sys.stderr.write('.') #progress - params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} - #FIXME Handle HTTP Errors HERE + sys.stderr.write('.') # progress + params = {'action': 'query', 'list': 'allimages', 'aiprop': + 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} + # FIXME Handle HTTP Errors HERE r = session.post(url=config['api'], data=params) handleStatusCode(r) jsonimages = json.loads(r.text) delay(config=config, session=session) aifrom = '' - if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'): - if jsonimages['query-continue']['allimages'].has_key('aicontinue'): - aifrom = jsonimages['query-continue']['allimages']['aicontinue'] - elif jsonimages['query-continue']['allimages'].has_key('aifrom'): + if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']: + if 'aicontinue' in jsonimages['query-continue']['allimages']: + aifrom = jsonimages['query-continue']['allimages']['aicontinue'] + elif 'aifrom' in jsonimages['query-continue']['allimages']: aifrom = jsonimages['query-continue']['allimages']['aifrom'] - #print aifrom - + # print aifrom + for image in jsonimages['query']['allimages']: url = image['url'] - if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL? - if url[0] == '/': #slash is added later + # is it a relative URL? + if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): + if url[0] == '/': # slash is added later url = url[1:] - domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain - url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url + # remove from :// (http or https) until the first / after + # domain + domainalone = config['index'].split('://')[1].split('/')[0] + # concat http(s) + domain + relative url + url = u'%s://%s/%s' % (config['index'].split('://') + [0], domainalone, url) url = re.sub(' ', '_', url) - # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136 - filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii','ignore')), 'utf-8') + # encoding to ascii is needed to work around this horrible bug: + # http://bugs.python.org/issue8136 + filename = unicode(urllib.unquote( + (re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8') uploader = re.sub('_', ' ', image['user']) images.append([filename, url, uploader]) @@ -651,40 +765,45 @@ def getImageFilenamesURLAPI(config={}, session=None): images.sort() return images + def undoHTMLEntities(text=''): """ Undo some HTML codes """ - - text = re.sub('<', '<', text) # i guess only < > & " ' need conversion http://www.w3schools.com/html/html_entities.asp + + # i guess only < > & " ' need conversion + # http://www.w3schools.com/html/html_entities.asp + text = re.sub('<', '<', text) text = re.sub('>', '>', text) text = re.sub('&', '&', text) text = re.sub('"', '"', text) text = re.sub(''', '\'', text) - + return text + def generateImageDump(config={}, other={}, images=[], start='', session=None): """ Save files and descriptions using a file list """ - - #fix use subdirectories md5 + + # fix use subdirectories md5 print 'Retrieving images from "%s"' % (start and start or 'start') imagepath = '%s/images' % (config['path']) if not 
os.path.isdir(imagepath): print 'Creating "%s" directory' % (imagepath) os.makedirs(imagepath) - + c = 0 lock = True if not start: lock = False for filename, url, uploader in images: - if filename == start: #start downloading from start (included) + if filename == start: # start downloading from start (included) lock = False if lock: continue delay(config=config, session=session) - - #saving file - #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max) + + # saving file + # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash + # limit). Later .desc is added to filename, so better 100 as max) filename2 = urllib.unquote(filename) if len(filename2) > other['filenamelimit']: # split last . (extension) and then merge @@ -695,11 +814,13 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): r = requests.get(url=url) imagefile.write(r.content) imagefile.close() - #saving description if any - xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility + # saving description if any + xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % ( + filename), session=session) # use Image: for backwards compatibility f = open('%s/%s.desc' % (imagepath, filename2), 'w') - if not re.search(r'', xmlfiledesc): #Banner featuring SG1, SGA, SGU teams - #failure when retrieving desc? then save it as empty .desc + # Banner featuring SG1, SGA, SGU teams + if not re.search(r'', xmlfiledesc): + # failure when retrieving desc? then save it as empty .desc xmlfiledesc = '' f.write(xmlfiledesc.encode('utf-8')) f.close() @@ -707,12 +828,13 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): c += 1 if c % 10 == 0: print ' Downloaded %d images' % (c) - + print 'Downloaded %d images' % (c) - + + def saveLogs(config={}, session=None): """ Save Special:Log """ - #get all logs from Special:Log + # get all logs from Special:Log """parse