From 421df6d4bb3d8bdde9a864edbd0dbb709ebdce33 Mon Sep 17 00:00:00 2001 From: Hydriz Scholz Date: Fri, 4 Jul 2014 22:16:13 +0800 Subject: [PATCH 1/2] Spliting rewrite files into a separate branch --- rewrite/README.md | 4 - rewrite/dumpgenerator.py | 1292 -------------------------------------- 2 files changed, 1296 deletions(-) delete mode 100644 rewrite/README.md delete mode 100644 rewrite/dumpgenerator.py diff --git a/rewrite/README.md b/rewrite/README.md deleted file mode 100644 index ff4a469..0000000 --- a/rewrite/README.md +++ /dev/null @@ -1,4 +0,0 @@ -## WikiTeam dumpgenerator.py rewrite -This is the rewrite of WikiTeam's dumpgenerator.py. It is aimed towards getting native API support when downloading wikis and to avoid the use of screen scraping when doing so (which is quite hacky and not ideal). - -Note: THIS IS NOT A RELEASE YET, patches welcome. diff --git a/rewrite/dumpgenerator.py b/rewrite/dumpgenerator.py deleted file mode 100644 index ef3ba66..0000000 --- a/rewrite/dumpgenerator.py +++ /dev/null @@ -1,1292 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (C) 2013 Hydriz Scholz -# Copyright (C) 2014 WikiTeam -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit -# - -####################################################################### -# dumpgenerator.py is a script to generate backups of MediaWiki wikis # -# To learn more, read the documentation: # -# http://code.google.com/p/wikiteam/wiki/NewTutorial # -####################################################################### - -# For developers: -# * All functions and classes are displayed in alphabetical order for easier accessibility. -# * Script exit codes reference: -# * 0 - Script ran well without problems -# * 1 - Script failed due to user's incorrect use -# * 2 - Script failed due to destination server issue -# * For testing purposes, add the --debug parameter and edit DumpGenerator.debug() accordingly. - -###### -# TODO LIST -# 0. Download index.html and Special:Version.html -# 1. Index.php support. -# 2. Special:Log pages support -# 3. GUI (Question and Answer if no parameters are given) -# 4. Resuming of dump -# 5. Place the images in various folders so as to avoid hitting the limit of number of files in a directory -# 6. Speed up the script. A run with --xml --images on test.wikidata.org came up with 9 min 23 sec on 2.0 and 3 min 58 sec on 1.0 - -# WHAT IS WORKING -# 1. XML dumping -# 2. Complete dumping using API (except for --logs) -# 3. Automatic updating -# 4. Dumping of XML based on a list of titles -# 5. 
Integrity check for XML dump - -import datetime -import getopt -import hashlib -import json -import os -import re -import shutil -import sys -import time -import urllib -import urllib2 -import xml.etree.ElementTree as ElementTree - -class DumpGenerator: - """ - The main class that powers and operates everything else - """ - def __init__(self): - """ - Main constructor class for DumpGenerator, registers important variables too. - """ - self.Version = "2.0" - self.revision = "1" - # Provide a cool user-agent to hide the fact that this is a script - self.UserAgent = "Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0" - self.useAPI = False - self.useIndex = False - self.prefix = "" - self.domain = "" - self.tasklist = [] - self.configfile = "config.json" - self.configoptions = { - "date": "", - "useAPI": False, - "useIndex": False, - "urltoapi": "", - "urltoindex": "", - "images": False, - "logs": False, - "xml": False, - "curonly": False, - "exnamespaces": "", - "titlesonly": False - } - - # Basic metadata - self.date = datetime.datetime.now().strftime('%Y%m%d') - - # Important URLs - self.urltoapi = "" - self.urltoindex = "" - - # Type of dump to generate - self.images = False - self.logs = False - self.xml = False - - # Resuming of previous dump - self.resume = False - self.path = "" - - # Additional information for XML - self.curonly = False - self.exnamespaces = "" - self.titlesonly = False - self.titles = "" - - # Others - self.cookies = "" - self.delay = 0 - self.debugmode = False - self.nolog = False - self.autonomous = False - - # Short options: string (no commas), long options: array - # More information about these options are at self.help() - self.shortoptions = "hv" - self.longoptions = [ "help", "api=", "index=", "curonly", "images", "logs", "xml", "auto", "delay=", "cookies=", "exnamespaces=", "resume", "path=", "debug", "nolog", "titlesonly", "titles=" ] - - def bye(self): - """ - Bid farewell to the user at the very end of the script when everything - has been successful. - - Returns: Goodbye message. - """ - message = """---> Congratulations! Your dump is complete <--- -If you have suggestions, file a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list -If this is a public wiki, do consider publishing this dump so others can benefit from it. Follow the steps as explained in http://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump or contact us at http://code.google.com/p/wikiteam. -Thank you for using DumpGenerator %s by WikiTeam, good bye!""" % ( self.Version ) - return message - - def checkAPI(self): - """ - Checks the validity of the api.php. - """ - query = { - "meta": "siteinfo", - "siprop": "general" } - sitestats = json.loads( RequestAPI.query( query ) ) - try: - if ( sitestats[ "query" ][ "general" ][ "server" ] in self.urltoapi ): - return True - except: - try: - if ( sitestats[ "error" ][ "code" ] == "readapidenied" ) and ( self.cookies == "" ): - Output.warn( "The wiki is private and we do not have proper authentication information!" ) - return False - except: - Output.warn( "This api.php seems weird or is not valid." ) - return False - - def checkIndex(self): - """ - Checks the validity of the index.php. - """ - # TODO: Screen scraping is involved here, need backward compact for older version of MediaWiki. 
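# Aside (sketch only, not part of this patch): for comparison with the
# screen-scraping fallback below, the api.php check above boils down to a
# single siteinfo query. A minimal standalone version, using the urllib,
# urllib2 and json modules this script already imports; the function name
# and User-Agent string are made up for illustration.
def isApiWorking( urltoapi ):
    """ Return True if urltoapi answers a siteinfo query with its own server URL. """
    params = urllib.urlencode( { "action": "query", "meta": "siteinfo", "siprop": "general", "format": "json" } )
    headers = { "User-Agent": "DumpGenerator sketch" }
    request = urllib2.Request( "%s?%s" % ( urltoapi, params ), headers=headers )
    data = json.loads( urllib2.urlopen( request ).read() )
    general = data.get( "query", {} ).get( "general", {} )
    # Mirror the check above: the reported server URL should be part of the api.php URL
    return ( "server" in general ) and ( general[ "server" ] in urltoapi )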
- parameters = { "title": "Special:Version" } - request = RequestIndex.query( parameters ) - # Since we are at Special:Version, we should not be getting Special:BadTitle unless we are not logged in - if ( re.search( r'(Special:Badtitle)', request ) ) and ( self.cookies == "" ): - Output.error( "The wiki is private and we do not have proper authentication information!" ) - sys.exit(1) - - # Check for some tags within the Special:Version page, must be language-independent - if ( re.search( r'(
|meta name="generator" content="MediaWiki)', request ) ): - return True - - def debug(self): - """ - A temporary debug mode for testing purposes. - REMOVE WHEN COMPLETE! - """ - print "DEBUG MODE ON" - print "Date: %s" % (self.date) - print "URL to api.php: %s" % (self.urltoapi) - print "URL to index.php: %s" % (self.urltoindex) - print "Current revision only: %s" % (self.curonly) - print "Image dump: %s" % (self.images) - print "Log dump: %s" % (self.logs) - print "XML dump: %s" % (self.xml) - print "Resume: %s" % (self.resume) - print "Path for resuming: %s" % (self.path) - print "Delay: %s" % (self.delay) - print "Cookies file: %s" % (self.cookies) - print "Excluded namespaces: %s" % (self.exnamespaces) - print "Debug mode on: %s" % (self.debugmode) - self.tasklist = sorted( self.tasklist ) - for task in self.tasklist: - if ( task == "axml" ): - DumpXML.run() - elif ( task == "bimages" ): - DumpImages.run() - elif ( task == "clogs" ): - DumpLogs.run() - sys.exit(0) - - def downloadHtmlPages(self): - """ - Downloads the HTML pages such as the main page and Special:Version. - """ - # Download the main page - Output.message( "Downloading index.php (Main Page) as index.html." ) - query = {} - index = RequestIndex.query( query ) - index = RequestIndex.removeIP( index ) - if ( os.path.exists( "Special:Version.html" ) ): - os.remove( "index.html" ) - else: - pass - for line in index: - Output.appendToFile( "index.html", line ) - - # Download Special:Version or its respective localized version - Output.message( "Downloading Special:Version with extensions and other related info." ) - query = { "title": "Special:Version" } - SpecialVersion = RequestIndex.query( query ) - SpecialVersion = RequestIndex.removeIP( SpecialVersion ) - if ( os.path.exists( "Special:Version.html" ) ): - os.remove( "Special:Version.html" ) - else: - pass - for line in SpecialVersion: - Output.appendToFile( "Special:Version.html", line ) - - def fixHTMLEntities(self, text): - """ - Convert some HTML entities to their regular characters. - """ - text = re.sub('<', '<', text) - text = re.sub('>', '>', text) - text = re.sub('&', '&', text) - text = re.sub('"', '"', text) - text = re.sub(''', '\'', text) - return text - - def help(self): - """ - Provides vital help information to the user. This function - directly uses the "print" function because it is harmless and - what needs to be logged has already been done so. - - Returns: Help message text - """ - message = """DumpGenerator %s, a script to generate backups of MediaWiki wikis. -For more information, please see: http://code.google.com/p/wikiteam/wiki/NewTutorial - -Startup: - -h, --help Displays this help information and exits. - -v, --version Displays the version of this script, with additional credits. - -Wiki information: - --api=URL The URL to the wiki's api.php, not to be used with --index. - --index=URL The URL to the wiki's index.php, not to be used with --api. - -Options: - --xml Creates an XML dump. - --images Creates an image dump. - --logs Creates a dump of all log pages (not yet supported). - -XML dump (only if --xml is used): - --curonly Download only the current revision. - --exnamespaces The unique system number(s) for namespaces to exclude, separated by commas. - --titlesonly Download only the page titles without the actual content. - --titles Path to a file containing list of titles, requires "--END--" to be on the last line. - -Other: - --auto Enable auto pilot mode (select options that ensures that the script creates a new dump). 
- --resume Resume an incomplete dump (requires --path to be given). - --path=PATH Path to the incomplete dump. - --delay=SECONDS Adds a delay (in seconds) between requests. - --cookies=PATH Path to a Mozilla cookies.txt file for authentication cookies. - --nolog Disable logging to dumpgenerator.log (does not affect output in terminal). - -Report any issues to our issue tracker: https://code.google.com/p/wikiteam.""" % (self.Version) - return message - - def loadConfig(self): - """ - Load a config file from a partially-made dump. - """ - config = json.loads( self.configfile ) - self.date = config[ "date" ] - self.useAPI = config[ "useAPI" ] - self.useIndex = config[ "useIndex" ] - self.urltoapi = config[ "urltoapi" ] - self.urltoindex = config[ "urltoindex" ] - self.images = config[ "images" ] - self.logs = config[ "logs" ] - self.xml = config[ "xml" ] - self.curonly = config[ "curonly" ] - self.exnamespaces = config[ "exnamespaces" ] - self.titlesonly = config[ "titlesonly" ] - - if ( self.images == True ): - self.tasklist.append( "bimage" ) - if ( self.logs == True ): - self.tasklist.append( "clogs" ) - if ( self.xml == True ): - self.tasklist.append( "axml" ) - - if ( self.useAPI == True ): - domain = self.urltoapi - elif ( self.useIndex == True ): - domain = self.urltoindex - - def makePrefix(self, domain): - """ - Converts a domain to a prefix. - - Inputs: - - domain: The domain to change, may contain api.php or index.php as suffix. - - Returns: - - string with slashes and stray characters changed to underscores, suffix - removed and URL protocol removed. - """ - domain = domain.lower() - # Remove unnecessary prefixes and suffixes - domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain) - # Substitute directory slashes with underscores - domain = re.sub(r'/', '_', domain) - # Convert any stray character that is not in the alphabet to underscores - domain = re.sub(r'[^-.A-Za-z0-9]', '_', domain) - return domain - - def makeNiceURL(self, domain): - """ - Converts a domain to a more human-readable format (used for uploading). - - Inputs: - - domain: The domain to change, may contain api.php or index.php as suffix. - - Returns: - - string with suffix removed. - """ - domain = domain.lower() - # Remove the suffixes - domain = re.sub(r'(/index\.php|/api\.php)', '', domain) - return domain - - def processargs(self): - """ - Processing arguments and options provided by the user. - """ - try: - options, answers = getopt.getopt( sys.argv[1:], self.shortoptions, self.longoptions ) - except getopt.GetoptError: - Output.error( "An unknown option has been specified, please check your arguments before re-running!" 
) - sys.exit(1) - - # First accept all arguments and store them in a variable - for option, answer in options: - # Startup - if ( option in ( "-h", "--help" ) ): - # Display the help guide and exit - print self.help() - os.remove( Output.logfile ) - sys.exit(0) - elif ( option in ( "-v", "--version" ) ): - # Display the version of this script - print self.version() - os.remove( Output.logfile ) - sys.exit(0) - - # Wiki information - elif ( option in "--api" ): - self.urltoapi = answer - self.configoptions[ "urltoapi" ] = self.urltoapi - elif ( option in "--index" ): - self.urltoindex = answer - self.configoptions[ "urltoindex" ] = self.urltoindex - - # Dump options - elif ( option == "--images" ): - self.images = True - self.configoptions[ "images" ] = True - self.tasklist.append( "bimages" ) - elif ( option == "--logs" ): - self.logs = True - self.configoptions[ "logs" ] = True - self.tasklist.append( "clogs" ) - elif ( option == "--xml" ): - self.xml = True - self.configoptions[ "xml" ] = True - self.tasklist.append( "axml" ) - - # XML dump options - elif ( option == "--curonly" ): - self.curonly = True - self.configoptions[ "curonly" ] = True - elif ( option in "--exnamespaces" ): - self.exnamespaces = answer - self.configoptions[ "exnamespaces" ] = self.exnamespaces - elif ( option == "--titlesonly" ): - self.titlesonly = True - self.configoptions[ "titlesonly" ] = True - elif ( option in "--titles" ): - self.titles = os.path.abspath( answer ) - - # Other options - elif ( option == "--auto" ): - self.autonomous = True - elif ( option in "--cookies" ): - self.cookies = answer - elif ( option in "--delay" ): - self.delay = answer - elif ( option == "--nolog" ): - self.nolog = True - elif ( option in "--path" ): - self.path = answer - elif ( option == "--resume" ): - self.resume = True - - # Private options (i.e. usable but not documented in --help) - elif ( option == "--debug" ): - self.debugmode = True - else: - Output.error( "An unknown option has been specified, please check your arguments before re-running!" ) - sys.exit(1) - - # Now to verify that the user is not messing around - if ( self.urltoapi == "" and self.urltoindex == "" ): - # User did not specify either --api= or --index= - if ( self.resume == True and self.path != "" ): - # ...but specified --resume and --path= accordingly - self.resumeDump() - elif ( self.resume == True and self.path == "" ): - # ...and specified --resume without --path= - Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" ) - sys.exit(1) - else: - Output.error( "You need to tell me the URL to either the api.php or to index.php!" ) - sys.exit(1) - elif ( self.resume == True ) and ( self.path == "" ): - # User specified --resume, but no --path= was given - Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" ) - sys.exit(1) - elif ( self.urltoapi != "" and self.urltoindex != "" ): - # User specified both --api= and --index= - self.useAPI = True - elif ( self.xml == False and ( self.curonly == True or self.exnamespaces != "" ) ): - # User specified --curonly and --exnamespaces without --xml - Output.error( "You did not specify to make an XML dump using --xml, so why write --curonly or --exnamespaces? Remove them before re-running!" ) - sys.exit(1) - - if ( self.urltoapi != "" ): - self.useAPI = True - elif ( self.urltoindex != "" ): - self.useIndex = True - - if ( self.useAPI == True ): - Output.message( "Checking api.php..." 
) - if not ( self.urltoapi.startswith( "http://" ) ) and not ( self.urltoapi.startswith( "https://" ) ): - Output.error( "The URL to api.php must start with either http:// or https://!" ) - sys.exit(1) - elif ( self.checkAPI() ): - Output.message( "api.php is okay" ) - else: - Output.error( "There is an error with api.php, please provide a correct path to it." ) - sys.exit(1) - elif ( self.useIndex == True ): - Output.message( "Checking index.php..." ) - if not ( self.urltoindex.startswith( "http://" ) ) and not ( self.urltoindex.startswith( "https://" ) ): - Output.error( "The URL to index.php must start with either http:// or https://!" ) - sys.exit(1) - elif ( self.checkIndex() ): - Output.message( "index.php is okay" ) - else: - Output.error( "There is an error with index.php, please provide a correct path to it." ) - sys.exit(1) - - def resumeDump(self): - """ - Resume an incomplete dump defined in self.path. - """ - # TODO: Add support for resuming dumps. - os.chdir( self.path ) - self.loadConfig() - self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date ) - self.domain = self.makeNiceURL( domain ) - if ( self.useAPI == True ): - self.urltoindex = "%s/index.php" % ( self.domain ) - self.tasklist = sorted( self.tasklist ) - for task in self.tasklist: - if ( task == "axml" ): - DumpXML.run() - elif ( task == "bimages" ): - DumpImages.run() - elif ( task == "clogs" ): - DumpLogs.run() - - def run(self): - """ - Run the whole script itself and excute important functions. - """ - print self.welcome() - Updater.checkRevision() - # Check if previously there was a log file in the working directory and remove it if exists - # This is followed by the equivalent of "touch" in Unix to create an empty file - if ( os.path.exists( Output.logfile ) ): - os.remove( Output.logfile ) - open( Output.logfile, "a" ).close() - else: - open( Output.logfile, "a" ).close() - self.processargs() - if ( DumpGenerator.nolog or DumpGenerator.debugmode): - # Remove the dumpgenerator.log file - os.remove( Output.logfile ) - if ( self.useAPI == True ): - domain = self.urltoapi - elif ( self.useIndex == True ): - domain = self.urltoindex - directories = os.walk( "." ).next()[1] - for directory in directories: - # Check if there is a dump that already exists in the current working directory - if ( directory.startswith( self.makePrefix( domain ) ) and directory.endswith( "-wikidump" ) ): - print "" # Create a blank line - Output.warn( "There seems to be a similar dump at %s which might be incomplete." % ( directory ) ) - if ( self.autonomous == True ): - Output.message( "Since auto pilot mode is enabled, that dump will not be resumed." ) - self.resume = False - else: - Output.warn( "Do you wish to resume using configuration from that dump? [yes, y], [no, n]" ) - reply = "" - while reply.lower() not in [ "yes", "y", "no", "n" ]: - reply = raw_input( "Answer: " ) - if ( reply.lower() in [ "yes", "y" ] ): - if not ( os.path.isfile( "%s/%s" % ( directory, self.configfile ) ) ): - Output.error( "I cannot find a %s in the directory! Please delete that directory before re-running!" % ( self.configfile ) ) - sys.exit(1) - else: - Output.warn( "Resuming dump and ignoring configuration given in this session..." ) - self.resume = True - self.path = directory - break - elif ( reply.lower() in [ "no", "n" ] ): - Output.message( "Not resuming..." 
) - self.resume = False - else: - continue - if ( self.resume == True ): - self.resumeDump() - else: - self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date ) - self.domain = self.makeNiceURL( domain ) - workingdir = "%s-wikidump" % ( self.prefix ) - if ( os.path.exists( workingdir ) ): - if ( self.autonomous == True ): - Output.message( "Since auto pilot mode is enabled, the directory with the same name will be deleted." ) - reply = "yes" - else: - Output.warn( "\nThere seems to be a directory with the same name, delete the old one? [yes, y], [no, n]" ) - reply = "" - while reply.lower() not in [ "yes", "y", "no", "n" ]: - reply = raw_input( "Answer: " ) - if ( reply.lower() in [ "yes", "y" ] ): - try: - shutil.rmtree( workingdir ) - except: - Output.error( "There was a problem deleting the directory, please manually delete it before re-running!" ) - sys.exit(1) - print "" # Create a blank line - elif ( reply.lower() in [ "no", "n" ] ): - Output.error( "Existing directory exists, either delete that directory or rename it before re-running!" ) - sys.exit(1) - else: - pass - Output.message( "Generating a new dump into a new directory..." ) - os.mkdir( workingdir ) - os.rename( Output.logfile, "%s/%s" % ( workingdir, Output.logfile ) ) - os.chdir( workingdir ) - self.saveConfig() - # Guess the URL to index.php - if ( self.useAPI == True ): - self.urltoindex = "%s/index.php" % ( self.domain ) - if ( self.debugmode == True ): - self.debug() - else: - # Run every single task that we are assigned to do in order: xml, images, logs - # The "a", "b" and "c" prefix is just to force the order. - self.tasklist = sorted( self.tasklist ) - if ( self.tasklist == [] ): - Output.error( "You did not tell me what dump to create!" ) - else: - for task in self.tasklist: - if ( task == "axml" ): - DumpXML.run() - elif ( task == "bimages" ): - DumpImages.run() - elif ( task == "clogs" ): - DumpLogs.run() - self.downloadHtmlPages() - print self.bye() - - def saveConfig(self): - """ - Save the configuration settings provided. - """ - self.configoptions[ "date" ] = self.date - output = open( self.configfile, "w" ) - json.dump( self.configoptions, output, indent=4 ) - - def version(self): - """ - Displays the version information and credits of the script. - - Returns: Version information and credits - """ - message = """DumpGenerator %s by WikiTeam - -Copyright (C) 2013 Hydriz Scholz -Copyright (C) 2014 WikiTeam - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program. If not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit - -""" % (self.Version) - return message - - def welcome(self): - """ - Welcomes the user at the very beginning of the script running process. - - Returns: Welcome message. - """ - message = """########## Welcome to DumpGenerator %s by WikiTeam ##########\n""" % (self.Version) - return message - -class DumpImages: - """ - The class for generating an image dump. 
- """ - def __init__(self): - """ - The constructor function. - """ - self.files = [] - - def dumpImages(self): - """ - Download all the images on the wiki with their corresponding XML. - """ - if ( DumpGenerator.useAPI == True ): - self.getFileListAPI() - else: - self.getFileListIndex() - filecount = 0 - if ( self.files == [] ): - pass - else: - Output.message( "Downloading files and their descriptions into \"images\" directory..." ) - for media in self.files: - time.sleep( DumpGenerator.delay ) # Delay between requests - urllib.urlretrieve( media[ "url" ], "images/%s" % (media[ "name" ] ) ) - title = DumpGenerator.fixHTMLEntities( media[ "title" ].encode( "utf-8" ) ) - contentsfile = DumpXML.getXMLPage( title, siteinfo=True ) - destfile = "images/%s.xml" % ( media[ "name" ] ) - shutil.move( contentsfile, destfile ) - Output.appendToFile( destfile, "\n" ) - filecount += 1 - if ( filecount % 10 == 0 ): - # Give the user a regular status report so that it does not look stuck - Output.message( " Downloaded %d files." % ( filecount ) ) - if ( filecount == 1 ): - Output.message( "Downloaded 1 file." % ( filecount ) ) - else: - Output.message( "Downloaded %d files." % ( filecount ) ) - - def getFileListAPI(self): - """ - Download the list of files on the wiki via the API. - """ - files = [] - dumpfile = "%s-images.txt" % ( DumpGenerator.prefix ) - filecount = 0 - Output.message( "Getting list of files on the wiki..." ) - aifrom = "!" # Very first page of a wiki - while aifrom: - sys.stderr.write('.') # Tell the user that downloading is in progress - query = { - "list": "allimages", - "aifrom": aifrom, - "ailimit": 500 } # The default limit for anonymous users of the API is 500 pages per request - time.sleep( DumpGenerator.delay ) # Delay between requests - filesmeta = json.loads( RequestAPI.query( query ) ) - # Store what the server tells us to continue from - try: - serveraifrom = filesmeta[ "query-continue" ][ "allimages" ][ "aicontinue" ] - aifrom = DumpGenerator.fixHTMLEntities( serveraifrom ) - except: - # Reached the end of having to keep continuing, exit the while condition - aifrom = "" - # TODO: On a wiki with a lot of files, this can cause huge memory problems - files.extend( filesmeta[ "query" ][ "allimages" ] ) - for media in filesmeta[ "query" ][ "allimages" ]: - outputline = "%s\t%s\n" % ( media[ "title" ], media[ "url" ] ) - Output.appendToFile( dumpfile, outputline ) - # Add to namespace page count - filecount += len( files ) - Output.appendToFile( dumpfile, "--END--" ) - if ( filecount == 1 ): - Output.message( " Got 1 file" ) - else: - Output.message( " Got %d files" % ( filecount ) ) - - if ( filecount == 0 ): - Output.warn( "There are no files on the wiki to download!" ) - else: - Output.message( "File names and URLs saved at %s." % ( dumpfile ) ) - self.files = files - - def getFileListIndex(self): - """ - Download the list of files on the wiki via index.php. - """ - # TODO: Add code here - - def run(self): - """ - Execute the process of producing an image dump. - """ - if ( os.path.isdir( "images" ) ): - time.sleep(0) - else: - os.mkdir( "images" ) - self.dumpImages() - -class DumpLogs: - """ - The class for generating a log pages dump (pages in Special:Log). - """ - def __init__(self): - """ - The constructor function. - """ - - def run(self): - """ - Execute the process of producing a log pages dump. - """ - # TODO: Support downloading of log pages - Output.warn( "Sorry, downloading of log pages are not yet supported!" 
) - -class DumpXML: - """ - The class for generating an XML dump. - """ - def __init__(self): - """ - The constructor function. - """ - self.lennamespaces = 0 - self.namespaces = {} - self.pagetitles = [] - self.titlesdumpfile = "" - self.dumpretrycount = 0 - - def dumpPageTitlesAPI(self): - """ - Get a list of page titles and outputs it to a file. - """ - self.getNamespacesAPI() - self.getPageTitlesAPI() - Output.message( "Saving list of page titles..." ) - Output.appendToFile( self.titlesdumpfile, "--END--" ) - Output.message( "List of page titles saved at %s." % ( self.titlesdumpfile ) ) - - def dumpXML(self): - """ - Get the whole wiki in an XML file. - """ - Output.message( "Downloading the XML of every page..." ) - if ( DumpGenerator.curonly == True ): - dumpfile = "%s-curonly.xml" % ( DumpGenerator.prefix ) - else: - dumpfile = "%s-history.xml" % ( DumpGenerator.prefix ) - pagecount = 0 - # To reduce memory usage, we are storing the title into memory only when we need it - for title in file( self.titlesdumpfile, "r" ).read().splitlines(): - pagecount += 1 - numberofedits = 0 - # Add the initial siteinfo and header tags for the first page - if ( pagecount == 1 ): - contentsfile = self.getXMLPage( title, siteinfo=True ) - contents = file( contentsfile, "r" ).readlines() - open( dumpfile, "a" ).close() # "touch" the file - os.remove( contentsfile ) - elif ( title == "--END--" ): - contents = [ "\n" ] - else: - contentsfile = self.getXMLPage( title ) - contents = file( contentsfile, "r" ).readlines() - os.remove( contentsfile ) - - for content in contents: - # Count the number of occurrences of "" to determine number of revisions - if ( "" in content ): - numberofedits += 1 - Output.appendToFile( dumpfile, content ) - if ( title == "--END--" ): - pass - else: - if ( numberofedits == 1 ): - Output.message( " %s, 1 edit" % ( title ) ) - else: - Output.message( " %s, %s edits" % ( title, numberofedits ) ) - if ( pagecount % 10 == 0 ): - Output.message( "Downloaded %d pages" % ( pagecount ) ) - Output.message( "XML dump saved at %s." % ( dumpfile ) ) - self.integrityCheck( dumpfile ) - - def getNamespacesAPI(self): - """ - Download the list of namespaces with their names and IDs - via the API. - """ - query = { - "meta": "siteinfo", - "siprop": "namespaces" } - namespacedetails = json.loads( RequestAPI.query( query ) ) - namespacenums = namespacedetails[ "query" ][ "namespaces" ].keys() - # Remove the system namespaces ("Media" and "Special") - namespacenums.remove( "-2" ) - namespacenums.remove( "-1" ) - namespaces = {} - for namespacenum in namespacenums: - namespacename = namespacedetails[ "query" ][ "namespaces" ][ namespacenum ][ "*" ] - namespaces[ namespacenum ] = namespacename - self.lennamespaces = len( list( namespacenums ) ) - Output.message( "%d namespaces found." % ( self.lennamespaces ) ) - self.namespaces = namespaces - - def getPageTitlesAPI(self): - """ - Grab a list of page titles in each namespace via the API. - - There are leading spaces in the outputs so as to make things neater on the terminal. - """ - titles = [] - self.titlesdumpfile = "%s-titles.txt" % ( DumpGenerator.prefix ) - totalpagecount = 0 - for namespace in self.namespaces: - if namespace in DumpGenerator.exnamespaces: - Output.warn( " Skipping namespace %s" % (namespace) ) - else: - pagecount = 0 - Output.message( " Getting titles in namespace %s" % (namespace) ) - apfrom = "!" # Very first page of a wiki - while apfrom: - sys.stderr.write( "." 
) # Tell the user that downloading is in progress - query = { - "list": "allpages", - "apnamespace": namespace, - "apfrom": apfrom, - "aplimit": 500 } # The default limit for anonymous users of the API is 500 pages per request - time.sleep( DumpGenerator.delay ) # Delay between requests - pagetitles = json.loads( RequestAPI.query( query ) ) - # Store what the server tells us to continue from - try: - serverapfrom = pagetitles[ "query-continue" ][ "allpages" ][ "apcontinue" ] - apfrom = DumpGenerator.fixHTMLEntities( serverapfrom ) - except: - try: - serverapfrom = pagetitles[ "query-continue" ][ "allpages" ][ "apfrom" ] - apfrom = DumpGenerator.fixHTMLEntities( serverapfrom ) - except: - # Reached the end of having to keep continuing, exit the while condition - apfrom = "" - pages = pagetitles[ "query" ][ "allpages" ] - # Add to namespace page count - pagecount += len( pages ) - for page in pages: - title = "%s\n" % ( page[ "title" ] ) - Output.appendToFile( self.titlesdumpfile, title ) - if ( pagecount == 1 ): - Output.message( " Got 1 page title in namespace %s" % ( namespace ) ) - else: - Output.message( " Got %d page titles in namespace %s" % ( pagecount, namespace ) ) - # Add to total page count - totalpagecount += pagecount - if ( totalpagecount == 1 ): - Output.message( "Got 1 page title in total." % ( totalpagecount ) ) - else: - Output.message( "Got %d page titles in total." % ( totalpagecount ) ) - - def getXMLPage(self, page, siteinfo=False): - """ - Get the XML of one page. - - Input: - - page: The title of the page to download. - - siteinfo: Whether to include the siteinfo header in the XML. - """ - parameters = { - "title": "Special:Export", - "pages": page, - "action": "submit" } - if ( DumpGenerator.curonly == True ): - parameters[ "curonly" ] = 1 - parameters[ "limit" ] = 1 - else: - # Make the wiki download the actual full history - parameters["history"] = "1" - # TODO: Can cause memory problems if the page has a huge history - result = RequestIndex.query( parameters ) - pagehash = hashlib.sha256( page ).hexdigest()[:8] - tempfile = "%s.xml.tmp" % ( pagehash ) - tempfile2 = "%s.xml" % ( pagehash ) - Output.appendToFile( tempfile, result ) - result = "" # Free up memory - # Warning: The following is NOT compatible with MediaWiki XML Schema Description version 0.3 and below! - # See http://wikiteam.googlecode.com/svn/trunk/schema/README.md for more information about MediaWiki versions - # this will affect and ways to overcome it. - if ( siteinfo == False ): - linecount = 0 - # The 11 comes from lines like , "special" namespaces and the very first line - # TODO: Hacky way of removing the siteinfo, check for backward compatibility! - linestoskip = 11 + self.lennamespaces - for line in open( tempfile, "r" ).read().splitlines(): - linecount += 1 - if linecount > linestoskip: - if ( "" in line ): - pass - else: - line = "%s\n" % ( line ) - Output.appendToFile( tempfile2, line ) - else: - continue - else: - for line in open( tempfile, "r" ).read().splitlines(): - if ( "" in line ): - pass - else: - line = "%s\n" % ( line ) - Output.appendToFile( tempfile2, line ) - os.remove( tempfile ) - return tempfile2 - - def integrityCheck(self, dumpfile): - """ - Checks the integrity of the XML dump and ensures that it is not corrupted. - """ - Output.message( "Checking the integrity of the XML dump..." 
) - checktitles = 0 - checkpageopen = 0 - checkpageclose = 0 - checkrevisionopen = 0 - checkrevisionclose = 0 - # Check the number of instances of the following tags - # By logic they should be the same number - for line in file( dumpfile, "r" ).read().splitlines(): - if "" in line: - checktitles += 1 - elif "<page>" in line: - checkpageopen += 1 - elif "</page>" in line: - checkpageclose += 1 - elif "<revision>" in line: - checkrevisionopen += 1 - elif "</revision>" in line: - checkrevisionclose += 1 - else: - continue - - if ( checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose ): - Output.message( "Excellent, the XML dump is not corrupted." ) - else: - Output.warn( "WARNING: XML dump seems to be corrupted." ) - if ( DumpGenerator.autonomous == True ): - reply = "yes" - else: - reply = "" - while reply.lower() not in [ "yes", "y", "no", "n" ]: - reply = raw_input( 'Regenerate a new dump ([yes, y], [no, n])? ' ) - if reply.lower() in [ "yes", "y" ]: - self.dumpretrycount += 1 - if ( self.dumpretrycount < 3 ): - Output.warn( "Generating a new dump..." ) - os.remove( dumpfile ) - self.dumpXML() - else: - Output.warn( "We have tried dumping the wiki 3 times, but the dump is still corrupted. Not going to carry on since it is probably a problem on the wiki." ) - # Encourage the user to tell us about this faulty wiki - print "Please tell us about this by reporting an issue here: https://code.google.com/p/wikiteam/issues/list. Thank you!" - print "Giving you a little time to see this message..." - time.sleep(3) # Give time for the user to see the message - elif reply.lower() in [ "no", "n" ]: - Output.warn( "Not generating a new dump. Note: Your dump is corrupted and might not work with MediaWiki!" ) - - def run(self): - """ - Execute the process of producing an XML dump. - """ - if ( DumpGenerator.useAPI == True ): - if ( DumpGenerator.titlesonly == True ): - self.dumpPageTitlesAPI() - else: - if ( DumpGenerator.titles != "" ): - Output.message( "Using the list of page titles provided at %s." % ( DumpGenerator.titles ) ) - self.titlesdumpfile = DumpGenerator.titles - else: - self.dumpPageTitlesAPI() - self.dumpXML() - else: - if ( DumpGenerator.titlesonly == True ): - self.dumpPageTitlesIndex() - else: - if ( DumpGenerator.titles != "" ): - self.titlesdumpfile = DumpGenerator.titles - else: - self.dumpPageTitlesIndex() - self.dumpXML() - -class Output: - """ - The class to output anything to the user or to a place not within the script. - - For doing outputs to user: - This is used instead of directly using the "print" function is because - this is intended to log everything that is told to the user, so that it - is possible to check when and where things went wrong. - - For doing outputs to elsewhere: - This is to reduce memory usage by storing large chunks of data into disk - and reducing the risk of getting a MemoryError. - """ - def __init__(self): - self.logfile = "dumpgenerator.log" - - # Output to disk - def appendToFile(self, outputfile, contents): - """ - Output contents to file. - - Inputs: - - outputfile: The file to output to. - - contents: The content to add for each line. 
- """ - if ( os.path.exists( outputfile ) == False ): - open( outputfile, "a" ).close() # "touch" the file - else: - pass - thefile = open( outputfile, "a" ) - try: - contents = contents.encode( "utf-8", "ignore" ) - # TODO: During a test phase, this error kept coming up, though the final output was no different from - # what was produced using dumpBackup.php and using Special:Export itself. - except UnicodeDecodeError: - pass - thefile.write( contents ) - thefile.close() - - # Output to user - def error(self, message): - print message - print "Write --help for more information." - self.log( "An error occurred: %s" % (message) ) - - def log(self, message): - if ( DumpGenerator.nolog or DumpGenerator.debugmode): - # Skip logging - time.sleep(0) - else: - timestamp = datetime.datetime.fromtimestamp( time.time() ).strftime( "%Y-%m-%d %H:%M:%S" ) - logline = "%s: %s\n" % (timestamp, message) - self.appendToFile( self.logfile, logline ) - - def message(self, message): - print message - self.log( "Told the user: %s" % (message) ) - - def warn(self, message): - print message - self.log( "Warned the user: %s" % (message) ) - -class RequestAPI: - """ - The RequestAPI class, to submit APi request calls to the server. - """ - def __init__(self): - """ - The constructor function. - """ - - def query(self, params, url=""): - """ - The function to send an API call to the server given in the "url" - parameter using the parameters found in params. If url is empty, - DumpGenerator.urltoapi is used instead. - - Note: This function will assume action=query, other functions provides - the other query forms, but not this one. - - Input: - - params: Parameters to API call as an array (excluding action=query and format=json) - - Returns - - Result of API call in JSON format. - """ - if ( url == "" ): - url = DumpGenerator.urltoapi - else: - url = url - queryurl = "%s?action=query&format=json" % ( url ) - headers = { "User-Agent": DumpGenerator.UserAgent } - # Convert the array to a proper URL - paras = urllib.urlencode( params ) - # POST the parameters to the server - request = urllib2.Request( queryurl, paras, headers ) - try: - result = urllib2.urlopen( request ) - except: - try: - # Add a little delay between requests if server is slow - sleeptime = DumpGenerator.delay + 10 - Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) ) - time.sleep( sleeptime ) - result = urllib2.urlopen( request ) - except: - Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." ) - sys.exit(2) - output = result.read() - result.close() - return output - -class RequestIndex: - def __init__(self): - """ - The constructor function. - """ - - def query(self, params, url=""): - """ - The function to send an request to the server given in the "url" - parameter using the parameters found in params. If url is empty, - DumpGenerator.urltoindex is used instead. - - Input: - - params: Parameters to the request to send, appended to url as - a GET request. - - Returns - - Result of GET request. 
- """ - if ( url == "" ): - url = DumpGenerator.urltoindex - else: - url = url - headers = { "User-Agent": DumpGenerator.UserAgent } - paras = urllib.urlencode( params ) - # index.php does not support POST request, formulating a correct GET URL here - queryurl = "%s?%s" % ( url, paras ) - request = urllib2.Request( queryurl, headers=headers ) - # TODO: Make urlopen follow redirects - try: - result = urllib2.urlopen( request ) - except: - try: - # Add a little delay between requests if server is slow - sleeptime = DumpGenerator.delay + 10 - Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) ) - time.sleep( sleeptime ) - result = urllib2.urlopen( request ) - except: - Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." ) - sys.exit(2) - output = result.read() - result.close() - return output - - def removeIP(self, content): - """ - Remove the user's IP address while fetching HTML pages. - """ - # Remove IPv4 addresses - content = re.sub( r"\d+\.\d+\.\d+\.\d+", "0.0.0.0", content ) - # Remove IPv6 addresses - content = re.sub( r"(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}", "0:0:0:0:0:0:0:0", content ) - return content - -class Updater: - """ - The class to auto-update the user's script to the latest version of DumpGenerator. - """ - # TODO: Get the script to check only occasionally, this is a performance concern - def __init__(self): - """ - The constructor function. - """ - self.controlUrl = "http://wikiteam.googlecode.com/svn/trunk/revnum.json" - self.controlUrl2 = "https://raw.github.com/dumps/DumpGenerator/master/revnum.json" - self.result = {} - - def checkRevision(self): - """ - Check the current revision and ensure that it is up-to-date. - """ - jsonresult = self.getRevisionJson() - if ( jsonresult == False ): - pass - else: - result = json.loads( jsonresult ) - self.result = result - if ( result[ "latest" ] == DumpGenerator.Version ): - if ( result[ "releases" ][ DumpGenerator.Version ][ "revision" ] == DumpGenerator.revision ): - pass - else: - self.update() - else: - self.update() - - def getRevisionJson(self): - """ - Download the controlling JSON file. - """ - headers = {'User-Agent': DumpGenerator.UserAgent} - skip = False - # TODO: Handle 404 errors - try: - revjson = urllib2.urlopen( urllib2.Request( self.controlUrl, headers=headers ) ) - except: - try: - revjson = urllib2.urlopen( urllib2.Request( self.controlUrl2, headers=headers ) ) - except: - Output.warn( "Unable to check if a new version of dumpgenerator.py is available, continuing..." ) - skip = True - if ( skip == False ): - output = revjson.read() - revjson.close() - return output - else: - return False - - def update(self): - """ - Update DumpGenerator.py to the current latest version - """ - currentfile = sys.argv[0] - latestver = self.result[ "latest" ] - latestrev = self.result[ "releases" ][ latestver ][ "revision" ] - latesturl = self.result[ "releases" ][ latestver ][ "downloadurl" ] - latesturl2 = self.result[ "releases" ][ latestver ][ "downloadurl2" ] - updated = True - # TODO: Handle 404 errors - try: - urllib.urlretrieve( latesturl, currentfile ) - except: - try: - urllib.urlretrieve( latesturl2, currentfile ) - except: - updated = False - if ( updated == False ): - Output.warn( "Unable to update DumpGenerator, skipping update for now..." ) - else: - Output.message( "DumpGenerator was updated to %s (revision %s)! 
Changes will take effect on next run." % ( latestver, latestrev ) ) - -if __name__ == "__main__": - # Class registry, for use throughout the whole script - RequestAPI = RequestAPI() - RequestIndex = RequestIndex() - DumpGenerator = DumpGenerator() - DumpImages = DumpImages() - DumpLogs = DumpLogs() - DumpXML = DumpXML() - Output = Output() - Updater = Updater() - - # Start everything up - DumpGenerator.run() From 3929e4eb9c55da11e4f589b6fa0cc17c8a550274 Mon Sep 17 00:00:00 2001 From: balr0g <balrog032@gmail.com> Date: Thu, 3 Jul 2014 14:23:21 -0400 Subject: [PATCH 2/2] Cleanups and error fixes suggested by flake8 (pep8 + pyflakes) --- dumpgenerator.py | 809 +++++++++++++++++++++++++++++------------------ 1 file changed, 495 insertions(+), 314 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index cd6a58e..774bac9 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -7,12 +7,12 @@ # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. @@ -40,11 +40,11 @@ try: except ImportError: print "Please install or update the Requests module." sys.exit(1) -import subprocess import time import urllib -__VERSION__ = '0.2.2' #major, minor, micro +__VERSION__ = '0.2.2' # major, minor, micro + def getVersion(): return(__VERSION__) @@ -54,23 +54,28 @@ def truncateFilename(other={}, filename=''): """ Truncate filenames when downloading images with large filenames """ return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1] + def delay(config={}, session=None): """ Add a delay if configured for that """ if config['delay'] > 0: print 'Sleeping... %d seconds...' 
% (config['delay']) time.sleep(config['delay']) + def cleanHTML(raw=''): """ Extract only the real wiki content and remove rubbish """ """ This function is ONLY used to retrieve page titles and file names when no API is available """ """ DO NOT use this function to extract page content """ - #different "tags" used by different MediaWiki versions to mark where starts and ends content + # different "tags" used by different MediaWiki versions to mark where + # starts and ends content if re.search('<!-- bodytext -->', raw): raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0] elif re.search('<!-- start content -->', raw): - raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0] + raw = raw.split( + '<!-- start content -->')[1].split('<!-- end content -->')[0] elif re.search('<!-- Begin Content Area -->', raw): - raw = raw.split('<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0] + raw = raw.split( + '<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0] elif re.search('<!-- content -->', raw): raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0] elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw): @@ -81,6 +86,7 @@ def cleanHTML(raw=''): sys.exit() return raw + def handleStatusCode(response): statuscode = response.status_code if statuscode >= 200 and statuscode < 300: @@ -113,58 +119,66 @@ def handleStatusCode(response): print response.url sys.exit(1) + def getNamespacesScraper(config={}, session=None): """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """ """ Function called if no API is available """ namespaces = config['namespaces'] - namespacenames = {0:''} # main is 0, no prefix + namespacenames = {0: ''} # main is 0, no prefix if namespaces: - r = session.post(url=config['index'], data={'title': 'Special:Allpages'}) + r = session.post( + url=config['index'], data={'title': 'Special:Allpages'}) raw = r.text delay(config=config, session=session) - m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? to include selected="selected" + # [^>]*? 
to include selected="selected" + m = re.compile( + r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) if 'all' in namespaces: namespaces = [] for i in m: namespaces.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") + namespacenames[int(i.group("namespaceid"))] = i.group( + "namespacename") else: - #check if those namespaces really exist in this wiki + # check if those namespaces really exist in this wiki namespaces2 = [] for i in m: if int(i.group("namespaceid")) in namespaces: namespaces2.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") + namespacenames[int(i.group("namespaceid"))] = i.group( + "namespacename") namespaces = namespaces2 else: namespaces = [0] - - namespaces = list(set(namespaces)) #uniques + + namespaces = list(set(namespaces)) # uniques print '%d namespaces found' % (len(namespaces)) return namespaces, namespacenames - + + def getNamespacesAPI(config={}, session=None): """ Uses the API to get the list of namespaces names and ids """ namespaces = config['namespaces'] - namespacenames = {0:''} # main is 0, no prefix + namespacenames = {0: ''} # main is 0, no prefix if namespaces: - r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}) + r = session.post(url=config['api'], data={ + 'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}) result = json.loads(r.text) delay(config=config, session=session) if 'all' in namespaces: namespaces = [] for i in result['query']['namespaces'].keys(): - if int(i) < 0: # -1: Special, -2: Media, excluding + if int(i) < 0: # -1: Special, -2: Media, excluding continue namespaces.append(int(i)) namespacenames[int(i)] = result['query']['namespaces'][i]['*'] else: - #check if those namespaces really exist in this wiki + # check if those namespaces really exist in this wiki namespaces2 = [] for i in result['query']['namespaces'].keys(): - if int(i) < 0: # -1: Special, -2: Media, excluding + if int(i) < 0: # -1: Special, -2: Media, excluding continue if int(i) in namespaces: namespaces2.append(int(i)) @@ -172,41 +186,46 @@ def getNamespacesAPI(config={}, session=None): namespaces = namespaces2 else: namespaces = [0] - - namespaces = list(set(namespaces)) #uniques + + namespaces = list(set(namespaces)) # uniques print '%d namespaces found' % (len(namespaces)) return namespaces, namespacenames + def getPageTitlesAPI(config={}, session=None): """ Uses the API to get the list of page titles """ titles = [] - namespaces, namespacenames = getNamespacesAPI(config=config, session=session) + namespaces, namespacenames = getNamespacesAPI( + config=config, session=session) for namespace in namespaces: if namespace in config['exnamespaces']: print ' Skipping namespace = %d' % (namespace) continue - + c = 0 print ' Retrieving titles in the namespace %d' % (namespace) apfrom = '!' while apfrom: - sys.stderr.write('.') #progress - params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500} + sys.stderr.write('.') # progress + params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, + 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500} r = session.post(url=config['api'], data=params) handleStatusCode(r) - #FIXME Handle HTTP errors here! + # FIXME Handle HTTP errors here! 
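# Sketch only, not part of this patch: one way to address the FIXME above.
# It assumes the same `requests` session used throughout this file; the
# helper name and the retry/wait values are made up for illustration.
def postWithRetry(session, url, data, retries=3, wait=10):
    """ POST `data` to `url`, backing off and retrying on HTTP or network errors. """
    for attempt in range(1, retries + 1):
        try:
            r = session.post(url=url, data=data)
            r.raise_for_status()  # turn 4xx/5xx answers into exceptions
            return r
        except requests.exceptions.RequestException:
            print ' HTTP error, waiting %d seconds and retrying...' % (wait * attempt)
            time.sleep(wait * attempt)  # incremental wait, as getXMLPageCore already does
    raise RuntimeError('%d consecutive requests to %s failed, giving up' % (retries, url))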
jsontitles = json.loads(r.text) apfrom = '' - if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'): - if jsontitles['query-continue']['allpages'].has_key('apcontinue'): - apfrom = jsontitles['query-continue']['allpages']['apcontinue'] - elif jsontitles['query-continue']['allpages'].has_key('apfrom'): + if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']: + if 'apcontinue' in jsontitles['query-continue']['allpages']: + apfrom = jsontitles['query-continue']['allpages']['apcontinue'] + elif 'apfrom' in jsontitles['query-continue']['allpages']: apfrom = jsontitles['query-continue']['allpages']['apfrom'] - #print apfrom - #print jsontitles - titles += [page['title'] for page in jsontitles['query']['allpages']] + # print apfrom + # print jsontitles + titles += [page['title'] + for page in jsontitles['query']['allpages']] if len(titles) != len(set(titles)): - #probably we are in a loop, server returning dupe titles, stop it + # probably we are in a loop, server returning dupe titles, stop + # it print 'Probably a loop, finishing' titles = list(set(titles)) apfrom = '' @@ -215,17 +234,20 @@ def getPageTitlesAPI(config={}, session=None): print ' %d titles retrieved in the namespace %d' % (c, namespace) return titles + def getPageTitlesScraper(config={}, session=None): """ """ titles = [] - namespaces, namespacenames = getNamespacesScraper(config=config, session=session) + namespaces, namespacenames = getNamespacesScraper( + config=config, session=session) for namespace in namespaces: print ' Retrieving titles in the namespace', namespace - url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace) + url = '%s?title=Special:Allpages&namespace=%s' % ( + config['index'], namespace) r = session.get(url=url) raw = r.text raw = cleanHTML(raw) - + r_title = r'title="(?P<title>[^>]+)">' r_suballpages = '' r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">' @@ -235,177 +257,212 @@ def getPageTitlesScraper(config={}, session=None): elif re.search(r_suballpages2, raw): r_suballpages = r_suballpages2 else: - pass #perhaps no subpages - - deep = 3 # 3 is the current deep of English Wikipedia for Special:Allpages, 3 levels + pass # perhaps no subpages + + # 3 is the current deep of English Wikipedia for Special:Allpages, 3 + # levels + deep = 3 c = 0 checked_suballpages = [] rawacum = raw while r_suballpages and re.search(r_suballpages, raw) and c < deep: - #load sub-Allpages + # load sub-Allpages m = re.compile(r_suballpages).finditer(raw) for i in m: fr = i.group('from') - + if r_suballpages == r_suballpages1: to = i.group('to') name = '%s-%s' % (fr, to) - url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['index'], namespace, fr, to) #do not put urllib.quote in fr or to - elif r_suballpages == r_suballpages2: #fix, esta regexp no carga bien todas? o falla el r_title en este tipo de subpag? (wikiindex) - fr = fr.split('&namespace=')[0] #clean &namespace=\d, sometimes happens + url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % ( + config['index'], namespace, fr, to) # do not put urllib.quote in fr or to + # fix, esta regexp no carga bien todas? o falla el r_title en + # este tipo de subpag? 
(wikiindex) + elif r_suballpages == r_suballpages2: + # clean &namespace=\d, sometimes happens + fr = fr.split('&namespace=')[0] name = fr - url = '%s?title=Special:Allpages/%s&namespace=%s' % (config['index'], name, namespace) - - if not name in checked_suballpages: - checked_suballpages.append(name) #to avoid reload dupe subpages links + url = '%s?title=Special:Allpages/%s&namespace=%s' % ( + config['index'], name, namespace) + + if name not in checked_suballpages: + # to avoid reload dupe subpages links + checked_suballpages.append(name) delay(config=config, session=session) r2 = session.get(url=url) raw2 = r2.text raw2 = cleanHTML(raw2) - rawacum += raw2 #merge it after removed junk + rawacum += raw2 # merge it after removed junk print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages' delay(config=config, session=session) c += 1 - + c = 0 m = re.compile(r_title).finditer(rawacum) for i in m: t = undoHTMLEntities(text=i.group('title')) if not t.startswith('Special:'): - if not t in titles: + if t not in titles: titles.append(t) c += 1 print ' %d titles retrieved in the namespace %d' % (c, namespace) return titles + def getPageTitles(config={}, session=None): """ Get list of page titles """ - #http://en.wikipedia.org/wiki/Special:AllPages - #http://archiveteam.org/index.php?title=Special:AllPages - #http://www.wikanda.es/wiki/Especial:Todas + # http://en.wikipedia.org/wiki/Special:AllPages + # http://archiveteam.org/index.php?title=Special:AllPages + # http://www.wikanda.es/wiki/Especial:Todas print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None') print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None') - + titles = [] if config['api']: titles = getPageTitlesAPI(config=config, session=session) elif config['index']: titles = getPageTitlesScraper(config=config, session=session) - - titles = list(set(titles)) #removing dupes (e.g. in CZ appears Widget:AddThis two times (main namespace and widget namespace)) - titles.sort() #sorting - + + # removing dupes (e.g. in CZ appears Widget:AddThis two times (main + # namespace and widget namespace)) + titles = list(set(titles)) + titles.sort() # sorting + print '%d page titles loaded' % (len(titles)) return titles + def getXMLHeader(config={}, session=None): """ Retrieve a random page to extract XML headers (namespace info, etc) """ - #get the header of a random page, to attach it in the complete XML backup - #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x.... - randomtitle = 'Main_Page' #previously AMF5LKE43MNFGHKSDMRTJ - xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session) + # get the header of a random page, to attach it in the complete XML backup + # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" + # xmlns:x.... + randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ + xml = getXMLPage( + config=config, title=randomtitle, verbose=False, session=session) header = xml.split('</mediawiki>')[0] if not xml: print 'XML export on this wiki is broken, quitting.' 
sys.exit() return header + def getXMLFileDesc(config={}, title='', session=None): """ Get XML for image description page """ - config['curonly'] = 1 #tricky to get only the most recent desc + config['curonly'] = 1 # tricky to get only the most recent desc return getXMLPage(config=config, title=title, verbose=False, session=session) + def getUserAgent(): """ Return a cool user-agent to hide Python user-agent """ useragents = [ - #firefox - 'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0', + # firefox + 'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0', 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0', - ] + ] return useragents[0] + def logerror(config={}, text=''): """ Log error in file """ if text: with open('%s/errors.log' % (config['path']), 'a') as outfile: - output = u'%s: %s\n' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text) + output = u'%s: %s\n' % ( + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text) outfile.write(output.encode('utf-8')) + def getXMLPageCore(headers={}, params={}, config={}, session=None): """ """ - #returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki> - #if retrieving params['limit'] revisions fails, returns a current only version - #if all fail, returns the empty string + # returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki> + # if retrieving params['limit'] revisions fails, returns a current only version + # if all fail, returns the empty string xml = '' c = 0 - maxseconds = 100 #max seconds to wait in a single sleeping - maxretries = 5 # x retries and skip - increment = 20 #increment every retry + maxseconds = 100 # max seconds to wait in a single sleeping + maxretries = 5 # x retries and skip + increment = 20 # increment every retry while not re.search(r'</mediawiki>', xml): if c > 0 and c < maxretries: - wait = increment * c < maxseconds and increment * c or maxseconds # incremental until maxseconds + wait = increment * c < maxseconds and increment * \ + c or maxseconds # incremental until maxseconds print ' XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait) time.sleep(wait) - if params['limit'] > 1: # reducing server load requesting smallest chunks (if curonly then limit = 1 from mother function) - params['limit'] = params['limit'] / 2 # half + # reducing server load requesting smallest chunks (if curonly then + # limit = 1 from mother function) + if params['limit'] > 1: + params['limit'] = params['limit'] / 2 # half if c >= maxretries: print ' We have retried %d times' % (c) print ' MediaWiki error for "%s", network error or whatever...' % (params['pages']) # If it's not already what we tried: our last chance, preserve only the last revision... # config['curonly'] means that the whole dump is configured to save nonly the last - # params['curonly'] should mean that we've already tried this fallback, because it's set by the following if and passed to getXMLPageCore - if not config['curonly']: + # params['curonly'] should mean that we've already tried this + # fallback, because it's set by the following if and passed to + # getXMLPageCore + if not config['curonly']: print ' Trying to save only the last revision for this page...' params['curonly'] = 1 - logerror(config=config, text='Error while retrieving the full history of "%s". 
Trying to save only the last revision for this page' % (params['pages'])) + logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % ( + params['pages'])) return getXMLPageCore(headers=headers, params=params, config=config) else: print ' Saving in the errors log, and skipping...' - logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (params['pages'])) - return '' # empty xml - #FIXME HANDLE HTTP Errors HERE + logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % ( + params['pages'])) + return '' # empty xml + # FIXME HANDLE HTTP Errors HERE r = session.post(url=config['index'], data=params, headers=headers) handleStatusCode(r) xml = r.text c += 1 - + return xml + def getXMLPage(config={}, title='', verbose=True, session=None): """ Get the full history (or current only) of a page """ - #if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated - #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F - + # if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated + # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F + limit = 1000 truncated = False title_ = title title_ = re.sub(' ', '_', title_) - #do not convert & into %26, title_ = re.sub('&', '%26', title_) + # do not convert & into %26, title_ = re.sub('&', '%26', title_) params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'} if config['curonly']: params['curonly'] = 1 params['limit'] = 1 else: - params['offset'] = '1' # 1 always < 2000s + params['offset'] = '1' # 1 always < 2000s params['limit'] = limit - if config.has_key('templates') and config['templates']: #in other case, do not set params['templates'] + # in other case, do not set params['templates'] + if 'templates' in config and config['templates']: params['templates'] = 1 - + xml = getXMLPageCore(params=params, config=config, session=session) - #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available - #else, warning about Special:Export truncating large page histories + # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available + # else, warning about Special:Export truncating large page histories r_timestamp = r'<timestamp>([^<]+)</timestamp>' - if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one - while not truncated and params['offset']: #next chunk - params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML - xml2 = getXMLPageCore(params=params, config=config, session=session) - - if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>? 
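# --- Editor's note: sketch only, not part of the original patch -------------
# This hunk reflows getXMLPage(), which pages through one title's full history
# via Special:Export: each POST returns up to params['limit'] revisions, and
# the last <timestamp> in the chunk is fed back as the next 'offset' until the
# server runs out of revisions or starts repeating itself.  A minimal
# standalone version of that loop is sketched below; the index.php URL and
# title are placeholders, and the merging of chunks, curonly handling and
# error handling done by the real code are omitted.

import re
import requests

def export_history_chunks(index, title, limit=1000):
    session = requests.Session()
    params = {'title': 'Special:Export', 'pages': title,
              'action': 'submit', 'offset': '1', 'limit': limit}
    chunks = []
    while True:
        r = session.post(url=index, data=params)
        timestamps = re.findall(r'<timestamp>([^<]+)</timestamp>', r.text)
        chunks.append(r.text)
        # stop when no revisions are left, or when the wiki ignores 'offset'
        # and keeps returning the same chunk
        if not timestamps or timestamps[-1] == params['offset']:
            break
        params['offset'] = timestamps[-1]
    return chunks
# -----------------------------------------------------------------------------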
+ # search for timestamps in xml to avoid analysing empty pages like + # Special:Allpages and the random one + if not config['curonly'] and re.search(r_timestamp, xml): + while not truncated and params['offset']: # next chunk + # get the last timestamp from the acum XML + params['offset'] = re.findall(r_timestamp, xml)[-1] + xml2 = getXMLPageCore( + params=params, config=config, session=session) + + # are there more edits in this next XML chunk or no <page></page>? + if re.findall(r_timestamp, xml2): if re.findall(r_timestamp, xml2)[-1] == params['offset']: - #again the same XML, this wiki does not support params in Special:Export, offer complete XML up to X edits (usually 1000) + # again the same XML, this wiki does not support params in + # Special:Export, offer complete XML up to X edits (usually + # 1000) print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated' truncated = True break @@ -421,47 +478,53 @@ def getXMLPage(config={}, title='', verbose=True, session=None): <timestamp>2011-03-09T19:57:06Z</timestamp> <contributor> """ - #offset is OK in this wiki, merge with the previous chunk of this page history and continue - xml = xml.split('</page>')[0] + ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:])) + # offset is OK in this wiki, merge with the previous chunk + # of this page history and continue + xml = xml.split( + '</page>')[0] + ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:])) else: - params['offset'] = '' #no more edits in this page history - + params['offset'] = '' # no more edits in this page history + if verbose: numberofedits = len(re.findall(r_timestamp, xml)) if (numberofedits == 1): print ' %s, 1 edit' % (title) else: print ' %s, %d edits' % (title, numberofedits) - + return xml + def cleanXML(xml=''): """ Trim redundant info """ - #do not touch XML codification, leave AS IS + # do not touch XML codification, leave AS IS if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml): xml = xml.split('</siteinfo>\n')[1] xml = xml.split('</mediawiki>')[0] return xml + def generateXMLDump(config={}, titles=[], start='', session=None): """ Generates a XML dump for a list of titles """ - + print 'Retrieving the XML for every page from "%s"' % (start and start or 'start') header = getXMLHeader(config=config, session=session) - footer = '</mediawiki>\n' #new line at the end - xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history') + footer = '</mediawiki>\n' # new line at the end + xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), + config['date'], config['curonly'] and 'current' or 'history') xmlfile = '' lock = True if start: - #remove the last chunk of xml dump (it is probably incomplete) + # remove the last chunk of xml dump (it is probably incomplete) xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r') xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w') prev = '' c = 0 for l in xmlfile: - #removing <page>\n until end of file - if c != 0: #lock to avoid write an empty line at the begining of file - if not re.search(r'<title>%s' % (start), l): + # removing \n until end of file + # lock to avoid write an empty line at the begining of file + if c != 0: + if not re.search(r'%s' % (start), l): xmlfile2.write(prev) else: break @@ -469,22 +532,25 @@ def generateXMLDump(config={}, titles=[], start='', session=None): prev = l xmlfile.close() xmlfile2.close() - #subst 
xml with xml2 - os.remove('%s/%s' % (config['path'], xmlfilename)) #remove previous xml dump - os.rename('%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename)) #move correctly truncated dump to its real name + # subst xml with xml2 + # remove previous xml dump + os.remove('%s/%s' % (config['path'], xmlfilename)) + # move correctly truncated dump to its real name + os.rename( + '%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename)) else: - #requested complete xml dump + # requested complete xml dump lock = False xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w') xmlfile.write(header.encode('utf-8')) xmlfile.close() - + xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a') c = 1 for title in titles: if not title.strip(): continue - if title == start: #start downloading from start, included + if title == start: # start downloading from start, included lock = False if lock: continue @@ -494,78 +560,111 @@ def generateXMLDump(config={}, titles=[], start='', session=None): xml = getXMLPage(config=config, title=title, session=session) xml = cleanXML(xml=xml) if not xml: - logerror(config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)) - #here, XML is a correct chunk or - #an empty string due to a deleted page (logged in errors log) or - #an empty string due to an error while retrieving the page from server (logged in errors log) + logerror( + config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)) + # here, XML is a correct chunk or + # an empty string due to a deleted page (logged in errors log) or + # an empty string due to an error while retrieving the page from server + # (logged in errors log) xmlfile.write(xml.encode('utf-8')) c += 1 xmlfile.write(footer) xmlfile.close() print 'XML dump saved at...', xmlfilename + def saveTitles(config={}, titles=[]): """ Save title list in a file """ - titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date']) + titlesfilename = '%s-%s-titles.txt' % ( + domain2prefix(config=config), config['date']) titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w') output = u"%s\n--END--" % ('\n'.join(titles)) titlesfile.write(output.encode('utf-8')) titlesfile.close() - + print 'Titles saved at...', titlesfilename + def saveImageFilenamesURL(config={}, images=[], session=None): """ Save image list in a file, including filename, url and uploader """ - imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date']) + imagesfilename = '%s-%s-images.txt' % ( + domain2prefix(config=config), config['date']) imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w') - imagesfile.write(('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]).encode('utf-8'))) + imagesfile.write(('\n'.join(['%s\t%s\t%s' % ( + filename, url, uploader) for filename, url, uploader in images]).encode('utf-8'))) imagesfile.write('\n--END--') imagesfile.close() - + print 'Image filenames and URLs saved at...', imagesfilename + def getImageFilenamesURL(config={}, session=None): """ Retrieve file list: filename, url, uploader """ - + print 'Retrieving image filenames' - r_next = r'(?\d+)&' # (?\d+)&' images = [] - offset = '29990101000000' #january 1, 2999 + offset = '29990101000000' # january 1, 2999 limit = 5000 retries = 5 while offset: - #5000 overload some servers, but it is needed for sites like this with no next links 
http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset}) + # 5000 overload some servers, but it is needed for sites like this with + # no next links + # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= + r = session.post(url=config['index'], data={ + 'title': 'Special:Imagelist', 'limit': limit, 'offset': offset}) raw = r.text delay(config=config, session=session) - if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki + # delicate wiki + if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): if limit > 10: print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit) - limit = limit/10 + limit = limit / 10 continue - elif retries > 0: # waste retries, then exit + elif retries > 0: # waste retries, then exit retries -= 1 print 'Retrying...' continue else: print 'No more retries, exit...' break - + raw = cleanHTML(raw) - #archiveteam 1.15.1 Yahoovideo.jpg (file) - #wikanda 1.15.5 Fernandocg + # archiveteam 1.15.1 Yahoovideo.jpg (file) + # wikanda 1.15.5 Fernandocg r_images1 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' - #wikijuegos 1.9.5 http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old mediawiki version + # wikijuegos 1.9.5 + # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old + # mediawiki version r_images2 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' - #gentoowiki 1.18 18:15, 3 April 2011Asus eeepc-1201nl.png (file)37 KBYannails 1 + # gentoowiki 1.18 18:15, 3 + # April 2011Asus eeepc-1201nl.png (file)37 KBYannails 1 r_images3 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' - #http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - #(desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
+ # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
+ # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
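# --- Editor's note: sketch only, not part of the original patch -------------
# The r_images1..r_images4 patterns in this hunk each target the
# Special:Imagelist markup of a different MediaWiki generation; the if/elif
# chain just below probes them in order and iterates the first one that
# matches.  A condensed illustration of that dispatch (the pattern list passed
# in is a placeholder):

import re

def first_matching_finditer(raw, patterns):
    # return a match iterator for the first pattern that hits, else an empty list
    for pattern in patterns:
        if re.search(pattern, raw):
            return re.compile(pattern).finditer(raw)
    return []
# -----------------------------------------------------------------------------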
r_images4 = r'(?im)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' m = [] - #different mediawiki versions + # different mediawiki versions if re.search(r_images1, raw): m = re.compile(r_images1).finditer(raw) elif re.search(r_images2, raw): @@ -574,16 +673,22 @@ def getImageFilenamesURL(config={}, session=None): m = re.compile(r_images3).finditer(raw) elif re.search(r_images4, raw): m = re.compile(r_images4).finditer(raw) - + for i in m: url = i.group('url') - if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL? - if url[0] == '/': #slash is added later + # is it a relative URL? + if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): + if url[0] == '/': # slash is added later url = url[1:] - domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain - url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url + # remove from :// (http or https) until the first / after + # domain + domainalone = config['index'].split('://')[1].split('/')[0] + # concat http(s) + domain + relative url + url = u'%s://%s/%s' % (config['index'].split('://') + [0], domainalone, url) url = undoHTMLEntities(text=url) - #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars + # url = urllib.unquote(url) #do not use unquote with url, it break + # some urls with odd chars url = re.sub(' ', '_', url) filename = re.sub('_', ' ', i.group('filename')) filename = undoHTMLEntities(text=filename) @@ -592,54 +697,63 @@ def getImageFilenamesURL(config={}, session=None): uploader = undoHTMLEntities(text=uploader) uploader = urllib.unquote(uploader) images.append([filename, url, uploader]) - #print filename, url - + # print filename, url + if re.search(r_next, raw): offset = re.findall(r_next, raw)[0] - retries += 5 # add more retries if we got a page with offset + retries += 5 # add more retries if we got a page with offset else: offset = '' - + if (len(images) == 1): print ' Found 1 image' else: print ' Found %d images' % (len(images)) - + images.sort() return images + def getImageFilenamesURLAPI(config={}, session=None): """ Retrieve file list: filename, url, uploader """ - + print 'Retrieving image filenames' aifrom = '!' 
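# --- Editor's note: sketch only, not part of the original patch -------------
# The loop in getImageFilenamesURLAPI() below walks list=allimages using the
# older 'query-continue' continuation format: the 'aicontinue' (or 'aifrom')
# value from each reply is fed back until the server stops returning one.  A
# minimal standalone version is sketched here; the api.php URL is a
# placeholder, and the delay() throttle and HTTP error handling of the real
# code are omitted.

import json
import requests

def list_all_image_urls(api):
    session = requests.Session()
    urls = []
    aifrom = '!'
    while aifrom:
        params = {'action': 'query', 'list': 'allimages',
                  'aiprop': 'url|user', 'aifrom': aifrom,
                  'ailimit': 500, 'format': 'json'}
        data = json.loads(session.post(url=api, data=params).text)
        urls += [img['url'] for img in data['query']['allimages']]
        cont = data.get('query-continue', {}).get('allimages', {})
        aifrom = cont.get('aicontinue') or cont.get('aifrom') or ''
    return urls
# -----------------------------------------------------------------------------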
images = [] while aifrom: - sys.stderr.write('.') #progress - params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} - #FIXME Handle HTTP Errors HERE + sys.stderr.write('.') # progress + params = {'action': 'query', 'list': 'allimages', 'aiprop': + 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} + # FIXME Handle HTTP Errors HERE r = session.post(url=config['api'], data=params) handleStatusCode(r) jsonimages = json.loads(r.text) delay(config=config, session=session) aifrom = '' - if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'): - if jsonimages['query-continue']['allimages'].has_key('aicontinue'): - aifrom = jsonimages['query-continue']['allimages']['aicontinue'] - elif jsonimages['query-continue']['allimages'].has_key('aifrom'): + if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']: + if 'aicontinue' in jsonimages['query-continue']['allimages']: + aifrom = jsonimages['query-continue']['allimages']['aicontinue'] + elif 'aifrom' in jsonimages['query-continue']['allimages']: aifrom = jsonimages['query-continue']['allimages']['aifrom'] - #print aifrom - + # print aifrom + for image in jsonimages['query']['allimages']: url = image['url'] - if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL? - if url[0] == '/': #slash is added later + # is it a relative URL? + if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): + if url[0] == '/': # slash is added later url = url[1:] - domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain - url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url + # remove from :// (http or https) until the first / after + # domain + domainalone = config['index'].split('://')[1].split('/')[0] + # concat http(s) + domain + relative url + url = u'%s://%s/%s' % (config['index'].split('://') + [0], domainalone, url) url = re.sub(' ', '_', url) - # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136 - filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii','ignore')), 'utf-8') + # encoding to ascii is needed to work around this horrible bug: + # http://bugs.python.org/issue8136 + filename = unicode(urllib.unquote( + (re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8') uploader = re.sub('_', ' ', image['user']) images.append([filename, url, uploader]) @@ -651,40 +765,45 @@ def getImageFilenamesURLAPI(config={}, session=None): images.sort() return images + def undoHTMLEntities(text=''): """ Undo some HTML codes """ - - text = re.sub('<', '<', text) # i guess only < > & " ' need conversion http://www.w3schools.com/html/html_entities.asp + + # i guess only < > & " ' need conversion + # http://www.w3schools.com/html/html_entities.asp + text = re.sub('<', '<', text) text = re.sub('>', '>', text) text = re.sub('&', '&', text) text = re.sub('"', '"', text) text = re.sub(''', '\'', text) - + return text + def generateImageDump(config={}, other={}, images=[], start='', session=None): """ Save files and descriptions using a file list """ - - #fix use subdirectories md5 + + # fix use subdirectories md5 print 'Retrieving images from "%s"' % (start and start or 'start') imagepath = '%s/images' % (config['path']) if not 
os.path.isdir(imagepath): print 'Creating "%s" directory' % (imagepath) os.makedirs(imagepath) - + c = 0 lock = True if not start: lock = False for filename, url, uploader in images: - if filename == start: #start downloading from start (included) + if filename == start: # start downloading from start (included) lock = False if lock: continue delay(config=config, session=session) - - #saving file - #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max) + + # saving file + # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash + # limit). Later .desc is added to filename, so better 100 as max) filename2 = urllib.unquote(filename) if len(filename2) > other['filenamelimit']: # split last . (extension) and then merge @@ -695,11 +814,13 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): r = requests.get(url=url) imagefile.write(r.content) imagefile.close() - #saving description if any - xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility + # saving description if any + xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % ( + filename), session=session) # use Image: for backwards compatibility f = open('%s/%s.desc' % (imagepath, filename2), 'w') - if not re.search(r'', xmlfiledesc): #Banner featuring SG1, SGA, SGU teams - #failure when retrieving desc? then save it as empty .desc + # Banner featuring SG1, SGA, SGU teams + if not re.search(r'', xmlfiledesc): + # failure when retrieving desc? then save it as empty .desc xmlfiledesc = '' f.write(xmlfiledesc.encode('utf-8')) f.close() @@ -707,12 +828,13 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): c += 1 if c % 10 == 0: print ' Downloaded %d images' % (c) - + print 'Downloaded %d images' % (c) - + + def saveLogs(config={}, session=None): """ Save Special:Log """ - #get all logs from Special:Log + # get all logs from Special:Log """parse