wikiteam/rewrite/dumpgenerator.py

# -*- coding: utf-8 -*-

# Copyright (C) 2013 Hydriz Scholz
# Copyright (C) 2014 WikiTeam
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit
# <http://www.gnu.org/copyleft/gpl.html>

#######################################################################
# dumpgenerator.py is a script to generate backups of MediaWiki wikis #
# To learn more, read the documentation:							  #
#		http://code.google.com/p/wikiteam/wiki/NewTutorial 			  #
#######################################################################

# For developers:
# * All functions and classes are displayed in alphabetical order for easier accessibility.
# * Script exit codes reference:
#  * 0 - Script ran well without problems
#  * 1 - Script failed due to user's incorrect use
#  * 2 - Script failed due to destination server issue
# * For testing purposes, add the --debug parameter and edit DumpGenerator.debug() accordingly.

######
# TODO LIST
# 0. Download index.html and Special:Version.html
# 1. Index.php support.
# 2. Special:Log pages support
# 3. GUI (Question and Answer if no parameters are given)
# 4. Resuming of dump
# 5. Place the images in various folders so as to avoid hitting the limit of number of files in a directory
# 6. Speed up the script. A run with --xml --images on test.wikidata.org came up with 9 min 23 sec on 2.0 and 3 min 58 sec on 1.0

# WHAT IS WORKING
# 1. XML dumping
# 2. Complete dumping using API (except for --logs)
# 3. Automatic updating
# 4. Dumping of XML based on a list of titles
# 5. Integrity check for XML dump

import datetime
import getopt
import hashlib
import json
import os
import re
import shutil
import sys
import time
import urllib
import urllib2
import xml.etree.ElementTree as ElementTree

class DumpGenerator:
	"""
	The main class that powers and operates everything else
	"""
	def __init__(self):
		"""
		Main constructor class for DumpGenerator, registers important variables too.
		"""
		self.Version = "2.0"
		self.revision = "1"
		# Provide a cool user-agent to hide the fact that this is a script
		self.UserAgent = "Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0"
		self.useAPI = False
		self.useIndex = False
		self.prefix = ""
		self.domain = ""
		self.tasklist = []
		self.configfile = "config.json"
		self.configoptions = {
			"date": "",
			"useAPI": False,
			"useIndex": False,
			"urltoapi": "",
			"urltoindex": "",
			"images": False,
			"logs": False,
			"xml": False,
			"curonly": False,
			"exnamespaces": "",
			"titlesonly": False
		}

		# Basic metadata
		self.date = datetime.datetime.now().strftime('%Y%m%d')

		# Important URLs
		self.urltoapi = ""
		self.urltoindex = ""

		# Type of dump to generate
		self.images = False
		self.logs = False
		self.xml = False

		# Resuming of previous dump
		self.resume = False
		self.path = ""

		# Additional information for XML
		self.curonly = False
		self.exnamespaces = ""
		self.titlesonly = False
		self.titles = ""

		# Others
		self.cookies = ""
		self.delay = 0
		self.debugmode = False
		self.nolog = False
		self.autonomous = False

		# Short options: string (no commas), long options: array
		# More information about these options are at self.help()
		self.shortoptions = "hv"
		self.longoptions = [ "help", "api=", "index=", "curonly", "images", "logs", "xml", "auto", "delay=", "cookies=", "exnamespaces=", "resume", "path=", "debug", "nolog", "titlesonly", "titles=" ]

	def bye(self):
		"""
		Bid farewell to the user at the very end of the script when everything
		has been successful.

		Returns: Goodbye message.
		"""
		message = """---> Congratulations! Your dump is complete <---
If you have suggestions, file a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list
If this is a public wiki, do consider publishing this dump so others can benefit from it. Follow the steps as explained in http://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump or contact us at http://code.google.com/p/wikiteam.
Thank you for using DumpGenerator %s by WikiTeam, good bye!""" % ( self.Version )
		return message

	def checkAPI(self):
		"""
		Checks the validity of the api.php by requesting the general siteinfo.

		Returns: True when the server name reported by the API is part of
		self.urltoapi, False in every other case. A warning is emitted when
		the wiki is private and no cookies were given, or when the reply
		cannot be understood.
		"""
		query = {
			"meta": "siteinfo",
			"siprop": "general" }
		try:
			sitestats = json.loads( RequestAPI.query( query ) )
		except ( TypeError, ValueError ):
			# The reply is not JSON at all, so this cannot be a working api.php
			Output.warn( "This api.php seems weird or is not valid." )
			return False

		try:
			return sitestats[ "query" ][ "general" ][ "server" ] in self.urltoapi
		except ( KeyError, TypeError ):
			# No siteinfo in the reply, look for an API error instead
			pass

		try:
			denied = ( sitestats[ "error" ][ "code" ] == "readapidenied" )
		except ( KeyError, TypeError ):
			Output.warn( "This api.php seems weird or is not valid." )
			return False
		if ( denied and self.cookies == "" ):
			Output.warn( "The wiki is private and we do not have proper authentication information!" )
		return False

	def checkIndex(self):
		"""
		Verify that index.php answers like a MediaWiki installation.

		Requests Special:Version through RequestIndex. The script exits with
		status 1 when the reply links to Special:Badtitle and no cookies were
		supplied. Returns True when a MediaWiki marker is found in the page;
		otherwise nothing (None) is returned.
		"""
		# TODO: Screen scraping is involved here, need backward compatibility for older versions of MediaWiki.
		page = RequestIndex.query( { "title": "Special:Version" } )

		# Special:Version should never lead to Special:Badtitle unless we are not logged in
		badtitle = re.search( r'(Special:Badtitle</a>)', page )
		if ( badtitle and self.cookies == "" ):
			Output.error( "The wiki is private and we do not have proper authentication information!" )
			sys.exit(1)

		# Markers of a Special:Version page that do not depend on the wiki's language
		markers = r'(<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)'
		if ( re.search( markers, page ) ):
			return True

	def debug(self):
		"""
		A temporary debug mode for testing purposes.
		REMOVE WHEN COMPLETE!
		"""
		print "DEBUG MODE ON"
		print "Date: %s" % (self.date)
		print "URL to api.php: %s" % (self.urltoapi)
		print "URL to index.php: %s" % (self.urltoindex)
		print "Current revision only: %s" % (self.curonly)
		print "Image dump: %s" % (self.images)
		print "Log dump: %s" % (self.logs)
		print "XML dump: %s" % (self.xml)
		print "Resume: %s" % (self.resume)
		print "Path for resuming: %s" % (self.path)
		print "Delay: %s" % (self.delay)
		print "Cookies file: %s" % (self.cookies)
		print "Excluded namespaces: %s" % (self.exnamespaces)
		print "Debug mode on: %s" % (self.debugmode)
		self.tasklist = sorted( self.tasklist )
		for task in self.tasklist:
			if ( task == "axml" ):
				DumpXML.run()
			elif ( task == "bimages" ):
				DumpImages.run()
			elif ( task == "clogs" ):
				DumpLogs.run()
		sys.exit(0)

	def downloadHtmlPages(self):
		"""
		Downloads the HTML pages such as the main page and Special:Version.

		Each page is requested through RequestIndex, passed through
		RequestIndex.removeIP() and written to a file in the current working
		directory. A file left over from an earlier run is removed first so
		that the new contents are not appended to the old ones.
		"""
		pages = (
			# ( file to write, query for index.php, message for the user )
			( "index.html", {}, "Downloading index.php (Main Page) as index.html." ),
			( "Special:Version.html", { "title": "Special:Version" }, "Downloading Special:Version with extensions and other related info." ),
		)
		for filename, query, notice in pages:
			Output.message( notice )
			contents = RequestIndex.query( query )
			contents = RequestIndex.removeIP( contents )
			# Check for the very file that is about to be written
			if ( os.path.exists( filename ) ):
				os.remove( filename )
			# NOTE(review): if removeIP() returns a plain string this loop
			# appends one character at a time - confirm what it returns.
			for line in contents:
				Output.appendToFile( filename, line )

	def fixHTMLEntities(self, text):
		"""
		Convert some HTML entities to their regular characters.

		Inputs:
		 - text: The string that may contain &lt; &gt; &quot; &#039; or &amp;.

		Returns: the string with those entities decoded exactly once.
		"""
		replacements = (
			( '&lt;', '<' ),
			( '&gt;', '>' ),
			( '&quot;', '"' ),
			( '&#039;', '\'' ),
			# "&amp;" has to be decoded last, otherwise an escaped entity such
			# as "&amp;quot;" would be decoded twice and end up as a quote.
			( '&amp;', '&' ),
		)
		for entity, character in replacements:
			text = text.replace( entity, character )
		return text

	def help(self):
		"""
		Provides vital help information to the user. This function
		directly uses the "print" function because it is harmless and
		what needs to be logged has already been done so.

		Returns: Help message text
		"""
		message = """DumpGenerator %s, a script to generate backups of MediaWiki wikis.
For more information, please see: http://code.google.com/p/wikiteam/wiki/NewTutorial

Startup:
  -h, --help         Displays this help information and exits.
  -v, --version	     Displays the version of this script, with additional credits.

Wiki information:
  --api=URL          The URL to the wiki's api.php, not to be used with --index.
  --index=URL        The URL to the wiki's index.php, not to be used with --api.

Options:
  --xml	             Creates an XML dump.
  --images           Creates an image dump.
  --logs             Creates a dump of all log pages (not yet supported).

XML dump (only if --xml is used):
  --curonly          Download only the current revision.
  --exnamespaces     The unique system number(s) for namespaces to exclude, separated by commas.
  --titlesonly       Download only the page titles without the actual content.
  --titles           Path to a file containing list of titles, requires "--END--" to be on the last line.

Other:
  --auto             Enable auto pilot mode (select options that ensures that the script creates a new dump).
  --resume           Resume an incomplete dump (requires --path to be given).
  --path=PATH        Path to the incomplete dump.
  --delay=SECONDS    Adds a delay (in seconds) between requests.
  --cookies=PATH     Path to a Mozilla cookies.txt file for authentication cookies.
  --nolog            Disable logging to dumpgenerator.log (does not affect output in terminal).

Report any issues to our issue tracker: https://code.google.com/p/wikiteam.""" % (self.Version)
		return message

	def loadConfig(self):
		"""
		Load a config file from a partially-made dump.
		"""
		config = json.loads( self.configfile )
		self.date = config[ "date" ]
		self.useAPI = config[ "useAPI" ]
		self.useIndex = config[ "useIndex" ]
		self.urltoapi = config[ "urltoapi" ]
		self.urltoindex = config[ "urltoindex" ]
		self.images = config[ "images" ]
		self.logs = config[ "logs" ]
		self.xml = config[ "xml" ]
		self.curonly = config[ "curonly" ]
		self.exnamespaces = config[ "exnamespaces" ]
		self.titlesonly = config[ "titlesonly" ]

		if ( self.images == True ):
			self.tasklist.append( "bimage" )
		if ( self.logs == True ):
			self.tasklist.append( "clogs" )
		if ( self.xml == True ):
			self.tasklist.append( "axml" )

		if ( self.useAPI == True ):
			domain = self.urltoapi
		elif ( self.useIndex == True ):
			domain = self.urltoindex

	def makePrefix(self, domain):
		"""
		Turn the URL of a wiki into a prefix for naming the dump.

		Inputs:
		 - domain: The domain to change, may contain api.php or index.php as suffix.

		Returns:
		 - lowercased string without URL protocol, "www." and script suffix,
		   in which slashes and every character other than letters, digits,
		   dots and hyphens are replaced by underscores.
		"""
		lowered = domain.lower()
		# Drop the protocol, "www." and the script name
		stripped = re.sub( r'(https?://|www\.|/index\.php|/api\.php)', '', lowered )
		# Directory separators become underscores...
		flattened = stripped.replace( '/', '_' )
		# ...and so does anything else that is not a letter, digit, dot or hyphen
		return re.sub( r'[^-.A-Za-z0-9]', '_', flattened )

	def makeNiceURL(self, domain):
		"""
		Turn the URL of a wiki into a more human-readable form (used for uploading).

		Inputs:
		 - domain: The domain to change, may contain api.php or index.php as suffix.

		Returns:
		 - lowercased string with the /index.php or /api.php suffix removed.
		"""
		return re.sub( r'(/index\.php|/api\.php)', '', domain.lower() )

	def processargs(self):
		"""
		Processing arguments and options provided by the user.
		"""
		try:
			options, answers = getopt.getopt( sys.argv[1:], self.shortoptions, self.longoptions )
		except getopt.GetoptError:
			Output.error( "An unknown option has been specified, please check your arguments before re-running!" )
			sys.exit(1)

		# First accept all arguments and store them in a variable
		for option, answer in options:
			# Startup
			if ( option in ( "-h", "--help" ) ):
				# Display the help guide and exit
				print self.help()
				os.remove( Output.logfile )
				sys.exit(0)
			elif ( option in ( "-v", "--version" ) ):
				# Display the version of this script
				print self.version()
				os.remove( Output.logfile )
				sys.exit(0)

			# Wiki information
			elif ( option in "--api" ):
				self.urltoapi = answer
				self.configoptions[ "urltoapi" ] = self.urltoapi
			elif ( option in "--index" ):
				self.urltoindex = answer
				self.configoptions[ "urltoindex" ] = self.urltoindex

			# Dump options
			elif ( option == "--images" ):
				self.images = True
				self.configoptions[ "images" ] = True
				self.tasklist.append( "bimages" )
			elif ( option == "--logs" ):
				self.logs = True
				self.configoptions[ "logs" ] = True
				self.tasklist.append( "clogs" )
			elif ( option == "--xml" ):
				self.xml = True
				self.configoptions[ "xml" ] = True
				self.tasklist.append( "axml" )

			# XML dump options
			elif ( option == "--curonly" ):
				self.curonly = True
				self.configoptions[ "curonly" ] = True
			elif ( option in "--exnamespaces" ):
				self.exnamespaces = answer
				self.configoptions[ "exnamespaces" ] = self.exnamespaces
			elif ( option == "--titlesonly" ):
				self.titlesonly = True
				self.configoptions[ "titlesonly" ] = True
			elif ( option in "--titles" ):
				self.titles = os.path.abspath( answer )

			# Other options
			elif ( option == "--auto" ):
				self.autonomous = True
			elif ( option in "--cookies" ):
				self.cookies = answer
			elif ( option in "--delay" ):
				self.delay = answer
			elif ( option == "--nolog" ):
				self.nolog = True
			elif ( option in "--path" ):
				self.path = answer
			elif ( option == "--resume" ):
				self.resume = True

			# Private options (i.e. usable but not documented in --help)
			elif ( option == "--debug" ):
				self.debugmode = True
			else:
				Output.error( "An unknown option has been specified, please check your arguments before re-running!" )
				sys.exit(1)

		# Now to verify that the user is not messing around
		if ( self.urltoapi == "" and self.urltoindex == "" ):
			# User did not specify either --api= or --index=
			if ( self.resume == True and self.path != "" ):
				# ...but specified --resume and --path= accordingly
				self.resumeDump()
			elif ( self.resume == True and self.path == "" ):
				# ...and specified --resume without --path=
				Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" )
				sys.exit(1)
			else:
				Output.error( "You need to tell me the URL to either the api.php or to index.php!" )
				sys.exit(1)
		elif ( self.resume == True ) and ( self.path == "" ):
			# User specified --resume, but no --path= was given
			Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" )
			sys.exit(1)
		elif ( self.urltoapi != "" and self.urltoindex != "" ):
			# User specified both --api= and --index=
			self.useAPI = True
		elif ( self.xml == False and ( self.curonly == True or self.exnamespaces != "" ) ):
			# User specified --curonly and --exnamespaces without --xml
			Output.error( "You did not specify to make an XML dump using --xml, so why write --curonly or --exnamespaces? Remove them before re-running!" )
			sys.exit(1)

		if ( self.urltoapi != "" ):
			self.useAPI = True
		elif ( self.urltoindex != "" ):
			self.useIndex = True

		if ( self.useAPI == True ):
			Output.message( "Checking api.php..." )
			if not ( self.urltoapi.startswith( "http://" ) ) and not ( self.urltoapi.startswith( "https://" ) ):
				Output.error( "The URL to api.php must start with either http:// or https://!" )
				sys.exit(1)
			elif ( self.checkAPI() ):
				Output.message( "api.php is okay" )
			else:
				Output.error( "There is an error with api.php, please provide a correct path to it." )
				sys.exit(1)
		elif ( self.useIndex == True ):
			Output.message( "Checking index.php..." )
			if not ( self.urltoindex.startswith( "http://" ) ) and not ( self.urltoindex.startswith( "https://" ) ):
				Output.error( "The URL to index.php must start with either http:// or https://!" )
				sys.exit(1)
			elif ( self.checkIndex() ):
				Output.message( "index.php is okay" )
			else:
				Output.error( "There is an error with index.php, please provide a correct path to it." )
				sys.exit(1)

	def resumeDump(self):
		"""
		Resume an incomplete dump defined in self.path.

		Changes into that directory, loads the stored settings with
		loadConfig() and runs every queued task. The script exits with
		status 1 when the settings name neither api.php nor index.php.
		"""
		# TODO: Add support for resuming dumps.
		os.chdir( self.path )
		self.loadConfig()
		# The URL the dump was made from, needed for the prefix and the domain
		if ( self.useAPI == True ):
			domain = self.urltoapi
		elif ( self.useIndex == True ):
			domain = self.urltoindex
		else:
			Output.error( "The %s of the incomplete dump does not say whether api.php or index.php was used!" % ( self.configfile ) )
			sys.exit(1)
		self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date )
		self.domain = self.makeNiceURL( domain )
		# Guess the URL to index.php
		if ( self.useAPI == True ):
			self.urltoindex = "%s/index.php" % ( self.domain )
		# The "a", "b" and "c" prefixes of the task names force the order
		self.tasklist = sorted( self.tasklist )
		for task in self.tasklist:
			if ( task == "axml" ):
				DumpXML.run()
			elif ( task == "bimages" ):
				DumpImages.run()
			elif ( task == "clogs" ):
				DumpLogs.run()

	def run(self):
		"""
		Run the whole script itself and excute important functions.
		"""
		print self.welcome()
		Updater.checkRevision()
		# Check if previously there was a log file in the working directory and remove it if exists
		# This is followed by the equivalent of "touch" in Unix to create an empty file
		if ( os.path.exists( Output.logfile ) ):
			os.remove( Output.logfile )
			open( Output.logfile, "a" ).close()
		else:
			open( Output.logfile, "a" ).close()
		self.processargs()
		if ( DumpGenerator.nolog or DumpGenerator.debugmode):
			# Remove the dumpgenerator.log file
			os.remove( Output.logfile )
		if ( self.useAPI == True ):
			domain = self.urltoapi
		elif ( self.useIndex == True ):
			domain = self.urltoindex
		directories = os.walk( "." ).next()[1]
		for directory in directories:
			# Check if there is a dump that already exists in the current working directory
			if ( directory.startswith( self.makePrefix( domain ) ) and directory.endswith( "-wikidump" ) ):
				print "" # Create a blank line
				Output.warn( "There seems to be a similar dump at %s which might be incomplete." % ( directory ) )
				if ( self.autonomous == True ):
					Output.message( "Since auto pilot mode is enabled, that dump will not be resumed." )
					self.resume = False
				else:
					Output.warn( "Do you wish to resume using configuration from that dump? [yes, y], [no, n]" )
					reply = ""
					while reply.lower() not in [ "yes", "y", "no", "n" ]:
						reply = raw_input( "Answer: " )
					if ( reply.lower() in [ "yes", "y" ] ):
						if not ( os.path.isfile( "%s/%s" % ( directory, self.configfile ) ) ):
							Output.error( "I cannot find a %s in the directory! Please delete that directory before re-running!" % ( self.configfile ) )
							sys.exit(1)
						else:
							Output.warn( "Resuming dump and ignoring configuration given in this session..." )
							self.resume = True
							self.path = directory
							break
					elif ( reply.lower() in [ "no", "n" ] ):
						Output.message( "Not resuming..." )
						self.resume = False
			else:
				continue
		if ( self.resume == True ):
			self.resumeDump()
		else:
			self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date )
			self.domain = self.makeNiceURL( domain )
			workingdir = "%s-wikidump" % ( self.prefix )
			if ( os.path.exists( workingdir ) ):
				if ( self.autonomous == True ):
					Output.message( "Since auto pilot mode is enabled, the directory with the same name will be deleted." )
					reply = "yes"
				else:
					Output.warn( "\nThere seems to be a directory with the same name, delete the old one? [yes, y], [no, n]" )
					reply = ""
				while reply.lower() not in [ "yes", "y", "no", "n" ]:
					reply = raw_input( "Answer: " )
				if ( reply.lower() in [ "yes", "y" ] ):
					try:
						shutil.rmtree( workingdir )
					except:
						Output.error( "There was a problem deleting the directory, please manually delete it before re-running!" )
						sys.exit(1)
					print "" # Create a blank line
				elif ( reply.lower() in [ "no", "n" ] ):
					Output.error( "Existing directory exists, either delete that directory or rename it before re-running!" )
					sys.exit(1)
			else:
				pass
			Output.message( "Generating a new dump into a new directory..." )
			os.mkdir( workingdir )
			os.rename( Output.logfile, "%s/%s" % ( workingdir, Output.logfile ) )
			os.chdir( workingdir )
			self.saveConfig()
			# Guess the URL to index.php
			if ( self.useAPI == True ):
				self.urltoindex = "%s/index.php" % ( self.domain )
			if ( self.debugmode == True ):
				self.debug()
			else:
				# Run every single task that we are assigned to do in order: xml, images, logs
				# The "a", "b" and "c" prefix is just to force the order.
				self.tasklist = sorted( self.tasklist )
				if ( self.tasklist == [] ):
					Output.error( "You did not tell me what dump to create!" )
				else:
					for task in self.tasklist:
						if ( task == "axml" ):
							DumpXML.run()
						elif ( task == "bimages" ):
							DumpImages.run()
						elif ( task == "clogs" ):
							DumpLogs.run()
					self.downloadHtmlPages()
					print self.bye()

	def saveConfig(self):
		"""
		Save the configuration settings provided.
		"""
		self.configoptions[ "date" ] = self.date
		output = open( self.configfile, "w" )
		json.dump( self.configoptions, output, indent=4 )

	def version(self):
		"""
		Displays the version information and credits of the script.

		Returns: Version information and credits
		"""
		message = """DumpGenerator %s by WikiTeam

Copyright (C) 2013 Hydriz Scholz
Copyright (C) 2014 WikiTeam

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit
<http://www.gnu.org/copyleft/gpl.html>
""" % (self.Version)
		return message

	def welcome(self):
		"""
		Welcomes the user at the very beginning of the script running process.

		Returns: Welcome message.
		"""
		message = """########## Welcome to DumpGenerator %s by WikiTeam ##########\n""" % (self.Version)
		return message

class DumpImages:
	"""
	The class for generating an image dump.
	"""
	def __init__(self):
		"""
		The constructor function.
		"""
		self.files = []

	def dumpImages(self):
		"""
		Download all the images on the wiki with their corresponding XML.

		The list of files is fetched first (via the API or via index.php).
		Every file is then saved into the "images" directory together with an
		XML file holding its description page.
		"""
		if ( DumpGenerator.useAPI == True ):
			self.getFileListAPI()
		else:
			self.getFileListIndex()
		if not self.files:
			# Nothing to download
			return
		Output.message( "Downloading files and their descriptions into \"images\" directory..." )
		filecount = 0
		for media in self.files:
			time.sleep( DumpGenerator.delay ) # Delay between requests
			urllib.urlretrieve( media[ "url" ], "images/%s" % (media[ "name" ] ) )
			title = DumpGenerator.fixHTMLEntities( media[ "title" ].encode( "utf-8" ) )
			# getXMLPage() leaves out the closing tag, so it is added again
			# once the file is in place
			contentsfile = DumpXML.getXMLPage( title, siteinfo=True )
			destfile = "images/%s.xml" % ( media[ "name" ] )
			shutil.move( contentsfile, destfile )
			Output.appendToFile( destfile, "</mediawiki>\n" )
			filecount += 1
			if ( filecount % 10 == 0 ):
				# Give the user a regular status report so that it does not look stuck
				Output.message( "    Downloaded %d files." % ( filecount ) )
		if ( filecount == 1 ):
			# No placeholder in this message, so there is nothing to format
			Output.message( "Downloaded 1 file." )
		else:
			Output.message( "Downloaded %d files." % ( filecount ) )

	def getFileListAPI(self):
		"""
		Download the list of files on the wiki via the API.

		The list is requested in batches of up to 500 files. The title and URL
		of every file are appended to "<prefix>-images.txt", which is closed
		with an "--END--" line, and the collected entries are stored in
		self.files.
		"""
		files = []
		dumpfile = "%s-images.txt" % ( DumpGenerator.prefix )
		filecount = 0
		Output.message( "Getting list of files on the wiki..." )
		aifrom = "!" # Very first page of a wiki
		while aifrom:
			sys.stderr.write('.') # Tell the user that downloading is in progress
			query = {
				"list": "allimages",
				"aifrom": aifrom,
				"ailimit": 500 } # The default limit for anonymous users of the API is 500 pages per request
			time.sleep( DumpGenerator.delay ) # Delay between requests
			filesmeta = json.loads( RequestAPI.query( query ) )
			# Store what the server tells us to continue from
			try:
				serveraifrom = filesmeta[ "query-continue" ][ "allimages" ][ "aicontinue" ]
			except ( KeyError, TypeError ):
				# Reached the end of having to keep continuing, exit the while condition
				aifrom = ""
			else:
				aifrom = DumpGenerator.fixHTMLEntities( serveraifrom )
			# TODO: On a wiki with a lot of files, this can cause huge memory problems
			batch = filesmeta[ "query" ][ "allimages" ]
			files.extend( batch )
			for media in batch:
				outputline = "%s\t%s\n" % ( media[ "title" ], media[ "url" ] )
				Output.appendToFile( dumpfile, outputline )
			# Count only this batch; "files" already holds all earlier batches
			filecount += len( batch )
		Output.appendToFile( dumpfile, "--END--" )
		if ( filecount == 1 ):
			Output.message( "    Got 1 file" )
		else:
			Output.message( "    Got %d files" % ( filecount ) )

		if ( filecount == 0 ):
			Output.warn( "There are no files on the wiki to download!" )
		else:
			Output.message( "File names and URLs saved at %s." % ( dumpfile ) )
		self.files = files

	def getFileListIndex(self):
		"""
		Download the list of files on the wiki via index.php.

		Not implemented yet: the method has no body besides this docstring, so
		calling it does nothing and returns None.
		"""
		# TODO: Add code here

	def run(self):
		"""
		Execute the process of producing an image dump.

		Creates the "images" directory in the current working directory when
		it does not exist yet, then hands over to dumpImages().
		"""
		if not ( os.path.isdir( "images" ) ):
			os.mkdir( "images" )
		self.dumpImages()

class DumpLogs:
	"""
	Generates a dump of the log pages (the pages in Special:Log).
	"""
	def __init__(self):
		"""
		Nothing to set up, the class keeps no state.
		"""

	def run(self):
		"""
		Entry point for producing a log pages dump; for now it only warns the
		user that this kind of dump is not available.
		"""
		# TODO: Support downloading of log pages
		notice = "Sorry, downloading of log pages are not yet supported!"
		Output.warn( notice )

class DumpXML:
	"""
	The class for generating an XML dump.
	"""
	def __init__(self):
		"""
		The constructor function.
		"""
		self.lennamespaces = 0
		self.namespaces = {}
		self.pagetitles = []
		self.titlesdumpfile = ""
		self.dumpretrycount = 0

	def dumpPageTitlesAPI(self):
		"""
		Fetch the namespaces and the page titles through the API, then close
		the file with the list of titles by appending the "--END--" marker.
		"""
		self.getNamespacesAPI()
		self.getPageTitlesAPI()
		titlesfile = self.titlesdumpfile
		Output.message( "Saving list of page titles..." )
		# The marker tells readers of the file that the list is complete
		Output.appendToFile( titlesfile, "--END--" )
		Output.message( "List of page titles saved at %s." % ( titlesfile ) )

	def dumpXML(self):
		"""
		Get the whole wiki in an XML file.

		The titles are read one by one from self.titlesdumpfile until the
		"--END--" marker. The XML of every page is appended to
		"<prefix>-curonly.xml" or "<prefix>-history.xml"; only the first page
		carries the siteinfo header. The closing </mediawiki> tag is written
		when the marker is reached, and the finished file is handed to
		integrityCheck().
		"""
		Output.message( "Downloading the XML of every page..." )
		if ( DumpGenerator.curonly == True ):
			dumpfile = "%s-curonly.xml" % ( DumpGenerator.prefix )
		else:
			dumpfile = "%s-history.xml" % ( DumpGenerator.prefix )
		open( dumpfile, "a" ).close() # "touch" the file
		pagecount = 0
		# To reduce memory usage, the titles are read line by line
		with open( self.titlesdumpfile, "r" ) as titlesfile:
			for line in titlesfile:
				title = line.rstrip( "\r\n" )
				if ( title == "--END--" ):
					# End of the list, even when it is the very first line
					Output.appendToFile( dumpfile, "</mediawiki>\n" )
					break
				if ( title == "" ):
					# A blank line is not a page title
					continue
				pagecount += 1
				# Add the initial siteinfo and header tags for the first page only
				contentsfile = self.getXMLPage( title, siteinfo=( pagecount == 1 ) )
				with open( contentsfile, "r" ) as pagefile:
					contents = pagefile.readlines()
				os.remove( contentsfile )

				numberofedits = 0
				for content in contents:
					# Count the number of occurrences of "<timestamp>" to determine number of revisions
					if ( "<timestamp>" in content ):
						numberofedits += 1
					Output.appendToFile( dumpfile, content )
				if ( numberofedits == 1 ):
					Output.message( "    %s, 1 edit" % ( title ) )
				else:
					Output.message( "    %s, %s edits" % ( title, numberofedits ) )
				if ( pagecount % 10 == 0 ):
					Output.message( "Downloaded %d pages" % ( pagecount ) )
		Output.message( "XML dump saved at %s." % ( dumpfile ) )
		self.integrityCheck( dumpfile )

	def getNamespacesAPI(self):
		"""
		Download the list of namespaces with their names and IDs
		via the API.

		The result is stored in self.namespaces (namespace number -> name) and
		the number of namespaces in self.lennamespaces. The system namespaces
		"-2" ("Media") and "-1" ("Special") are left out.
		"""
		query = {
			"meta": "siteinfo",
			"siprop": "namespaces" }
		namespacedetails = json.loads( RequestAPI.query( query ) )
		allnamespaces = namespacedetails[ "query" ][ "namespaces" ]
		namespaces = {}
		for namespacenum in allnamespaces:
			# Skipping instead of removing also works when the wiki does not
			# report one of the system namespaces
			if ( namespacenum in ( "-2", "-1" ) ):
				continue
			namespaces[ namespacenum ] = allnamespaces[ namespacenum ][ "*" ]
		self.lennamespaces = len( namespaces )
		Output.message( "%d namespaces found." % ( self.lennamespaces ) )
		self.namespaces = namespaces

	def getPageTitlesAPI(self):
		"""
		Grab a list of page titles in each namespace via the API.

		The titles are appended, one per line, to "<prefix>-titles.txt", whose
		name is stored in self.titlesdumpfile. Namespaces whose number is
		listed in DumpGenerator.exnamespaces (comma-separated) are skipped.

		There are leading spaces in the outputs so as to make things neater on the terminal.
		"""
		self.titlesdumpfile = "%s-titles.txt" % ( DumpGenerator.prefix )
		# Compare whole numbers: a substring test would also skip namespace
		# "1" when only "10" is to be excluded
		excluded = [ number.strip() for number in DumpGenerator.exnamespaces.split( "," ) ]
		totalpagecount = 0
		for namespace in self.namespaces:
			if namespace in excluded:
				Output.warn( "    Skipping namespace %s" % (namespace) )
				continue
			pagecount = 0
			Output.message( "    Getting titles in namespace %s" % (namespace) )
			apfrom = "!" # Very first page of a wiki
			while apfrom:
				sys.stderr.write( "." ) # Tell the user that downloading is in progress
				query = {
					"list": "allpages",
					"apnamespace": namespace,
					"apfrom": apfrom,
					"aplimit": 500 } # The default limit for anonymous users of the API is 500 pages per request
				time.sleep( DumpGenerator.delay ) # Delay between requests
				pagetitles = json.loads( RequestAPI.query( query ) )
				# Store what the server tells us to continue from; the value
				# is reported as either "apcontinue" or "apfrom"
				continuation = pagetitles.get( "query-continue", {} ).get( "allpages", {} )
				serverapfrom = continuation.get( "apcontinue", continuation.get( "apfrom", "" ) )
				if serverapfrom:
					apfrom = DumpGenerator.fixHTMLEntities( serverapfrom )
				else:
					# Reached the end of having to keep continuing, exit the while condition
					apfrom = ""
				pages = pagetitles[ "query" ][ "allpages" ]
				# Add to namespace page count
				pagecount += len( pages )
				for page in pages:
					title = "%s\n" % ( page[ "title" ] )
					Output.appendToFile( self.titlesdumpfile, title )
			if ( pagecount == 1 ):
				Output.message( "    Got 1 page title in namespace %s" % ( namespace ) )
			else:
				Output.message( "    Got %d page titles in namespace %s" % ( pagecount, namespace ) )
			# Add to total page count
			totalpagecount += pagecount
		if ( totalpagecount == 1 ):
			# No placeholder in this message, so there is nothing to format
			Output.message( "Got 1 page title in total." )
		else:
			Output.message( "Got %d page titles in total." % ( totalpagecount ) )

	def getXMLPage(self, page, siteinfo=False):
		"""
		Get the XML of one page.

		The page is requested from Special:Export through RequestIndex and
		written to a file named after the first 8 characters of the SHA-256
		hash of the title. The closing </mediawiki> tag is always left out, so
		the caller has to add it.

		Input:
		 - page: The title of the page to download.
		 - siteinfo: Whether to include the siteinfo header in the XML.

		Returns: the name of the file that holds the XML. The file always
		exists, even when no line of the reply was kept.
		"""
		parameters = {
			"title": "Special:Export",
			"pages": page,
			"action": "submit" }
		if ( DumpGenerator.curonly == True ):
			parameters[ "curonly" ] = 1
			parameters[ "limit" ] = 1
		else:
			# Make the wiki download the actual full history
			parameters["history"] = "1"
		# TODO: Can cause memory problems if the page has a huge history
		result = RequestIndex.query( parameters )
		pagehash = hashlib.sha256( page ).hexdigest()[:8]
		rawfile = "%s.xml.tmp" % ( pagehash )
		cleanfile = "%s.xml" % ( pagehash )
		# Files left behind by an interrupted run must not be appended to
		if ( os.path.exists( rawfile ) ):
			os.remove( rawfile )
		open( cleanfile, "w" ).close()
		Output.appendToFile( rawfile, result )
		result = "" # Free up memory
		# Warning: The following is NOT compatible with MediaWiki XML Schema Description version 0.3 and below!
		# See http://wikiteam.googlecode.com/svn/trunk/schema/README.md for more information about MediaWiki versions
		# this will affect and ways to overcome it.
		if ( siteinfo == False ):
			# The 11 comes from lines like <siteinfo>, "special" namespaces and the very first line
			# TODO: Hacky way of removing the siteinfo, check for backward compatibility!
			linestoskip = 11 + self.lennamespaces
		else:
			linestoskip = 0
		with open( rawfile, "r" ) as raw:
			lines = raw.read().splitlines()
		for linecount, line in enumerate( lines, 1 ):
			if ( linecount <= linestoskip ):
				continue
			if ( "</mediawiki>" in line ):
				continue
			Output.appendToFile( cleanfile, "%s\n" % ( line ) )
		os.remove( rawfile )
		return cleanfile

	def integrityCheck(self, dumpfile):
		"""
		Checks the integrity of the XML dump and ensures that it is not corrupted.

		The check counts the lines containing <title>, <page>, </page>,
		<revision> and </revision>. A line is counted only for the first of
		these tags found in it, in that order. The dump passes when there are
		as many titles as opened and as closed pages, and as many opened as
		closed revisions.

		When the dump seems corrupted, the user is asked whether to regenerate
		it (in autonomous mode the answer is taken to be "yes"). Regeneration
		deletes dumpfile and calls self.dumpXML(); it is given up once
		self.dumpretrycount reaches 3.

		Input:
		 - dumpfile: The path of the XML dump to check.
		"""
		Output.message( "Checking the integrity of the XML dump..." )
		# Check the number of instances of the following tags
		# By logic they should be the same number
		tags = ( "<title>", "<page>", "</page>", "<revision>", "</revision>" )
		counts = dict.fromkeys( tags, 0 )
		# Go through the dump line by line: it can be far too big to be read
		# into memory in one go, and the handle is closed once we are done.
		with open( dumpfile, "r" ) as dump:
			for line in dump:
				for tag in tags:
					if tag in line:
						counts[ tag ] += 1
						break

		pagesmatch = ( counts[ "<title>" ] == counts[ "<page>" ] == counts[ "</page>" ] )
		revisionsmatch = ( counts[ "<revision>" ] == counts[ "</revision>" ] )
		if pagesmatch and revisionsmatch:
			Output.message( "Excellent, the XML dump is not corrupted." )
			return

		Output.warn( "WARNING: XML dump seems to be corrupted." )
		if ( DumpGenerator.autonomous == True ):
			reply = "yes"
		else:
			reply = ""
		while reply.lower() not in [ "yes", "y", "no", "n" ]:
			reply = raw_input( 'Regenerate a new dump ([yes, y], [no, n])? ' )
		if reply.lower() in [ "no", "n" ]:
			Output.warn( "Not generating a new dump. Note: Your dump is corrupted and might not work with MediaWiki!" )
			return

		self.dumpretrycount += 1
		if ( self.dumpretrycount < 3 ):
			Output.warn( "Generating a new dump..." )
			os.remove( dumpfile )
			self.dumpXML()
		else:
			Output.warn( "We have tried dumping the wiki 3 times, but the dump is still corrupted. Not going to carry on since it is probably a problem on the wiki." )
			# Encourage the user to tell us about this faulty wiki. These go
			# through Output (not a bare print) so that they are logged too.
			Output.message( "Please tell us about this by reporting an issue here: https://code.google.com/p/wikiteam/issues/list. Thank you!" )
			Output.message( "Giving you a little time to see this message..." )
			time.sleep(3) # Give time for the user to see the message

	def run(self):
		"""
		Execute the process of producing an XML dump.

		The list of page titles is obtained first: through the API when
		DumpGenerator.useAPI is set, through index.php otherwise, or taken
		from the file named by DumpGenerator.titles when one was provided.
		Unless only the titles were requested (DumpGenerator.titlesonly), the
		XML dump is then generated with self.dumpXML().
		"""
		# Both modes follow the same steps, only the title lister differs
		if ( DumpGenerator.useAPI == True ):
			dumptitles = self.dumpPageTitlesAPI
		else:
			dumptitles = self.dumpPageTitlesIndex

		if ( DumpGenerator.titlesonly == True ):
			dumptitles()
			return

		if ( DumpGenerator.titles != "" ):
			# Tell the user whichever mode is used (this used to be shown in API mode only)
			Output.message( "Using the list of page titles provided at %s." % ( DumpGenerator.titles ) )
			self.titlesdumpfile = DumpGenerator.titles
		else:
			dumptitles()
		self.dumpXML()

class Output:
	"""
	The class to output anything to the user or to a place not within the script.

	For doing outputs to user:
		This is used instead of calling "print" directly because everything
		that is told to the user is meant to be logged as well, so that it
		is possible to check when and where things went wrong.

	For doing outputs to elsewhere:
		This is to reduce memory usage by storing large chunks of data into disk
		and reducing the risk of getting a MemoryError.
	"""
	def __init__(self):
		# Name of the file that log() appends to
		self.logfile = "dumpgenerator.log"

	# Output to disk
	def appendToFile(self, outputfile, contents):
		"""
		Append contents to a file, creating the file if it does not exist yet.

		Inputs:
		 - outputfile: The file to output to.
		 - contents: The text to append. Unicode text is encoded as UTF-8
		   (characters that cannot be encoded are dropped); byte strings are
		   written as they are.
		"""
		# Only real unicode text needs encoding. Calling encode() on a byte
		# string makes Python 2 decode it as ASCII first, which is where the
		# UnicodeDecodeError seen during testing came from; such strings were
		# (and still are) written unchanged.
		if not isinstance( contents, str ):
			contents = contents.encode( "utf-8", "ignore" )
		# Append mode creates the file when needed, and the with statement
		# closes it even if the write fails.
		with open( outputfile, "a" ) as thefile:
			thefile.write( contents )

	# Output to user
	def error(self, message):
		"""
		Show an error and a pointer to --help to the user, then log the error.

		Input:
		 - message: The error message.
		"""
		print( message )
		print( "Write --help for more information." )
		self.log( "An error occurred: %s" % (message) )

	def log(self, message):
		"""
		Append a timestamped line to the log file, unless logging is turned
		off (DumpGenerator.nolog) or the script runs in debug mode
		(DumpGenerator.debugmode).

		Input:
		 - message: The text to log.
		"""
		if ( DumpGenerator.nolog or DumpGenerator.debugmode ):
			# Skip logging
			return
		timestamp = datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S" )
		logline = "%s: %s\n" % (timestamp, message)
		self.appendToFile( self.logfile, logline )

	def message(self, message):
		"""
		Show an informational message to the user and log it.

		Input:
		 - message: The text to show.
		"""
		print( message )
		self.log( "Told the user: %s" % (message) )

	def warn(self, message):
		"""
		Show a warning to the user and log it.

		Input:
		 - message: The text to show.
		"""
		print( message )
		self.log( "Warned the user: %s" % (message) )

class RequestAPI:
	"""
	The RequestAPI class, to submit API request calls to the server.
	"""
	def __init__(self):
		"""
		The constructor function.
		"""

	def query(self, params, url=""):
		"""
		The function to send an API call to the server given in the "url"
		parameter using the parameters found in params. If url is empty,
		DumpGenerator.urltoapi is used instead.

		A failed request is retried once after DumpGenerator.delay + 10
		seconds. If the retry fails as well, an error is shown and the script
		exits with code 2.

		Note: This function will assume action=query, other functions provides
		the other query forms, but not this one.

		Input:
		 - params: Parameters to API call as an array (excluding action=query and format=json)
		 - url: The URL of api.php to query (optional).

		Returns
		 - Result of API call in JSON format.
		"""
		if ( url == "" ):
			url = DumpGenerator.urltoapi
		queryurl = "%s?action=query&format=json" % ( url )
		headers = { "User-Agent": DumpGenerator.UserAgent }
		# Convert the array to a proper URL
		paras = urllib.urlencode( params )
		# POST the parameters to the server
		request = urllib2.Request( queryurl, paras, headers )
		# Only ordinary errors are caught: a bare "except" would also swallow
		# KeyboardInterrupt, turning Ctrl-C into a retry and then an exit code 2.
		try:
			result = urllib2.urlopen( request )
		except Exception:
			try:
				# Add a little delay between requests if server is slow
				sleeptime = DumpGenerator.delay + 10
				Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) )
				time.sleep( sleeptime )
				result = urllib2.urlopen( request )
			except Exception:
				Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." )
				sys.exit(2)
		# Close the connection even when reading the response fails
		try:
			output = result.read()
		finally:
			result.close()
		return output

class RequestIndex:
	"""
	The RequestIndex class, to submit requests to index.php on the server.
	"""
	def __init__(self):
		"""
		The constructor function.
		"""

	def query(self, params, url=""):
		"""
		The function to send an request to the server given in the "url"
		parameter using the parameters found in params. If url is empty,
		DumpGenerator.urltoindex is used instead.

		A failed request is retried once after DumpGenerator.delay + 10
		seconds. If the retry fails as well, an error is shown and the script
		exits with code 2.

		Input:
		 - params: Parameters to the request to send, appended to url as
		   a GET request.
		 - url: The URL of index.php to query (optional).

		Returns
		 - Result of GET request.
		"""
		if ( url == "" ):
			url = DumpGenerator.urltoindex
		headers = { "User-Agent": DumpGenerator.UserAgent }
		paras = urllib.urlencode( params )
		# index.php does not support POST request, formulating a correct GET URL here
		queryurl = "%s?%s" % ( url, paras )
		request = urllib2.Request( queryurl, headers=headers )
		# TODO: Make urlopen follow redirects
		# Only ordinary errors are caught: a bare "except" would also swallow
		# KeyboardInterrupt, turning Ctrl-C into a retry and then an exit code 2.
		try:
			result = urllib2.urlopen( request )
		except Exception:
			try:
				# Add a little delay between requests if server is slow
				sleeptime = DumpGenerator.delay + 10
				Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) )
				time.sleep( sleeptime )
				result = urllib2.urlopen( request )
			except Exception:
				Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." )
				sys.exit(2)
		# Close the connection even when reading the response fails
		try:
			output = result.read()
		finally:
			result.close()
		return output

	def removeIP(self, content):
		"""
		Remove the user's IP address while fetching HTML pages.

		Anything that looks like a dotted IPv4 address is replaced by
		"0.0.0.0" and anything that looks like a full (eight groups) IPv6
		address by "0:0:0:0:0:0:0:0".

		Input:
		 - content: The text to clean up.

		Returns
		 - The text with the addresses replaced.
		"""
		# Remove IPv4 addresses
		content = re.sub( r"\d+\.\d+\.\d+\.\d+", "0.0.0.0", content )
		# Remove IPv6 addresses
		content = re.sub( r"(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}", "0:0:0:0:0:0:0:0", content )
		return content

class Updater:
	"""
	The class to auto-update the user's script to the latest version of DumpGenerator.
	"""
	# TODO: Get the script to check only occasionally, this is a performance concern
	def __init__(self):
		"""
		The constructor function.
		"""
		# Location of the controlling JSON file, and of its mirror
		self.controlUrl = "http://wikiteam.googlecode.com/svn/trunk/revnum.json"
		self.controlUrl2 = "https://raw.github.com/dumps/DumpGenerator/master/revnum.json"
		# The decoded controlling JSON file, filled in by checkRevision()
		self.result = {}

	def checkRevision(self):
		"""
		Check the current revision and ensure that it is up-to-date.

		update() is called when the latest version differs from
		DumpGenerator.Version, or when the revision of that version differs
		from DumpGenerator.revision. Nothing is done when the controlling JSON
		file cannot be downloaded or understood: the update check is only a
		convenience and must not stop the script.
		"""
		jsonresult = self.getRevisionJson()
		if ( jsonresult is False ):
			return
		try:
			result = json.loads( jsonresult )
			uptodate = ( result[ "latest" ] == DumpGenerator.Version and
				result[ "releases" ][ DumpGenerator.Version ][ "revision" ] == DumpGenerator.revision )
		except ( ValueError, KeyError, TypeError ):
			# Not JSON at all, or not laid out the way we expect
			Output.warn( "Unable to check if a new version of dumpgenerator.py is available, continuing..." )
			return
		self.result = result
		if not uptodate:
			self.update()

	def getRevisionJson(self):
		"""
		Download the controlling JSON file, falling back to the mirror when
		the first location cannot be fetched.

		Returns
		 - The content of the file, or False if neither location worked.
		"""
		headers = {'User-Agent': DumpGenerator.UserAgent}
		# TODO: Handle 404 errors
		for controlurl in ( self.controlUrl, self.controlUrl2 ):
			# "except Exception" rather than a bare except, so that Ctrl-C
			# still interrupts the script
			try:
				revjson = urllib2.urlopen( urllib2.Request( controlurl, headers=headers ) )
				try:
					return revjson.read()
				finally:
					revjson.close()
			except Exception:
				continue
		Output.warn( "Unable to check if a new version of dumpgenerator.py is available, continuing..." )
		return False

	def update(self):
		"""
		Update DumpGenerator.py to the current latest version

		The new version is downloaded into a temporary file first and copied
		over the running script (sys.argv[0]) only once the download has
		completed, so that a failed download does not leave a truncated script
		behind. The second download location is tried when the first fails.
		"""
		import shutil # Only needed here

		currentfile = sys.argv[0]
		try:
			latestver = self.result[ "latest" ]
			release = self.result[ "releases" ][ latestver ]
			latestrev = release[ "revision" ]
			downloadurls = ( release[ "downloadurl" ], release[ "downloadurl2" ] )
		except ( KeyError, TypeError ):
			Output.warn( "Unable to update DumpGenerator, skipping update for now..." )
			return
		updated = False
		# TODO: Handle 404 errors
		for downloadurl in downloadurls:
			try:
				# Without a file name, urlretrieve saves to a temporary file
				downloaded = urllib.urlretrieve( downloadurl )[0]
				shutil.copyfile( downloaded, currentfile )
			except Exception:
				continue
			updated = True
			break
		# Remove the temporary files left behind by urlretrieve
		urllib.urlcleanup()
		if ( updated == False ):
			Output.warn( "Unable to update DumpGenerator, skipping update for now..." )
		else:
			Output.message( "DumpGenerator was updated to %s (revision %s)! Changes will take effect on next run." % ( latestver, latestrev ) )

if __name__ == "__main__":
	# Class registry, for use throughout the whole script.
	# Each class name is rebound to one instance of that class, so from here
	# on a call such as Output.message( ... ) or RequestAPI.query( ... ) is a
	# method call on that shared instance (the methods take "self").
	# NOTE(review): the constructors of DumpGenerator, DumpImages, DumpLogs and
	# DumpXML are not visible here; keep this order unless they are confirmed
	# not to depend on which names are already rebound.
	RequestAPI = RequestAPI()
	RequestIndex = RequestIndex()
	DumpGenerator = DumpGenerator()
	DumpImages = DumpImages()
	DumpLogs = DumpLogs()
	DumpXML = DumpXML()
	Output = Output()
	Updater = Updater()

	# Start everything up
	DumpGenerator.run()