# -*- coding: utf-8 -*-
# Copyright (C) 2011 emijrp
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import cPickle
import datetime
import getopt
import os
import re
import subprocess
import sys
import time
import urllib
import urllib2
# todo:
# curonly and all history (curonly can batch several requests into a single GET; for full history, request every page one by one)
# use the API, or parse the HTML when it is not available
# http://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export
# threads to download faster? request several pages at a time
# Special:Log? uploads, account creations, etc
# download Special:Version to save which extensions it uses
# save index.php (the main page) as index.html so the wiki licence shown at the bottom of it is preserved
# fix use api when available

def delay(config={}):
    if config['delay'] > 0:
        print 'Sleeping... %d seconds...' % (config['delay'])
        time.sleep(config['delay'])

def cleanHTML(raw=''):
    if re.search('<!-- bodytext -->', raw): #<!-- bodytext --> <!-- /bodytext --> <!-- start content --> <!-- end content -->
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
        raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0]
    else:
        print 'This wiki doesn\'t use marks to split the content'
        sys.exit()
    return raw

def getPageTitles(config={}, start='!'):
    #Get page titles parsing Special:Allpages or using API (fix)
    #http://en.wikipedia.org/wiki/Special:AllPages
    #http://archiveteam.org/index.php?title=Special:AllPages
    #http://www.wikanda.es/wiki/Especial:Todas
    print 'Loading page titles from namespaces =', ','.join([str(i) for i in config['namespaces']])
    
    #namespace checks and stuff
    #fix: get namespaces from a random Special:Export page, it is better
    namespacenames = {0: ''} # main is 0, no prefix
    namespaces = config['namespaces']
    if namespaces:
        raw = urllib.urlopen('%s?title=Special:Allpages' % (config['domain'])).read()
        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? to include selected="selected"
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
        else:
            #check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]
    
    #retrieve all titles from Special:Allpages; if the wiki is big, perhaps there are sub-Allpages to explore
    namespaces = [i for i in set(namespaces)] #uniques
    print '%d namespaces have been found' % (len(namespaces))
    
    titles = []
    for namespace in namespaces:
        print 'Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (config['domain'], namespace)
        raw = urllib.urlopen(url).read()
        raw = cleanHTML(raw)
        
        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
        deep = 3 # 3 is the current depth of English Wikipedia for Special:Allpages, 3 levels
        c = 0
        checked_suballpages = []
        rawacum = raw
        while re.search(r_suballpages, raw) and c < deep:
            #load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')
                to = i.group('to')
                name = '%s-%s' % (fr, to)
                if not name in checked_suballpages:
                    checked_suballpages.append(name)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['domain'], namespace, fr, to) #do not put urllib.quote in fr or to
                    raw2 = urllib.urlopen(url).read()
                    raw2 = cleanHTML(raw2)
                    rawacum += raw2 #merge it after removing the junk
                    print 'Detected sub-Allpages:', name, len(raw2), 'bytes', len(re.findall(r_title, raw2))
            c += 1
        
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            if not i.group('title').startswith('Special:'):
                if not i.group('title') in titles:
                    titles.append(i.group('title'))
    print '%d page titles loaded' % (len(titles))
    return titles

def getXMLHeader(config={}):
    #get the header of a random page, to attach it in the complete XML backup
    #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x....
    randomtitle = 'AMF5LKE43MNFGHKSDMRTJ'
    xml = getXMLPage(config=config, title=randomtitle)
    header = xml.split('</mediawiki>')[0]
    return header
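
# Note on getXMLHeader(): 'AMF5LKE43MNFGHKSDMRTJ' is simply a title that should not
# exist on the wiki, so Special:Export returns only the <mediawiki>/<siteinfo> preamble
# with no <page> element; everything before </mediawiki> is then reused as the header
# of the combined dump.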

def getXMLFileDesc(config={}, title=''):
    config['curonly'] = 1 #tricky to get only the most recent desc
    return getXMLPage(config=config, title=title)

def getXMLPage(config={}, title=''):
    #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
    limit = 1000
    truncated = False
    title_ = re.sub(' ', '_', title)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'}
    params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', }
    if config['curonly']:
        params['curonly'] = 1
    else:
        params['offset'] = '1'
        params['limit'] = limit
    data = urllib.urlencode(params)
    req = urllib2.Request(url=config['domain'], data=data, headers=headers)
    try:
        f = urllib2.urlopen(req)
    except:
        try:
            print 'Server is slow... Waiting some seconds and retrying...'
            time.sleep(10)
            f = urllib2.urlopen(req)
        except:
            print 'An error has occurred while retrieving', title
            print 'Please, resume the dump, --resume'
            sys.exit()
    xml = f.read()
    
    #if complete history, check if this page history has > limit edits; if so, retrieve all of them using offset if available
    #else, warn about Special:Export truncating large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
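    # Illustration (values are made up): the first request built above is roughly
    #   POST config['domain'] with title=Special:Export&pages=Some_page&action=submit&offset=1&limit=1000
    # and each follow-up request in the loop below replaces offset with the last
    # <timestamp> seen so far (e.g. offset=2011-04-07T13:14:37Z), until no new
    # timestamps appear or the wiki ignores the parameter.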
    if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in the xml to avoid analysing empty pages like Special:Allpages and the random one
        while not truncated and params['offset']:
            params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the accumulated XML
            data = urllib.urlencode(params)
            req2 = urllib2.Request(url=config['domain'], data=data, headers=headers)
            try:
                f2 = urllib2.urlopen(req2)
            except:
                try:
                    print 'Server is slow... Waiting some seconds and retrying...'
                    time.sleep(10)
                    f2 = urllib2.urlopen(req2)
                except:
                    print 'An error has occurred while retrieving', title
                    print 'Please, resume the dump, --resume'
                    sys.exit()
            xml2 = f2.read()
            if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk?
                if re.findall(r_timestamp, xml2)[-1] == params['offset']:
                    #again the same XML; this wiki does not support params in Special:Export, so offer the complete XML up to X edits (usually 1000)
                    print 'ATTENTION: This wiki does not allow some parameters in Special:Export, so pages with large histories may be truncated'
                    truncated = True
                    break
                else:
                    #offset works in this wiki, merge with the previous chunk of this page history and continue
                    xml = xml.split('</page>')[0] + xml2.split('<page>\n')[1]
            else:
                params['offset'] = '' #no more edits in this page history
    print title, len(re.findall(r_timestamp, xml)), 'edits'
    return xml

def cleanXML(xml=''):
    xml = xml.split('</siteinfo>\n')[1]
    xml = xml.split('</mediawiki>')[0]
    return xml
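
# cleanXML() strips the <siteinfo> header and the closing </mediawiki> tag from each
# exported page, so generateXMLDump() below can simply concatenate one header
# (from getXMLHeader), the cleaned <page> blocks, and a single closing footer.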

def generateXMLDump(config={}, titles=[], start=''):
    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
    header = getXMLHeader(config=config)
    footer = '</mediawiki>\n' #new line at the end
    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(domain=config['domain']), config['date'], config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True
    if start:
        #remove the last chunk of the xml dump (it is probably incomplete)
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
        xml = xmlfile.read()
        xmlfile.close()
        xml = xml.split('<title>%s</title>' % (start))[0]
        xml = '\n'.join(xml.split('\n')[:-2]) # removing the last incomplete <page>\n tag
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write('%s\n' % (xml))
        xmlfile.close()
    else:
        #requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.close()
    
    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for title in titles:
        if title == start: #start downloading from start, included
            lock = False
        if lock:
            continue
        delay(config=config)
        if c % 10 == 0:
            print 'Downloaded %d pages' % (c)
        xml = getXMLPage(config=config, title=title)
        xml = cleanXML(xml=xml)
        xmlfile.write(xml)
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    print 'XML dump saved at...', xmlfilename

def saveTitles(config={}, titles=[]):
    #save titles in a txt for resuming if needed
    titlesfilename = '%s-%s-titles.txt' % (domain2prefix(domain=config['domain']), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
    titles.append('--END--')
    titlesfile.write('\n'.join(titles))
    titlesfile.close()
    print 'Titles saved at...', titlesfilename

def saveImageFilenamesURL(config={}, images=[]):
    #save the list of images and their urls
    imagesfilename = '%s-%s-images.txt' % (domain2prefix(domain=config['domain']), config['date'])
    imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
    imagesfile.write('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]))
    imagesfile.write('\n--END--')
    imagesfile.close()
    print 'Image filenames and URLs saved at...', imagesfilename
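
# The images file written above is one "filename<TAB>url<TAB>uploader" line per image
# plus a final --END-- sentinel; main() splits these lines on tabs to resume an
# interrupted image dump.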

def getImageFilenamesURL(config={}, start='!'):
    #fix: start is only usable when parsing from the API; otherwise, reloading the whole list from Special:Imagelist is mandatory
    print 'Retrieving image filenames'
    r_next = r'(?<!&dir=prev)&offset=(?P<offset>\d+)&' # (?<! http://docs.python.org/library/re.html
    images = []
    offset = '29990101000000' #january 1, 2999
    while offset:
        url = '%s?title=Special:Imagelist&limit=5000&offset=%s' % (config['domain'], offset)
        raw = urllib.urlopen(url).read()
        raw = cleanHTML(raw)
        #archiveteam <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        #wikanda <td class="TablePager_col_img_user_text"><a href="/w/index.php?title=Usuario:Fernandocg&action=edit&redlink=1" class="new" title="Usuario:Fernandocg (página no existe)">Fernandocg</a></td>
        m = re.compile(r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>').finditer(raw)
        for i in m:
            url = i.group('url')
            if url[0] == '/': #relative URL
                if re.search(r'\.\./', url): #../ weird paths (see wikanda)
                    x = len(re.findall(r'\.\./', url)) + 1
                    url = '%s/%s' % ('/'.join(config['domain'].split('/')[:-x]), url.split('../')[-1])
                else:
                    url = '%s%s' % (config['domain'].split('/index.php')[0], url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename_ = re.sub(' ', '_', i.group('filename'))
            uploader = re.sub('_', ' ', i.group('uploader'))
            images.append([filename, url, uploader])
            #print filename, url
        if re.search(r_next, raw):
            offset = re.findall(r_next, raw)[0]
        else:
            offset = ''
    print 'Found %d images' % (len(images))
    images.sort()
    return images

def generateImageDump(config={}, images=[], start=''):
    #slurp all the images
    #save in a .tar?
    #keep in mind http://www.mediawiki.org/wiki/Manual:ImportImages.php
    #fix, download .desc ? YEP!
    #fix download the upload log too, for uploaders info and date
    print 'Retrieving images from "%s"' % (start and start or 'start')
    imagepath = '%s/images' % (config['path'])
    if os.path.isdir(imagepath):
        print 'An images directory already exists for this dump' #fix, resume?
    else:
        os.makedirs(imagepath)
    
    c = 0
    lock = True
    if not start:
        lock = False
    for filename, url, uploader in images:
        if filename == start: #start downloading from start, included
            lock = False
        if lock:
            continue
        delay(config=config)
        #saving the file
        urllib.urlretrieve(url, '%s/%s' % (imagepath, filename))
        #saving the description if any
        xmlfiledesc = getXMLFileDesc(config=config, title='Image:%s' % (filename))
        f = open('%s/%s.desc' % (imagepath, filename), 'w')
        if re.search(r'<text xml:space="preserve" />', xmlfiledesc):
            #empty desc
            xmlfiledesc = ''
        elif re.search(r'<text xml:space="preserve">', xmlfiledesc):
            xmlfiledesc = xmlfiledesc.split('<text xml:space="preserve">')[1].split('</text>')[0]
            xmlfiledesc = re.sub('&lt;', '<', xmlfiledesc) # probably only &lt; &gt; &amp; need conversion http://www.w3schools.com/html/html_entities.asp
            xmlfiledesc = re.sub('&gt;', '>', xmlfiledesc)
            xmlfiledesc = re.sub('&amp;', '&', xmlfiledesc)
        else: #failure when retrieving the desc?
            xmlfiledesc = ''
        f.write(xmlfiledesc)
        f.close()
        c += 1
        if c % 10 == 0:
            print 'Downloaded %d images' % (c)
    print 'Downloaded %d images' % (c)

def saveLogs(config={}):
    #get all logs from Special:Log
    """ parse
    <select name='type'>
    <option value="block">Bloqueos de usuarios</option>
    <option value="rights">Cambios de perfil de usuario</option>
    <option value="protect" selected="selected">Protecciones de páginas</option>
    <option value="delete">Registro de borrados</option>
    <option value="newusers">Registro de creación de usuarios</option>
    <option value="merge">Registro de fusiones</option>
    <option value="import">Registro de importaciones</option>
    <option value="patrol">Registro de revisiones</option>
    <option value="move">Registro de traslados</option>
    <option value="upload">Subidas de archivos</option>
    <option value="">Todos los registros</option>
    </select>
    """
    delay(config=config)

def domain2prefix(domain=''):
    domain = re.sub(r'(http://|www\.|/index\.php)', '', domain)
    domain = re.sub(r'/', '_', domain)
    domain = re.sub(r'\.', '', domain)
    domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
    return domain
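
# Example of the substitutions above: domain2prefix(domain='http://archiveteam.org/index.php')
# returns 'archiveteamorg', which is used as the filename prefix for the XML dump,
# the titles list and the images list.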

def loadConfig(config={}, configfilename=''):
    f = open('%s/%s' % (config['path'], configfilename), 'r')
    config = cPickle.load(f)
    f.close()
    return config

def saveConfig(config={}, configfilename=''):
    f = open('%s/%s' % (config['path'], configfilename), 'w')
    cPickle.dump(config, f)
    f.close()
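
# The config dict is pickled to <path>/config.txt when a new dump starts and read back
# by loadConfig() on --resume, which is why a resumed session ignores the current
# command line parameters (see main()).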

def welcome(config={}):
    print "-"*75
    print """Welcome to DumpGenerator 0.1 by WikiTeam (GPL v3)
More info at: http://code.google.com/p/wikiteam/"""
    print "-"*75

def bye(config={}):
    print "Your dump is in %s" % (config['path'])
    print "If you found any bug, report a new issue here (Gmail account required): http://code.google.com/p/wikiteam/issues/list"
    print "Good luck! Bye!"

def usage():
    print "Write a complete help"

def getParameters():
    config = {
        'curonly': False,
        'date': datetime.datetime.now().strftime('%Y%m%d'),
        'domain': '', #mandatory, set with --domain
        'images': False,
        'logs': False,
        'xml': False,
        'namespaces': ['all'],
        'path': '',
        'threads': 1, #fix not coded yet
        'delay': 0,
    }
    other = {
        'resume': False,
    }
    #console params
    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["h", "help", "path=", "domain=", "images", "logs", "xml", "curonly", "threads=", "resume", "delay="])
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
        elif o in ("--path"):
            config["path"] = a
            while len(config["path"]) > 0:
                if config["path"][-1] == '/': #could this cause problems with Windows paths?
                    config["path"] = config["path"][:-1]
                else:
                    break
            if not config["path"]:
                config["path"] = '.'
        elif o in ("--domain"):
            config["domain"] = a
        elif o in ("--images"):
            config["images"] = True
        elif o in ("--logs"):
            config["logs"] = True
        elif o in ("--xml"):
            config["xml"] = True
        elif o in ("--curonly"):
            if not config["xml"]:
                print "If you select --curonly, you must use --xml too"
                sys.exit()
            config["curonly"] = True
        elif o in ("--threads"):
            config["threads"] = int(a)
        elif o in ("--resume"):
            other["resume"] = True
        elif o in ("--delay"):
            config["delay"] = int(a)
        else:
            assert False, "unhandled option"
    #missing mandatory params
    if not config["domain"] or \
       not (config["xml"] or config["images"] or config["logs"]):
        print """Error. You forgot some mandatory parameters:
    --domain: URL to index.php in the wiki (examples: http://en.wikipedia.org/w/index.php or http://archiveteam.org/index.php)
And one of these, or two, or three:
    --xml: it generates an XML dump. It retrieves the full history of pages located in namespace=0 (articles)
           If you want more namespaces, use the parameter --namespaces=0,1,2,3... or --namespaces=all
    --images: it generates an image dump
    --logs: it generates a log dump
Write --help for help."""
        sys.exit()
    #usage()
    
    #calculating the path, if not defined by the user with --path=
    if not config['path']:
        config['path'] = './%s-%s-wikidump' % (domain2prefix(domain=config['domain']), config['date'])
    
    return config, other

def main():
    configfilename = 'config.txt'
    config, other = getParameters()
    welcome(config=config)
    
    #notice about wikipedia dumps
    if re.findall(r'(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews)\.org', config['domain']):
        print 'DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\nDownload the dumps from http://download.wikimedia.org\nThanks!'
        sys.exit()
    
    print 'Analysing %s' % (config['domain'])
    
    #creating the path, or resuming if desired
    c = 2
    originalpath = config['path'] # to avoid concatenating blabla-2, blabla-2-3, and so on...
    while not other['resume'] and os.path.isdir(config['path']): #do not enter if a resume was requested from the beginning
        print '\nWarning!: "%s" path exists' % (config['path'])
        reply = raw_input('There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], otherwise no)? ' % (config['path'], config['path'], configfilename))
        if reply.lower() in ['yes', 'y']:
            if not os.path.isfile('%s/%s' % (config['path'], configfilename)):
                print 'No config file found. I can\'t resume. Aborting.'
                sys.exit()
            print 'You have selected YES'
            other['resume'] = True
            break
        else:
            print 'You have selected NO'
            config['path'] = '%s-%d' % (originalpath, c)
            print 'Trying "%s"...' % (config['path'])
            c += 1
    
    if other['resume']:
        print 'Loading config file...'
        config = loadConfig(config=config, configfilename=configfilename)
    else:
        os.mkdir(config['path'])
        saveConfig(config=config, configfilename=configfilename)
    
    titles = []
    images = []
    if other['resume']:
        print 'Resuming previous dump process...'
        if config['xml']:
            #load titles
            lasttitle = ''
            try:
                f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(domain=config['domain']), config['date']), 'r')
                raw = f.read()
                titles = raw.split('\n')
                lasttitle = titles[-1]
                f.close()
            except:
                pass #probably the file does not exist
            if lasttitle == '--END--':
                #titles list is complete
                print 'Title list was completed in the previous session'
            else:
                #start = last
                #remove complete namespaces and then getPageTitles(config=config, start=start)
                #titles += getPageTitles(config=config, start=last)
                print 'Title list is incomplete. Resuming...'
                #search last
                last = 'lastline'
                titles = titles[:-1] #removing the last one; the next line appends from start, and start is inclusive
                titles += getPageTitles(config=config, start='!') #fix: try to resume instead of reloading everything; change start='!' and develop the feature in getPageTitles()
                saveTitles(config=config, titles=titles)
            #checking the xml dump
            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(domain=config['domain']), config['date'], config['curonly'] and 'current' or 'history'), 'r')
            xml = f.read()
            f.close()
            if re.findall('</mediawiki>', xml):
                #xml dump is complete
                print 'XML dump was completed in the previous session'
            else:
                xmltitles = re.findall(r'<title>([^<]+)</title>', xml)
                lastxmltitle = ''
                if xmltitles:
                    lastxmltitle = xmltitles[-1]
                generateXMLDump(config=config, titles=titles, start=lastxmltitle)
        if config['images']:
            #load the images list
            lastimage = ''
            try:
                f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(domain=config['domain']), config['date']), 'r')
                raw = f.read()
                lines = raw.split('\n')
                for l in lines:
                    if re.search(r'\t', l):
                        images.append(l.split('\t'))
                lastimage = lines[-1]
                f.close()
            except:
                pass #probably the file does not exist
            if lastimage == '--END--':
                print 'Image list was completed in the previous session'
            else:
                print 'Image list is incomplete. Resuming...'
                images = images[:-1] #removing the last one; the next line appends from start, and start is inclusive
                images += getImageFilenamesURL(config=config, start='!') #fix: develop start when using the API; if using Special:Imagelist, ignore start and reload everything
                saveImageFilenamesURL(config=config, images=images)
            #checking the images directory
            listdir = []
            try:
                listdir = os.listdir('%s/images' % (config['path']))
            except:
                pass #probably the directory does not exist
            listdir.sort()
            complete = True
            lastfilename = ''
            lastfilename2 = ''
            c = 0
            for filename, url, uploader in images:
                if filename not in listdir:
                    complete = False
                    lastfilename2 = lastfilename
                    lastfilename = filename
                    break
                c += 1
            print '%d images were found in the directory from a previous session' % (c)
            lastfilename2 = lastfilename # we resume from the previous image, which may have been corrupted when the previous session was aborted with ctrl-c
            if complete:
                #image dump is complete
                print 'Image dump was completed in the previous session'
            else:
                generateImageDump(config=config, images=images, start=lastfilename)
        if config['logs']:
            #fix
            pass
    else:
        print 'Trying to generate a new dump into a new directory...'
        if config['xml']:
            titles += getPageTitles(config=config, start='!')
            saveTitles(config=config, titles=titles)
            generateXMLDump(config=config, titles=titles)
        if config['images']:
            images += getImageFilenamesURL(config=config) #fix: add start like above
            saveImageFilenamesURL(config=config, images=images)
            generateImageDump(config=config, images=images)
        if config['logs']:
            saveLogs(config=config)
    
    bye(config=config)

if __name__ == "__main__":
    main()