Compare commits

...

11 Commits

Author SHA1 Message Date
nemobis c7150784c1
Merge pull request #452 from yzqzss/patch-4
Update dumpgenerator.py
12 months ago
nemobis a977dc1a8b
Merge pull request #439 from Pokechu22/page-title-scraper-fix
Fix infinite loop on page title scraper
12 months ago
nemobis c56cbf1c12
Merge pull request #453 from yzqzss/patch-5
Speed up file scanning in `images/` dir
12 months ago
nemobis 674381c27c
Merge pull request #448 from yzqzss/patch-1
Match single quotes too when scraping namespaces
12 months ago
nemobis 8167987052
Merge pull request #451 from yzqzss/patch-2
Quote `title` to get correct file description
12 months ago
yzqzss 392fbce083
speed up file scanning
Use a `set` instead of a `list` to speed up scanning of large numbers of files (>10000) in `images/`.
1 year ago
yzqzss 940d50bbac
Update dumpgenerator.py
fix typo
1 year ago
yzqzss ebac66f557
Update dumpgenerator.py 1 year ago
yzqzss 0be46c7427
quote `title` 1 year ago
yzqzss 331f8e122b
Update regex to match both `'` and `"` in <option> tags
Newer versions of MediaWiki use `'`; older versions use `"`.
1 year ago
Pokechu22 a1bd3b0851 Fix infinite loop on page title scraper 2 years ago

@ -170,7 +170,7 @@ def getNamespacesScraper(config={}, session=None):
# [^>]*? to include selected="selected"
m = re.compile(
r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
r'<option [^>]*?value=[\'"](?P<namespaceid>\d+)[\'"][^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
if 'all' in namespaces:
namespaces = []
for i in m:
@ -287,9 +287,9 @@ def getPageTitlesScraper(config={}, session=None):
r_title = r'title="(?P<title>[^>]+)">'
r_suballpages = ''
r_suballpages1 = r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
r_suballpages3 = r'&amp;from=(?P<from>[^>]+)" title="[^>]+">'
r_suballpages1 = r'&amp;from=(?P<from>[^>"]+)&amp;to=(?P<to>[^>"]+)">'
r_suballpages2 = r'Special:Allpages/(?P<from>[^>"]+)">'
r_suballpages3 = r'&amp;from=(?P<from>[^>"]+)" title="[^>]+">'
if re.search(r_suballpages1, raw):
r_suballpages = r_suballpages1
elif re.search(r_suballpages2, raw):
@ -299,7 +299,7 @@ def getPageTitlesScraper(config={}, session=None):
else:
pass # perhaps no subpages
# Should be enought subpages on Special:Allpages
# Should be enough subpages on Special:Allpages
deep = 50
c = 0
oldfr = ''
@ -321,8 +321,8 @@ def getPageTitlesScraper(config={}, session=None):
name = '%s-%s' % (fr, to)
url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
config['index'], namespace, fr, to) # do not put urllib.quote in fr or to
# fix, esta regexp no carga bien todas? o falla el r_title en
# este tipo de subpag? (wikiindex)
# fix, this regexp doesn't properly save everything? or does r_title fail on this
# type of subpage? (wikiindex)
elif r_suballpages == r_suballpages2:
# clean &amp;namespace=\d, sometimes happens
fr = fr.split('&amp;namespace=')[0]
@ -1519,7 +1519,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
title = u'Image:%s' % (filename)
try:
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % urllib.parse.quote(title))
xmlfiledesc = r.text
else:
xmlfiledesc = getXMLFileDesc(
@ -2250,7 +2250,7 @@ def resumePreviousDump(config={}, other={}):
listdir = os.listdir('%s/images' % (config['path']))
except:
pass # probably directory does not exist
listdir.sort()
listdir = set(listdir)
complete = True
lastfilename = ''
lastfilename2 = ''
@ -2528,7 +2528,7 @@ def main(params=[]):
print '\nWarning!: "%s" path exists' % (config['path'])
reply = ''
if config['failfast']:
retry = 'yes'
reply = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input(
'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %

Loading…
Cancel
Save