diff --git a/wikiteam3/dumpgenerator/dump/image/html_regexs.py b/wikiteam3/dumpgenerator/dump/image/html_regexs.py new file mode 100644 index 0000000..4290397 --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/image/html_regexs.py @@ -0,0 +1,28 @@ +REGEX_CANDIDATES = [ + + # archiveteam 1.15.1 Yahoovideo.jpg (file) + # wikanda 1.15.5 Fernandocg + r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' + + # wikijuegos 1.9.5 + # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old + # mediawiki version + ,r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' + + # gentoowiki 1.18 + ,r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' + + # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= + # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
+ ,'(?ism)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' + + ,( + r'(?im)\s*]*?>(?P[^>]+)\s*\([^<]*?\s*\)\s*\s*' + r'[^\n\r]*?\s*' + r'[^<]*?\s*' + r'\s*()?(?P[^<]+?)()?\s*' + ) +] diff --git a/wikiteam3/dumpgenerator/dump/image/image.py b/wikiteam3/dumpgenerator/dump/image/image.py index a968ef4..0c0cbff 100644 --- a/wikiteam3/dumpgenerator/dump/image/image.py +++ b/wikiteam3/dumpgenerator/dump/image/image.py @@ -2,9 +2,10 @@ import os import re import sys import urllib.parse -from typing import * +from typing import Dict, Iterable, List from wikiteam3.dumpgenerator.cli import Delay +from wikiteam3.dumpgenerator.dump.image.html_regexs import REGEX_CANDIDATES from wikiteam3.utils import domain2prefix from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSizeError from wikiteam3.dumpgenerator.api import getJSON @@ -15,6 +16,7 @@ from wikiteam3.utils import sha1File from wikiteam3.utils import cleanHTML, undoHTMLEntities from wikiteam3.dumpgenerator.config import Config + class Image: def getXMLFileDesc(config: Config=None, title="", session=None): """Get XML for image description page""" @@ -241,39 +243,17 @@ class Image: break raw = cleanHTML(raw) - # archiveteam 1.15.1 Yahoovideo.jpg (file) - # wikanda 1.15.5 Fernandocg - r_images1 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' - # wikijuegos 1.9.5 - # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old - # mediawiki version - r_images2 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' - # gentoowiki 1.18 - r_images3 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' - # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
- r_images4 = '(?im)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' - r_images5 = ( - r'(?im)\s*]*?>(?P[^>]+)\s*\([^<]*?\s*\)\s*\s*' - r'[^\n\r]*?\s*' - r'[^<]*?\s*' - r'\s*()?(?P[^<]+?)()?\s*' - ) # Select the regexp that returns more results - regexps = [r_images1, r_images2, r_images3, r_images4, r_images5] - count = 0 - i = 0 - regexp_best = 0 - for regexp in regexps: - if len(re.findall(regexp, raw)) > count: - count = len(re.findall(regexp, raw)) - regexp_best = i - i += 1 - m = re.compile(regexps[regexp_best]).finditer(raw) + best_matched = 0 + regexp_best = None + for regexp in REGEX_CANDIDATES: + _count = len(re.findall(regexp, raw)) + if _count > best_matched: + best_matched = _count + regexp_best = regexp + assert regexp_best is not None, "Could not find a proper regexp to parse the HTML" + m = re.compile(regexp_best).finditer(raw) # Iter the image results for i in m: