diff --git a/wikiteam3/dumpgenerator/dump/image/html_regexs.py b/wikiteam3/dumpgenerator/dump/image/html_regexs.py
new file mode 100644
index 0000000..4290397
--- /dev/null
+++ b/wikiteam3/dumpgenerator/dump/image/html_regexs.py
@@ -0,0 +1,28 @@
+REGEX_CANDIDATES = [
+ # Candidate regexps for parsing image-list HTML; the caller keeps the one with the most matches.
+ # archiveteam 1.15.1
+ # wikanda 1.15.5
+ r'(?im)\s*'
+
+ # wikijuegos 1.9.5
+ # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist
+ # (old MediaWiki version)
+ ,r'(?im)\s*\s*\s*'
+
+ # gentoowiki 1.18
+ ,r'(?im)'
+ # NOTE(review): the next pattern's `(?P[^>]+)` groups have no `<name>` and will not compile as written; the pattern text looks truncated -- verify against upstream.
+ # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
+ # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
+ ,'(?ism)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)'
+
+ ,(  # NOTE(review): no source wiki is recorded for this pattern
+ r'(?im)\s*'
+ r'\s*'
+ r'\s*'
+ r''
+ )
+]
diff --git a/wikiteam3/dumpgenerator/dump/image/image.py b/wikiteam3/dumpgenerator/dump/image/image.py
index a968ef4..0c0cbff 100644
--- a/wikiteam3/dumpgenerator/dump/image/image.py
+++ b/wikiteam3/dumpgenerator/dump/image/image.py
@@ -2,9 +2,10 @@ import os
import re
import sys
import urllib.parse
-from typing import *
+from typing import Dict, Iterable, List
from wikiteam3.dumpgenerator.cli import Delay
+from wikiteam3.dumpgenerator.dump.image.html_regexs import REGEX_CANDIDATES
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSizeError
from wikiteam3.dumpgenerator.api import getJSON
@@ -15,6 +16,7 @@ from wikiteam3.utils import sha1File
from wikiteam3.utils import cleanHTML, undoHTMLEntities
from wikiteam3.dumpgenerator.config import Config
+
class Image:
def getXMLFileDesc(config: Config=None, title="", session=None):
"""Get XML for image description page"""
@@ -241,39 +243,17 @@ class Image:
break
raw = cleanHTML(raw)
- # archiveteam 1.15.1
- # wikanda 1.15.5
- r_images1 = r'(?im)\s*'
- # wikijuegos 1.9.5
- # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
- # mediawiki version
- r_images2 = r'(?im)\s*\s*\s*'
- # gentoowiki 1.18
- r_images3 = r'(?im)'
- # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
- # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
- r_images4 = '(?im)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)'
- r_images5 = (
- r'(?im)\s*'
- r'\s*'
- r'\s*'
- r''
- )
# Select the regexp that returns more results
- regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
- count = 0
- i = 0
- regexp_best = 0
- for regexp in regexps:
- if len(re.findall(regexp, raw)) > count:
- count = len(re.findall(regexp, raw))
- regexp_best = i
- i += 1
- m = re.compile(regexps[regexp_best]).finditer(raw)
+ best_matched = 0
+ regexp_best = None
+ for regexp in REGEX_CANDIDATES:
+ _count = len(re.findall(regexp, raw))
+ if _count > best_matched:
+ best_matched = _count
+ regexp_best = regexp
+ assert regexp_best is not None, "Could not find a proper regexp to parse the HTML"
+ m = re.compile(regexp_best).finditer(raw)
# Iter the image results
for i in m: