refactor: reseparate logic and regex data

11 months ago · 0b8eff6c1f
parent eb1529a4c1
commit 0b8eff6c1f
2 changed files with 40 additions and 32 deletions
--- a/wikiteam3/dumpgenerator/dump/image/html_regexs.py
+++ b/wikiteam3/dumpgenerator/dump/image/html_regexs.py
@ -0,0 +1,28 @@
+REGEX_CANDIDATES = [
+
+    # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
+    # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
+    # href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
+    # class="new" title="Usuario:Fernandocg (página no
+    # existe)">Fernandocg</a></td>
+    r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
+
+    # wikijuegos 1.9.5
+    # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
+    # mediawiki version
+    ,r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
+
+    # gentoowiki 1.18
+    ,r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
+
+    # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
+    # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
+    ,'(?ism)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
+
+    ,(
+        r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
+        r'<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
+        r'<td class="TablePager_col_img_size">[^<]*?</td>\s*'
+        r'<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>'
+    )
+]
--- a/wikiteam3/dumpgenerator/dump/image/image.py
+++ b/wikiteam3/dumpgenerator/dump/image/image.py
@ -2,9 +2,10 @@ import os
 import re
 import sys
 import urllib.parse
-from typing import *
+from typing import Dict, Iterable, List

 from wikiteam3.dumpgenerator.cli import Delay
+from wikiteam3.dumpgenerator.dump.image.html_regexs import REGEX_CANDIDATES
 from wikiteam3.utils import domain2prefix
 from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSizeError
 from wikiteam3.dumpgenerator.api import getJSON
@ -15,6 +16,7 @@ from wikiteam3.utils import sha1File
 from wikiteam3.utils import cleanHTML, undoHTMLEntities
 from wikiteam3.dumpgenerator.config import Config

+
 class Image:
    def getXMLFileDesc(config: Config=None, title="", session=None):
        """Get XML for image description page"""
@ -241,39 +243,17 @@ class Image:
                    break

            raw = cleanHTML(raw)
-            # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
-            # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
-            # href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
-            # class="new" title="Usuario:Fernandocg (página no
-            # existe)">Fernandocg</a></td>
-            r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
-            # wikijuegos 1.9.5
-            # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
-            # mediawiki version
-            r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
-            # gentoowiki 1.18
-            r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
-            # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
-            # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
-            r_images4 = '(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
-            r_images5 = (
-                r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
-                r'<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
-                r'<td class="TablePager_col_img_size">[^<]*?</td>\s*'
-                r'<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>'
-            )

            # Select the regexp that returns more results
-            regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
-            count = 0
-            i = 0
-            regexp_best = 0
-            for regexp in regexps:
-                if len(re.findall(regexp, raw)) > count:
-                    count = len(re.findall(regexp, raw))
-                    regexp_best = i
-                i += 1
-            m = re.compile(regexps[regexp_best]).finditer(raw)
+            best_matched = 0
+            regexp_best = None
+            for regexp in REGEX_CANDIDATES:
+                _count = len(re.findall(regexp, raw))
+                if _count > best_matched:
+                    best_matched = _count
+                    regexp_best = regexp
+            assert regexp_best is not None, "Could not find a proper regexp to parse the HTML"
+            m = re.compile(regexp_best).finditer(raw)

            # Iter the image results
            for i in m: