refactor: reseparate logic and regex data

pull/475/head
yzqzss 11 months ago committed by yzqzss
parent eb1529a4c1
commit 0b8eff6c1f

@ -0,0 +1,28 @@
REGEX_CANDIDATES = [
# archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
# wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
# href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
# class="new" title="Usuario:Fernandocg (página no
# existe)">Fernandocg</a></td>
r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
# wikijuegos 1.9.5
# http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
# mediawiki version
,r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
# gentoowiki 1.18
,r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
# http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
# (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
,'(?ism)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
,(
r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
r'<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
r'<td class="TablePager_col_img_size">[^<]*?</td>\s*'
r'<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>'
)
]

@ -2,9 +2,10 @@ import os
import re
import sys
import urllib.parse
from typing import *
from typing import Dict, Iterable, List
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.dump.image.html_regexs import REGEX_CANDIDATES
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSizeError
from wikiteam3.dumpgenerator.api import getJSON
@ -15,6 +16,7 @@ from wikiteam3.utils import sha1File
from wikiteam3.utils import cleanHTML, undoHTMLEntities
from wikiteam3.dumpgenerator.config import Config
class Image:
def getXMLFileDesc(config: Config=None, title="", session=None):
"""Get XML for image description page"""
@ -241,39 +243,17 @@ class Image:
break
raw = cleanHTML(raw)
# archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
# wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
# href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
# class="new" title="Usuario:Fernandocg (página no
# existe)">Fernandocg</a></td>
r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
# wikijuegos 1.9.5
# http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
# mediawiki version
r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
# gentoowiki 1.18
r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
# http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
# (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
r_images4 = '(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
r_images5 = (
r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
r'<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
r'<td class="TablePager_col_img_size">[^<]*?</td>\s*'
r'<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>'
)
# Select the regexp that returns more results
regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
count = 0
i = 0
regexp_best = 0
for regexp in regexps:
if len(re.findall(regexp, raw)) > count:
count = len(re.findall(regexp, raw))
regexp_best = i
i += 1
m = re.compile(regexps[regexp_best]).finditer(raw)
best_matched = 0
regexp_best = None
for regexp in REGEX_CANDIDATES:
_count = len(re.findall(regexp, raw))
if _count > best_matched:
best_matched = _count
regexp_best = regexp
assert regexp_best is not None, "Could not find a proper regexp to parse the HTML"
m = re.compile(regexp_best).finditer(raw)
# Iter the image results
for i in m:

Loading…
Cancel
Save