Wikipedia: handle images in changed Wikipedia HTML

The HTML we get from Wikipedia has recently changed,
which has caused images to not be detected and shown
in DictQuickLookup, and to be missing or ugly in
saved EPUBs. Update our code and CSS to display them
again.
reviewable/pr10700/r3^2
poire-z 11 months ago
parent eecdac930e
commit 4fa278ff0b

@ -58,6 +58,7 @@ local Wikipedia = {
prop = "text|sections|displaytitle|revid", prop = "text|sections|displaytitle|revid",
-- page = nil, -- text to lookup, will be added below -- page = nil, -- text to lookup, will be added below
-- disabletoc = "", -- if we want to remove toc IN html -- disabletoc = "", -- if we want to remove toc IN html
-- 20230722: there is no longer the TOC in the html no matter this param
disablelimitreport = "", disablelimitreport = "",
disableeditsection = "", disableeditsection = "",
}, },
@ -306,14 +307,15 @@ function Wikipedia:getFullPageImages(wiki_title, lang)
local wiki_base_url = self:getWikiServer(lang) local wiki_base_url = self:getWikiServer(lang)
local thumbs = {} -- bits of HTML containing an image local thumbs = {} -- bits of HTML containing an image
-- We first try to catch images in <div class=thumbinner>, which should exclude -- We first try to catch images in <figure>, which should exclude
-- wikipedia icons, flags... These seem to all end with a double </div>. -- wikipedia icons, flags...
for thtml in html:gmatch([[<div class="thumbinner".-</div>%s*</div>]]) do -- (We want to match both typeof="mw:File/Thumb" and typeof="mw:File/Frame", so this [TF][hr][ua]m[be]...
for thtml in html:gmatch([[<figure [^>]*typeof="mw:File/[TF][hr][ua]m[be]"[^>]*>.-</figure>]]) do
table.insert(thumbs, thtml) table.insert(thumbs, thtml)
end end
-- We then also try to catch images in galleries (which often are less -- We then also try to catch images in galleries (which often are less
-- interesting than those in thumbinner) as a 2nd set. -- interesting than those in thumbinner) as a 2nd set.
for thtml in html:gmatch([[<li class="gallerybox".-<div class="thumb".-</div>%s*</div>%s*<div class="gallerytext">.-</div>%s*</div>]]) do for thtml in html:gmatch([[<li class="gallerybox".-<div class="thumb".-</div>%s*<div class="gallerytext">.-</div>]]) do
table.insert(thumbs, thtml) table.insert(thumbs, thtml)
end end
-- We may miss some interesting images in the page's top right table, but -- We may miss some interesting images in the page's top right table, but
@ -322,7 +324,7 @@ function Wikipedia:getFullPageImages(wiki_title, lang)
for _, thtml in ipairs(thumbs) do for _, thtml in ipairs(thumbs) do
-- We get <a href="/wiki/File:real_file_name.jpg (or /wiki/Fichier:real_file_name.jpg -- We get <a href="/wiki/File:real_file_name.jpg (or /wiki/Fichier:real_file_name.jpg
-- depending on Wikipedia lang) -- depending on Wikipedia lang)
local filename = thtml:match([[<a href="/wiki/[^:]*:([^"]*)" class="image"]]) local filename = thtml:match([[<a href="/wiki/[^:]*:([^"]*)" class="mw.file.description"]])
if filename then if filename then
filename = url.unescape(filename) filename = url.unescape(filename)
end end
@ -975,17 +977,36 @@ ol.references {
} }
/* Show a box around image thumbnails */ /* Show a box around image thumbnails */
div.thumb { figure[typeof~='mw:File/Thumb'],
figure[typeof~='mw:File/Frame'] {
display: table;
border: dotted 1px black; border: dotted 1px black;
margin: 0.5em 2.5em 0.5em 2.5em; margin: 0.5em 2.5em 0.5em 2.5em;
padding: 0.5em 0.5em 0.2em 0.5em; padding: 0 0.5em 0 0.5em;
padding-top: ]].. (include_images and "0.5em" or "0.15em") .. [[; padding-top: ]].. (include_images and "0.5em" or "0") .. [[;
text-align: center; text-align: center;
font-size: 90%; font-size: 90%;
page-break-inside: avoid; page-break-inside: avoid;
-cr-only-if: float-floatboxes;
max-width: 50vw; /* ensure we never take half of screen width */
-cr-only-if: -float-floatboxes;
width: 100% !important;
-cr-only-if: legacy;
display: block;
}
figure[typeof~='mw:File/Thumb'] > figcaption,
figure[typeof~='mw:File/Frame'] > figcaption {
display: table-caption;
caption-side: bottom;
padding: 0.2em 0.5em 0.2em 0.5em;
/* No padding-top if image, as the image's strut brings some enough spacing */
padding-top: ]].. (include_images and "0" or "0.2em") .. [[;
-cr-only-if: legacy;
display: block;
} }
/* Allow main thumbnails to float */ /* Allow main thumbnails to float, preferably on the right */
body > div > div.thumb { body > div > figure[typeof~='mw:File/Thumb'],
body > div > figure[typeof~='mw:File/Frame'] {
float: right !important; float: right !important;
/* Change some of their styles when floating */ /* Change some of their styles when floating */
-cr-only-if: float-floatboxes; -cr-only-if: float-floatboxes;
@ -996,39 +1017,48 @@ body > div > div.thumb {
-cr-only-if: float-floatboxes -allow-style-w-h-absolute-units; -cr-only-if: float-floatboxes -allow-style-w-h-absolute-units;
width: 33% !important; width: 33% !important;
} }
body > div:dir(rtl) > div.thumb { /* invert if RTL */ /* invert if RTL */
body > div:dir(rtl) > figure[typeof~='mw:File/Thumb'],
body > div:dir(rtl) > figure[typeof~='mw:File/Frame'] {
float: left !important; float: left !important;
-cr-only-if: float-floatboxes; -cr-only-if: float-floatboxes;
clear: left; clear: left;
margin: 0 0.5em 0.2em 0 !important; margin: 0 0.5em 0.2em 0 !important;
} }
/* Allow original mix of left/right floats in web mode */ /* Allow original mix of left/right floats in web mode */
body > div > div.thumb.tleft { body > div > figure[typeof~='mw:File/Thumb'].mw-halign-left,
body > div > figure[typeof~='mw:File/Frame'].mw-halign-left {
-cr-only-if: float-floatboxes allow-style-w-h-absolute-units; -cr-only-if: float-floatboxes allow-style-w-h-absolute-units;
float: left !important; float: left !important;
clear: left; clear: left;
margin: 0 0.5em 0.2em 0 !important; margin: 0 0.5em 0.2em 0 !important;
} }
body > div > div.thumb.tright { body > div > figure[typeof~='mw:File/Thumb'].mw-halign-right,
body > div > figure[typeof~='mw:File/Frame'].mw-halign-right {
-cr-only-if: float-floatboxes allow-style-w-h-absolute-units; -cr-only-if: float-floatboxes allow-style-w-h-absolute-units;
float: right !important; float: right !important;
clear: right; clear: right;
margin: 0 0 0.2em 0.5em !important; margin: 0 0 0.2em 0.5em !important;
} }
body > div > figure[typeof~='mw:File/Thumb'] > img,
body > div > div.thumb img { body > div > figure[typeof~='mw:File/Frame'] > img {
/* Make float's inner images 100% of their container's width when not in "web" mode */ /* Make float's inner images 100% of their container's width when not in "web" mode */
-cr-only-if: float-floatboxes -allow-style-w-h-absolute-units; -cr-only-if: float-floatboxes -allow-style-w-h-absolute-units;
width: 100% !important; width: 100% !important;
height: 100% !important; height: 100% !important;
} }
/* For centered figure, we need to reset a few things, and to not
/* Some other (usually wide) thumbnails are wrapped in a DIV.center: * use display:table if we want them wide and centered */
* avoid having them overflowing in web mode (no issue in other modes). body > div > figure[typeof~='mw:File/Thumb'].mw-halign-center,
* (Use "width: auto", as crengine does not support "max-width:") */ body > div > figure[typeof~='mw:File/Frame'].mw-halign-center {
body > div > div.center > div.thumb * { display: block;
-cr-only-if: allow-style-w-h-absolute-units; float: none !important;
width: auto !important; margin: 0.5em 2.5em 0.5em 2.5em !important;
max-width: none;
}
body > div > figure[typeof~='mw:File/Thumb'].mw-halign-center > figcaption,
body > div > figure[typeof~='mw:File/Frame'].mw-halign-center > figcaption{
display: block;
} }
/* Style gallery and the galleryboxes it contains. /* Style gallery and the galleryboxes it contains.
@ -1081,6 +1111,7 @@ li.gallerybox {
-cr-only-if: -float-floatboxes; -cr-only-if: -float-floatboxes;
width: 100% !important; /* flat mode: force full width */ width: 100% !important; /* flat mode: force full width */
-cr-only-if: float-floatboxes; -cr-only-if: float-floatboxes;
font-size: 80%;
/* Remove our wide horizontal margins in book/web modes */ /* Remove our wide horizontal margins in book/web modes */
margin: 0.5em 0.5em 0.5em 0.5em !important; margin: 0.5em 0.5em 0.5em 0.5em !important;
-cr-only-if: float-floatboxes -allow-style-w-h-absolute-units; -cr-only-if: float-floatboxes -allow-style-w-h-absolute-units;
@ -1097,6 +1128,7 @@ li.gallerybox div.thumb {
border: solid 1px white; border: solid 1px white;
margin: 0; margin: 0;
padding: 0; padding: 0;
height: auto !important;
} }
li.gallerybox div.thumb div { li.gallerybox div.thumb div {
/* Override this one often set in style="" with various values */ /* Override this one often set in style="" with various values */

Loading…
Cancel
Save