Wikipedia Save as EPUB: various encoding fixes (#3851)
* Wiki Save as EPUB: various encoding fixes

  Fix display of & in article titles.
  Fix display of &, <, > in TOC entries and in the targeted anchor (the
  mismatch with the target id made these TOC entries invalid and simply
  not displayed).
  Remove the percent-encoded URL tweaks for crengine now that crengine
  correctly supports them (each percent encode handled as a UTF8 byte).
  Bump crengine for that.
  Don't include <meta name="cover"> when no cover is present.

* bump base/crengine
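For context, a minimal standalone Lua sketch (not part of the commit) of the core problem the title fixes address: a raw '&' dropped into XML is invalid markup, so it must be escaped before being embedded:

-- Standalone illustration: "AT&T" placed as-is into XML is invalid
-- ("&T" parses as a broken entity), so '&' must become '&amp;' first.
local title = "AT&T"
local escaped = title:gsub("&", "&amp;")
print(string.format("<dc:title>%s</dc:title>", escaped))
-- prints: <dc:title>AT&amp;T</dc:title>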
This commit is contained in:
parent 3585067796
commit 305e75c5ea
base
@@ -1 +1 @@
-Subproject commit 3f0084f1d7457941303a29c576268944c47df071
+Subproject commit 44d4474779a9fcd66e5e265203b82bc35830f56f
frontend/ui/wikipedia.lua
@@ -668,6 +668,12 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     local html = phtml.text["*"] -- html content
     local page_cleaned = page:gsub("_", " ") -- page title
     local page_htmltitle = phtml.displaytitle -- page title with possible <sup> tags
+    -- We need to encode plain '&' in those so we can put them in XML/HTML
+    -- We wouldn't need to escape as many as util.htmlEntitiesToUtf8() does, but
+    -- we need to, so we don't mess up existing ones ('&nbsp;' may happen) with our '&amp;'
+    -- encodes. (We don't escape < or > as these JSON strings may contain HTML tags)
+    page_cleaned = util.htmlEntitiesToUtf8(page_cleaned):gsub("&", "&amp;")
+    page_htmltitle = util.htmlEntitiesToUtf8(page_htmltitle):gsub("&", "&amp;")
     local sections = phtml.sections -- Wikipedia provided TOC
     local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
     -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
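The decode-then-escape order above matters: util.htmlEntitiesToUtf8() first turns any entities already present into plain UTF-8, so the following gsub escapes every '&' exactly once. A sketch of the idea, where decodeBasicEntities is a hypothetical stand-in for the real decoder:

-- Hypothetical stand-in for util.htmlEntitiesToUtf8 (handles only &amp;).
local function decodeBasicEntities(s)
    return (s:gsub("&amp;", "&"))
end
-- Decoding first prevents an existing "&amp;" from being double-escaped
-- into "&amp;amp;" by the '&' -> '&amp;' pass.
local raw = "Tom &amp; Jerry & friends"
local safe = decodeBasicEntities(raw):gsub("&", "&amp;")
print(safe) -- Tom &amp; Jerry &amp; friends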
@@ -678,7 +684,7 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     local images = {}
     local seen_images = {}
     local imagenum = 1
-    local cover_imgid = "" -- best candidate for cover among our images
+    local cover_imgid = nil -- best candidate for cover among our images
     local processImg = function(img_tag)
         local src = img_tag:match([[src="([^"]*)"]])
         if src == nil or src == "" then
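The "" -> nil change is what lets the code below use plain truthiness tests (see the next hunks): in Lua the empty string is truthy, so only nil makes `if cover_imgid` and `if not cover_imgid` behave as "cover found yet?" checks. A quick standalone illustration:

-- In Lua only nil and false are falsy; "" is truthy.
print("" and "truthy" or "falsy")   --> truthy
print(nil and "truthy" or "falsy")  --> falsy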
@@ -753,7 +759,7 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
         table.insert(images, cur_image)
         seen_images[src] = cur_image
         -- Use first image of reasonable size (not an icon) and portrait-like as cover-image
-        if cover_imgid == "" and width and width > 50 and height and height > 50 and height > width then
+        if not cover_imgid and width and width > 50 and height and height > 50 and height > width then
            cover_imgid = imgid
         end
         imagenum = imagenum + 1
@@ -847,6 +853,10 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     end
     local content_opf_parts = {}
     -- head
+    local meta_cover = "<!-- no cover image -->"
+    if include_images and cover_imgid then
+        meta_cover = string.format([[<meta name="cover" content="%s"/>]], cover_imgid)
+    end
     table.insert(content_opf_parts, string.format([[
 <?xml version='1.0' encoding='utf-8'?>
 <package xmlns="http://www.idpf.org/2007/opf"
@@ -858,13 +868,13 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     <dc:identifier id="bookid">%s</dc:identifier>
     <dc:language>%s</dc:language>
     <dc:publisher>%s</dc:publisher>
-    <meta name="cover" content="%s"/>
+    %s
   </metadata>
   <manifest>
     <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
     <item id="content" href="content.html" media-type="application/xhtml+xml"/>
     <item id="css" href="stylesheet.css" media-type="text/css"/>
-]], page_cleaned, lang:upper(), bookid, lang, koreader_version, cover_imgid))
+]], page_cleaned, lang:upper(), bookid, lang, koreader_version, meta_cover))
     -- images files
     if include_images then
         for inum, img in ipairs(images) do
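Net effect of the two hunks above, sketched standalone: the OPF metadata only carries a <meta name="cover"> entry when a cover image actually exists, instead of emitting one that points at an empty id (coverMeta is illustrative, not a function from the codebase):

-- Illustrative helper mirroring the diff's meta_cover logic.
local function coverMeta(cover_imgid)
    if cover_imgid then
        return string.format([[<meta name="cover" content="%s"/>]], cover_imgid)
    end
    return "<!-- no cover image -->"
end
print(coverMeta("img1")) --> <meta name="cover" content="img1"/>
print(coverMeta(nil))    --> <!-- no cover image -->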
@@ -970,9 +980,17 @@ time, abbr, sup {
     -- Wikipedia sections items seem to be already sorted by index, so no need to sort
     for isec, s in ipairs(sections) do
         num = num + 1
-        local s_anchor = s.anchor
+        -- Some chars in headings are converted to html entities in the
+        -- wikipedia-generated HTML. We need to do the same in TOC links
+        -- for the links to be valid.
+        local s_anchor = s.anchor:gsub("&", "&amp;"):gsub('"', "&quot;"):gsub(">", "&gt;"):gsub("<", "&lt;")
         local s_title = string.format("%s %s", s.number, s.line)
-        s_title = (s_title:gsub("(%b<>)", "")) -- titles may include <i> and other html tags
+        -- Titles may include <i> and other html tags: let's remove them as
+        -- our TOC can only display text
+        s_title = (s_title:gsub("(%b<>)", ""))
+        -- We need to do as for page_htmltitle above. But headings can contain
+        -- html entities for < and > that we need to put back as html entities
+        s_title = util.htmlEntitiesToUtf8(s_title):gsub("&", "&amp;"):gsub(">", "&gt;"):gsub("<", "&lt;")
         local s_level = s.toclevel
         if s_level > depth then
             depth = s_level -- max depth required in toc.ncx
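Why the anchor escaping above is needed, sketched standalone: the NCX href must match the id emitted in Wikipedia's HTML byte-for-byte, and since that HTML escapes &, ", <, > as entities, the TOC link must do the same or the entry is invalid and silently dropped. Note that '&' is escaped first so the entities added by the later passes are not themselves re-escaped:

-- Escape '&' before the others so new '&'s aren't escaped twice.
local function escapeAnchor(anchor)
    return (anchor:gsub("&", "&amp;"):gsub('"', "&quot;"):gsub(">", "&gt;"):gsub("<", "&lt;"))
end
print(escapeAnchor("R&D_<history>")) --> R&amp;D_&lt;history&gt;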
@@ -1073,31 +1091,21 @@ time, abbr, sup {
 
     -- Fix internal wikipedia links with full server url (including lang) so
     -- ReaderLink can notice them and deal with them with a LookupWikipedia event.
     -- html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]])
-    --
-    -- Also, crengine deals strangely with percent encoded utf8 :
-    -- if the link in the html is : <a href="http://fr.wikipedia.org/wiki/Fran%C3%A7oix">
-    -- we get from credocument:getLinkFromPosition() : http://fr.wikipedia.org/wiki/FranÃ§oix
-    -- These are bytes "\xc3\x83\xc2\xa7", that is U+C3 and U+A7 encoded as UTF8,
-    -- when we should have get "\xc3\xa7" ...
-    -- We can avoid that by putting in the url plain unencoded UTF8
-    local hex_to_char = function(x) return string.char(tonumber(x, 16)) end
-    -- We need to remove any "?somekey=somevalue" from url (a real "?" part of the
-    -- wiki_page word would be encoded as %3F, but ReaderLink would get it decoded and
-    -- would not be able to distinguish them).
-    -- Do that first (need to be done first) for full links to other language wikipedias
-    local fixEncodedOtherLangWikiPageTitle = function(wiki_lang, wiki_page)
+    -- First, remove any "?otherkey=othervalue" from url (a real "?" part of the wiki_page word
+    -- would be encoded as %3f), that could cause problem when used.
+    local cleanOtherLangWikiPageTitle = function(wiki_lang, wiki_page)
         wiki_page = wiki_page:gsub("%?.*", "")
-        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
         return string.format([[href="https://%s.wikipedia.org/wiki/%s"]], wiki_lang, wiki_page)
     end
-    html = html:gsub([[href="https?://([^%.]+).wikipedia.org/wiki/([^"]*)"]], fixEncodedOtherLangWikiPageTitle)
+    html = html:gsub([[href="https?://([^%.]+).wikipedia.org/wiki/([^"]*)"]], cleanOtherLangWikiPageTitle)
     -- Now, do it for same wikipedia short urls
-    local fixEncodedWikiPageTitle = function(wiki_page)
+    local cleanWikiPageTitle = function(wiki_page)
         wiki_page = wiki_page:gsub("%?.*", "")
-        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
         return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
     end
-    html = html:gsub([[href="/wiki/([^"]*)"]], fixEncodedWikiPageTitle)
+    html = html:gsub([[href="/wiki/([^"]*)"]], cleanWikiPageTitle)
 
     -- Remove href from links to non existant wiki page so they are not clickable :
     -- <a href="/w/index.php?title=PageTitle&action=edit&redlink=1" class="new" title="PageTitle">PageTitle____on</a>
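For reference, what the removed workaround did: it decoded every %XX escape in hrefs to its raw byte, so URLs carried plain unencoded UTF-8 and old crengine never saw percent encodes to mishandle. Per the commit message, newer crengine handles each percent encode as a UTF8 byte, so the tweak is obsolete. A standalone sketch of that decoding (example URL is illustrative):

-- Decode "%C3%A7" -> "\xc3\xa7", the UTF-8 bytes of 'ç'.
local hex_to_char = function(x) return string.char(tonumber(x, 16)) end
local url = "Fran%C3%A7ois"
print((url:gsub("%%(%x%x)", hex_to_char))) --> François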
frontend/util.lua
@@ -529,15 +529,16 @@ function util.unicodeCodepointToUtf8(c)
     end
 end
 
+-- we need to use an array of arrays to keep them ordered as written
 local HTML_ENTITIES_TO_UTF8 = {
-    ["&lt;"] = "<",
-    ["&gt;"] = ">",
-    ["&quot;"] = '"',
-    ["&apos;"] = "'",
-    ["&nbsp;"] = "\xC2\xA0",
-    ["&#(%d+);"] = function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end,
-    ["&#x(%x+);"] = function(x) return util.unicodeCodepointToUtf8(tonumber(x,16)) end,
-    ["&amp;"] = "&", -- must be last
+    {"&lt;", "<"},
+    {"&gt;", ">"},
+    {"&quot;", '"'},
+    {"&apos;", "'"},
+    {"&nbsp;", "\xC2\xA0"},
+    {"&#(%d+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end},
+    {"&#x(%x+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x,16)) end},
+    {"&amp;", "&"}, -- must be last
 }
 --- Replace HTML entities with their UTF8 equivalent in text
 --
@@ -546,8 +547,8 @@ local HTML_ENTITIES_TO_UTF8 = {
 --- @int string text with HTML entities
 --- @treturn string UTF8 text
 function util.htmlEntitiesToUtf8(text)
-    for k,v in pairs(HTML_ENTITIES_TO_UTF8) do
-        text = text:gsub(k, v)
+    for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do
+        text = text:gsub(t[1], t[2])
     end
     return text
 end
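The switch from a hash table with pairs() to an array with ipairs() is what makes the "must be last" comment enforceable: pairs() iterates in unspecified order, so "&amp;" -> "&" could run before "&lt;" -> "<" and decode some entities twice. A standalone sketch of the difference order makes:

-- With a guaranteed order, the literal text "&amp;lt;" decodes once,
-- to "&lt;"; if "&amp;" were replaced first it would end up as "<".
local ordered = { {"&lt;", "<"}, {"&amp;", "&"} } -- "&amp;" last
local text = "&amp;lt; encodes the entity &lt;"
for _, t in ipairs(ordered) do
    text = text:gsub(t[1], t[2])
end
print(text) --> &lt; encodes the entity <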