mirror of https://github.com/koreader/koreader

Wikipedia Save as EPUB: various encoding fixes (#3851)

* Wiki Save as EPUB: various encoding fixes

Fix display of & in article titles
Fix display of &, <, > in TOC entries and in targeted anchors (the mismatch
with the target id made these TOC entries invalid and simply not displayed).
Remove the percent-encoded URL tweaks for crengine now that crengine
supports them correctly (each percent-encoded sequence is handled as a UTF-8 byte).
Bump crengine for that.
Don't include <meta name="cover"> when no cover is present.

* bump base/crengine
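
As an illustration of the '&' handling described above, here is a minimal
standalone Lua sketch with a reduced entity table (the real code uses the
full util.htmlEntitiesToUtf8(), shown in the last file of this diff):

-- Decode known entities to UTF-8 first, then re-encode the remaining
-- plain '&' as '&#38;': an existing '&nbsp;' is decoded before the '&'
-- pass, so it cannot be mangled into '&#38;nbsp;'.
local function htmlEntitiesToUtf8(text)
    local entities = {
        {"&nbsp;", "\xC2\xA0"},
        {"&amp;", "&"}, -- must be last
    }
    for _, t in ipairs(entities) do
        text = text:gsub(t[1], t[2])
    end
    return text
end

local title = "Dungeons &amp; Dragons&nbsp;II" -- made-up example title
print((htmlEntitiesToUtf8(title):gsub("&", "&#38;")))
-- Dungeons &#38; Dragons II   (with a non-breaking space before "II")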
Author: poire-z
Date: 2018-04-10 18:30:27 +02:00 (committed by Frans de Jonge)
parent 3585067796
commit 305e75c5ea
3 changed files with 43 additions and 34 deletions

base

@@ -1 +1 @@
-Subproject commit 3f0084f1d7457941303a29c576268944c47df071
+Subproject commit 44d4474779a9fcd66e5e265203b82bc35830f56f


@@ -668,6 +668,12 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     local html = phtml.text["*"] -- html content
     local page_cleaned = page:gsub("_", " ") -- page title
     local page_htmltitle = phtml.displaytitle -- page title with possible <sup> tags
+    -- We need to encode plain '&' in those so we can put them in XML/HTML.
+    -- We wouldn't need to escape as many as util.htmlEntitiesToUtf8() does, but
+    -- we need to not mess up existing ones ('&nbsp;' may happen) with our '&'
+    -- encodes. (We don't escape < or > as these JSON strings may contain HTML tags)
+    page_cleaned = util.htmlEntitiesToUtf8(page_cleaned):gsub("&", "&#38;")
+    page_htmltitle = util.htmlEntitiesToUtf8(page_htmltitle):gsub("&", "&#38;")
     local sections = phtml.sections -- Wikipedia provided TOC
     local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
     -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
@@ -678,7 +684,7 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     local images = {}
     local seen_images = {}
     local imagenum = 1
-    local cover_imgid = "" -- best candidate for cover among our images
+    local cover_imgid = nil -- best candidate for cover among our images
     local processImg = function(img_tag)
         local src = img_tag:match([[src="([^"]*)"]])
         if src == nil or src == "" then
@@ -753,7 +759,7 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     table.insert(images, cur_image)
     seen_images[src] = cur_image
     -- Use first image of reasonable size (not an icon) and portrait-like as cover-image
-    if cover_imgid == "" and width and width > 50 and height and height > 50 and height > width then
+    if not cover_imgid and width and width > 50 and height and height > 50 and height > width then
         cover_imgid = imgid
     end
     imagenum = imagenum + 1
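
The cover-selection heuristic above, extracted into a standalone sketch
(the table of images and its field names are made up for illustration; in
the real code, width and height are parsed from each img tag):

-- Return the id of the first image bigger than 50x50 and portrait-like
-- (taller than wide); return nil when nothing qualifies, which callers
-- can test with a plain `if cover_imgid then`.
local function pickCoverId(images)
    for _, img in ipairs(images) do
        if img.width and img.width > 50
                and img.height and img.height > 50
                and img.height > img.width then
            return img.id
        end
    end
    return nil
end

print(pickCoverId({
    {id = "img1", width = 16, height = 16},   -- icon: too small
    {id = "img2", width = 400, height = 300}, -- landscape: skipped
    {id = "img3", width = 300, height = 450}, -- first portrait: chosen
})) -- img3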
@@ -847,6 +853,10 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     end
     local content_opf_parts = {}
     -- head
+    local meta_cover = "<!-- no cover image -->"
+    if include_images and cover_imgid then
+        meta_cover = string.format([[<meta name="cover" content="%s"/>]], cover_imgid)
+    end
     table.insert(content_opf_parts, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
@@ -858,13 +868,13 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     <dc:identifier id="bookid">%s</dc:identifier>
     <dc:language>%s</dc:language>
     <dc:publisher>%s</dc:publisher>
-    <meta name="cover" content="%s"/>
+    %s
   </metadata>
   <manifest>
     <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
     <item id="content" href="content.html" media-type="application/xhtml+xml"/>
     <item id="css" href="stylesheet.css" media-type="text/css"/>
-]], page_cleaned, lang:upper(), bookid, lang, koreader_version, cover_imgid))
+]], page_cleaned, lang:upper(), bookid, lang, koreader_version, meta_cover))
     -- images files
     if include_images then
         for inum, img in ipairs(images) do
@@ -970,9 +980,17 @@ time, abbr, sup {
     -- Wikipedia sections items seem to be already sorted by index, so no need to sort
     for isec, s in ipairs(sections) do
         num = num + 1
-        local s_anchor = s.anchor
+        -- Some chars in headings are converted to html entities in the
+        -- wikipedia-generated HTML. We need to do the same in TOC links
+        -- for the links to be valid.
+        local s_anchor = s.anchor:gsub("&", "&amp;"):gsub('"', "&quot;"):gsub(">", "&gt;"):gsub("<", "&lt;")
         local s_title = string.format("%s %s", s.number, s.line)
-        s_title = (s_title:gsub("(%b<>)", "")) -- titles may include <i> and other html tags
+        -- Titles may include <i> and other html tags: let's remove them as
+        -- our TOC can only display text
+        s_title = (s_title:gsub("(%b<>)", ""))
+        -- We need to do as for page_htmltitle above. But headings can contain
+        -- html entities for < and > that we need to put back as html entities
+        s_title = util.htmlEntitiesToUtf8(s_title):gsub("&", "&#38;"):gsub(">", "&gt;"):gsub("<", "&lt;")
         local s_level = s.toclevel
         if s_level > depth then
             depth = s_level -- max depth required in toc.ncx
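
For instance, escaping a made-up anchor that contains XML special chars
(note that '&' is escaped first, so the entities just produced are not
escaped again):

-- Wikipedia escapes &, ", > and < as entities in the id attributes of
-- its generated HTML; the TOC href must match that id byte for byte.
local anchor = 'Ballads_&_"airs"'
local s_anchor = anchor:gsub("&", "&amp;"):gsub('"', "&quot;"):gsub(">", "&gt;"):gsub("<", "&lt;")
print(s_anchor) -- Ballads_&amp;_&quot;airs&quot;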
@@ -1073,31 +1091,21 @@ time, abbr, sup {
     -- Fix internal wikipedia links with full server url (including lang) so
     -- ReaderLink can notice them and deal with them with a LookupWikipedia event.
-    -- html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]])
-    --
-    -- Also, crengine deals strangely with percent encoded utf8 :
-    -- if the link in the html is : <a href="http://fr.wikipedia.org/wiki/Fran%C3%A7oix">
-    -- we get from credocument:getLinkFromPosition() : http://fr.wikipedia.org/wiki/FranÃ§oix
-    -- These are bytes "\xc3\x83\xc2\xa7", that is U+C3 and U+A7 encoded as UTF8,
-    -- when we should have get "\xc3\xa7" ...
-    -- We can avoid that by putting in the url plain unencoded UTF8
-    local hex_to_char = function(x) return string.char(tonumber(x, 16)) end
+    -- We need to remove any "?somekey=somevalue" from url (a real "?" part of the
+    -- wiki_page word would be encoded as %3F, but ReaderLink would get it decoded and
+    -- would not be able to distinguish them).
     -- Do that first (need to be done first) for full links to other language wikipedias
-    local fixEncodedOtherLangWikiPageTitle = function(wiki_lang, wiki_page)
-        -- First, remove any "?otherkey=othervalue" from url (a real "?" part of the wiki_page word
-        -- would be encoded as %3f), that could cause problem when used.
+    local cleanOtherLangWikiPageTitle = function(wiki_lang, wiki_page)
         wiki_page = wiki_page:gsub("%?.*", "")
-        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
         return string.format([[href="https://%s.wikipedia.org/wiki/%s"]], wiki_lang, wiki_page)
     end
-    html = html:gsub([[href="https?://([^%.]+).wikipedia.org/wiki/([^"]*)"]], fixEncodedOtherLangWikiPageTitle)
+    html = html:gsub([[href="https?://([^%.]+).wikipedia.org/wiki/([^"]*)"]], cleanOtherLangWikiPageTitle)
     -- Now, do it for same wikipedia short urls
-    local fixEncodedWikiPageTitle = function(wiki_page)
+    local cleanWikiPageTitle = function(wiki_page)
         wiki_page = wiki_page:gsub("%?.*", "")
-        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
         return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
     end
-    html = html:gsub([[href="/wiki/([^"]*)"]], fixEncodedWikiPageTitle)
+    html = html:gsub([[href="/wiki/([^"]*)"]], cleanWikiPageTitle)
     -- Remove href from links to non existant wiki page so they are not clickable :
     -- <a href="/w/index.php?title=PageTitle&amp;action=edit&amp;redlink=1" class="new" title="PageTitle">PageTitle____on</a>
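
A quick standalone check of the cleanup above, with a made-up link: the
query part is dropped, while the percent-encoding is now kept for crengine
to decode (wiki_base_url is assumed for the example):

local wiki_base_url = "https://fr.wikipedia.org"
local cleanWikiPageTitle = function(wiki_page)
    wiki_page = wiki_page:gsub("%?.*", "") -- drop any "?key=value" part
    return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
end
local html = [[<a href="/wiki/Fran%C3%A7ois?action=raw">François</a>]]
print((html:gsub([[href="/wiki/([^"]*)"]], cleanWikiPageTitle)))
-- <a href="https://fr.wikipedia.org/wiki/Fran%C3%A7ois">François</a>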


@@ -529,15 +529,16 @@ function util.unicodeCodepointToUtf8(c)
     end
 end

+-- we need to use an array of arrays to keep them ordered as written
 local HTML_ENTITIES_TO_UTF8 = {
-    ["&lt;"] = "<",
-    ["&gt;"] = ">",
-    ["&quot;"] = '"',
-    ["&apos;"] = "'",
-    ["&nbsp;"] = "\xC2\xA0",
-    ["&#(%d+);"] = function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end,
-    ["&#x(%x+);"] = function(x) return util.unicodeCodepointToUtf8(tonumber(x,16)) end,
-    ["&amp;"] = "&", -- must be last
+    {"&lt;", "<"},
+    {"&gt;", ">"},
+    {"&quot;", '"'},
+    {"&apos;", "'"},
+    {"&nbsp;", "\xC2\xA0"},
+    {"&#(%d+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end},
+    {"&#x(%x+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x,16)) end},
+    {"&amp;", "&"}, -- must be last
 }
 --- Replace HTML entities with their UTF8 equivalent in text
 --
@@ -546,8 +547,8 @@ local HTML_ENTITIES_TO_UTF8 = {
 --- @int string text with HTML entities
 --- @treturn string UTF8 text
 function util.htmlEntitiesToUtf8(text)
-    for k,v in pairs(HTML_ENTITIES_TO_UTF8) do
-        text = text:gsub(k, v)
+    for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do
+        text = text:gsub(t[1], t[2])
     end
     return text
 end
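
The ordering matters because pairs() iterates a hash table in unspecified
order: if "&amp;" happened to be processed before "&lt;", a literal
"&amp;lt;" in the text would be decoded twice. A standalone sketch of the
ordered version (table reduced for illustration):

local HTML_ENTITIES_TO_UTF8 = {
    {"&lt;", "<"},
    {"&gt;", ">"},
    {"&amp;", "&"}, -- must be last
}
local function htmlEntitiesToUtf8(text)
    for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do
        text = text:gsub(t[1], t[2])
    end
    return text
end

print(htmlEntitiesToUtf8("3 &lt; 4 &amp;&amp; x &amp;lt; y"))
-- 3 < 4 && x &lt; y   ("&amp;lt;" decodes one level, not down to "<")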