Mirror of https://github.com/koreader/koreader

Wikipedia Save as EPUB: various encoding fixes (#3851)

* Wiki Save as EPUB: various encoding fixes

  Fix display of '&' in article titles.
  Fix display of '&', '<' and '>' in TOC entries and in the targeted anchors
  (the mismatch with the target id made these TOC entries invalid, so they
  were simply not displayed).
  Remove the percent-encoded URL tweaks for crengine, now that crengine
  correctly supports them (each percent-encode is handled as a UTF-8 byte).
  Bump crengine for that.
  Don't include <meta name="cover"> when no cover is present.

* bump base/crengine

parent 3585067796
commit 305e75c5ea

base
@@ -1 +1 @@
-Subproject commit 3f0084f1d7457941303a29c576268944c47df071
+Subproject commit 44d4474779a9fcd66e5e265203b82bc35830f56f

frontend/ui/wikipedia.lua
@@ -668,6 +668,12 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     local html = phtml.text["*"] -- html content
     local page_cleaned = page:gsub("_", " ") -- page title
     local page_htmltitle = phtml.displaytitle -- page title with possible <sup> tags
+    -- We need to encode plain '&' in those so we can put them in XML/HTML
+    -- We wouldn't need to escape as many as util.htmlEntitiesToUtf8() does, but
+    -- we need to, to not mess existing ones ('&nbsp;' may happen) with our '&amp;'
+    -- encodes. (We don't escape < or > as these JSON strings may contain HTML tags)
+    page_cleaned = util.htmlEntitiesToUtf8(page_cleaned):gsub("&", "&amp;")
+    page_htmltitle = util.htmlEntitiesToUtf8(page_htmltitle):gsub("&", "&amp;")
     local sections = phtml.sections -- Wikipedia provided TOC
     local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
     -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
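
A minimal sketch of the decode-then-escape round trip the added lines perform, with an invented title (util.htmlEntitiesToUtf8() is the helper from frontend/util.lua, patched further below):

    local util = require("util")
    local title = "AT&amp;T&nbsp;breakup"    -- as it may arrive in the Wikipedia JSON
    title = util.htmlEntitiesToUtf8(title)   -- "AT&T breakup" (plain UTF-8, NBSP kept)
    title = title:gsub("&", "&amp;")         -- "AT&amp;T breakup", now safe inside XML
    -- Escaping without decoding first would have produced "AT&amp;amp;T",
    -- which a reader would display as the literal text "AT&amp;T".
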
@@ -678,7 +684,7 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     local images = {}
     local seen_images = {}
     local imagenum = 1
-    local cover_imgid = "" -- best candidate for cover among our images
+    local cover_imgid = nil -- best candidate for cover among our images
     local processImg = function(img_tag)
         local src = img_tag:match([[src="([^"]*)"]])
         if src == nil or src == "" then
@@ -753,7 +759,7 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
         table.insert(images, cur_image)
         seen_images[src] = cur_image
         -- Use first image of reasonable size (not an icon) and portrait-like as cover-image
-        if cover_imgid == "" and width and width > 50 and height and height > 50 and height > width then
+        if not cover_imgid and width and width > 50 and height and height > 50 and height > width then
             cover_imgid = imgid
         end
         imagenum = imagenum + 1
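
A note on the sentinel change from "" to nil: in Lua only nil and false are falsy, so the truthiness test "not cover_imgid" used above would never fire with the old empty-string sentinel. A quick illustration:

    local cover_imgid = ""
    print(not cover_imgid)  -- false: the empty string is truthy in Lua
    cover_imgid = nil
    print(not cover_imgid)  -- true: nil is falsy, so "not cover_imgid" means "no candidate yet"
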
@@ -847,6 +853,10 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
     end
     local content_opf_parts = {}
     -- head
+    local meta_cover = "<!-- no cover image -->"
+    if include_images and cover_imgid then
+        meta_cover = string.format([[<meta name="cover" content="%s"/>]], cover_imgid)
+    end
     table.insert(content_opf_parts, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
@@ -858,13 +868,13 @@ function Wikipedia:createEpub(epub_path, page, lang, with_images)
 <dc:identifier id="bookid">%s</dc:identifier>
 <dc:language>%s</dc:language>
 <dc:publisher>%s</dc:publisher>
-<meta name="cover" content="%s"/>
+%s
 </metadata>
 <manifest>
 <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
 <item id="content" href="content.html" media-type="application/xhtml+xml"/>
 <item id="css" href="stylesheet.css" media-type="text/css"/>
-]], page_cleaned, lang:upper(), bookid, lang, koreader_version, cover_imgid))
+]], page_cleaned, lang:upper(), bookid, lang, koreader_version, meta_cover))
     -- images files
     if include_images then
         for inum, img in ipairs(images) do
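
The two hunks above cooperate: the %s placeholder in the OPF template now receives either a real <meta> element or an XML comment, so no <meta name="cover"> ever points at a manifest item that doesn't exist. A standalone sketch (the helper name and the sample id are invented for illustration):

    -- Hypothetical helper mirroring the new meta_cover logic
    local function coverMeta(include_images, cover_imgid)
        if include_images and cover_imgid then
            return string.format([[<meta name="cover" content="%s"/>]], cover_imgid)
        end
        return "<!-- no cover image -->"
    end
    print(coverMeta(true, "cover00001"))  -- <meta name="cover" content="cover00001"/>
    print(coverMeta(true, nil))           -- <!-- no cover image -->
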
@@ -970,9 +980,17 @@ time, abbr, sup {
     -- Wikipedia sections items seem to be already sorted by index, so no need to sort
     for isec, s in ipairs(sections) do
         num = num + 1
-        local s_anchor = s.anchor
+        -- Some chars in headings are converted to html entities in the
+        -- wikipedia-generated HTML. We need to do the same in TOC links
+        -- for the links to be valid.
+        local s_anchor = s.anchor:gsub("&", "&amp;"):gsub('"', "&quot;"):gsub(">", "&gt;"):gsub("<", "&lt;")
         local s_title = string.format("%s %s", s.number, s.line)
-        s_title = (s_title:gsub("(%b<>)", "")) -- titles may include <i> and other html tags
+        -- Titles may include <i> and other html tags: let's remove them as
+        -- our TOC can only display text
+        s_title = (s_title:gsub("(%b<>)", ""))
+        -- We need to do as for page_htmltitle above. But headings can contain
+        -- html entities for < and > that we need to put back as html entities
+        s_title = util.htmlEntitiesToUtf8(s_title):gsub("&", "&amp;"):gsub(">", "&gt;"):gsub("<", "&lt;")
         local s_level = s.toclevel
         if s_level > depth then
             depth = s_level -- max depth required in toc.ncx
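
How the two escapes differ, on an invented section (the s.anchor and s.line values are made up): the anchor must reproduce Wikipedia's entity-encoded id exactly, while the title is first flattened to text and then re-escaped so it stays valid XML in toc.ncx:

    local util = require("util")
    local s = { anchor = "Q_&_A", number = "3", line = "Q <i>&amp;</i> A" }
    local s_anchor = s.anchor:gsub("&", "&amp;"):gsub('"', "&quot;"):gsub(">", "&gt;"):gsub("<", "&lt;")
    -- s_anchor is now "Q_&amp;_A", matching the id Wikipedia wrote in the HTML
    local s_title = string.format("%s %s", s.number, s.line)
    s_title = (s_title:gsub("(%b<>)", ""))  -- "3 Q &amp; A": tags stripped, entity kept
    s_title = util.htmlEntitiesToUtf8(s_title):gsub("&", "&amp;"):gsub(">", "&gt;"):gsub("<", "&lt;")
    -- s_title is now "3 Q &amp; A": decoded to UTF-8, then re-escaped once for XML
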
@@ -1073,31 +1091,21 @@ time, abbr, sup {
 
     -- Fix internal wikipedia links with full server url (including lang) so
     -- ReaderLink can notice them and deal with them with a LookupWikipedia event.
-    -- html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]])
-    --
-    -- Also, crengine deals strangely with percent encoded utf8 :
-    -- if the link in the html is : <a href="http://fr.wikipedia.org/wiki/Fran%C3%A7oix">
-    -- we get from credocument:getLinkFromPosition() : http://fr.wikipedia.org/wiki/FranÃ§oix
-    -- These are bytes "\xc3\x83\xc2\xa7", that is U+C3 and U+A7 encoded as UTF8,
-    -- when we should have get "\xc3\xa7" ...
-    -- We can avoid that by putting in the url plain unencoded UTF8
-    local hex_to_char = function(x) return string.char(tonumber(x, 16)) end
+    -- We need to remove any "?somekey=somevalue" from url (a real "?" part of the
+    -- wiki_page word would be encoded as %3F, but ReaderLink would get it decoded and
+    -- would not be able to distinguish them).
     -- Do that first (need to be done first) for full links to other language wikipedias
-    local fixEncodedOtherLangWikiPageTitle = function(wiki_lang, wiki_page)
-        -- First, remove any "?otherkey=othervalue" from url (a real "?" part of the wiki_page word
-        -- would be encoded as %3f), that could cause problem when used.
+    local cleanOtherLangWikiPageTitle = function(wiki_lang, wiki_page)
         wiki_page = wiki_page:gsub("%?.*", "")
-        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
         return string.format([[href="https://%s.wikipedia.org/wiki/%s"]], wiki_lang, wiki_page)
     end
-    html = html:gsub([[href="https?://([^%.]+).wikipedia.org/wiki/([^"]*)"]], fixEncodedOtherLangWikiPageTitle)
+    html = html:gsub([[href="https?://([^%.]+).wikipedia.org/wiki/([^"]*)"]], cleanOtherLangWikiPageTitle)
     -- Now, do it for same wikipedia short urls
-    local fixEncodedWikiPageTitle = function(wiki_page)
+    local cleanWikiPageTitle = function(wiki_page)
         wiki_page = wiki_page:gsub("%?.*", "")
-        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
         return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
     end
-    html = html:gsub([[href="/wiki/([^"]*)"]], fixEncodedWikiPageTitle)
+    html = html:gsub([[href="/wiki/([^"]*)"]], cleanWikiPageTitle)
 
     -- Remove href from links to non existant wiki page so they are not clickable :
     -- <a href="/w/index.php?title=PageTitle&amp;action=edit&amp;redlink=1" class="new" title="PageTitle">PageTitle____on</a>
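
The renamed helpers now only strip a query string; percent-encodes are left intact for crengine to handle. With an invented link target:

    local wiki_base_url = "https://en.wikipedia.org"  -- assumed value for this sketch
    local cleanWikiPageTitle = function(wiki_page)
        wiki_page = wiki_page:gsub("%?.*", "")
        return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
    end
    print(cleanWikiPageTitle("Fran%C3%A7ois?action=edit"))
    -- href="https://en.wikipedia.org/wiki/Fran%C3%A7ois": "?action=edit" dropped,
    -- "%C3%A7" kept for crengine to decode as the UTF-8 bytes of "ç"
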

frontend/util.lua
@@ -529,15 +529,16 @@ function util.unicodeCodepointToUtf8(c)
     end
 end
 
+-- we need to use an array of arrays to keep them ordered as written
 local HTML_ENTITIES_TO_UTF8 = {
-    ["&lt;"] = "<",
-    ["&gt;"] = ">",
-    ["&quot;"] = '"',
-    ["&apos;"] = "'",
-    ["&nbsp;"] = "\xC2\xA0",
-    ["&#(%d+);"] = function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end,
-    ["&#x(%x+);"] = function(x) return util.unicodeCodepointToUtf8(tonumber(x,16)) end,
-    ["&amp;"] = "&", -- must be last
+    {"&lt;", "<"},
+    {"&gt;", ">"},
+    {"&quot;", '"'},
+    {"&apos;", "'"},
+    {"&nbsp;", "\xC2\xA0"},
+    {"&#(%d+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x)) end},
+    {"&#x(%x+);", function(x) return util.unicodeCodepointToUtf8(tonumber(x,16)) end},
+    {"&amp;", "&"}, -- must be last
 }
 --- Replace HTML entities with their UTF8 equivalent in text
 --
@@ -546,8 +547,8 @@ local HTML_ENTITIES_TO_UTF8 = {
 --- @int string text with HTML entities
 --- @treturn string UTF8 text
 function util.htmlEntitiesToUtf8(text)
-    for k,v in pairs(HTML_ENTITIES_TO_UTF8) do
-        text = text:gsub(k, v)
+    for _, t in ipairs(HTML_ENTITIES_TO_UTF8) do
+        text = text:gsub(t[1], t[2])
     end
     return text
 end
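
Why the table had to become an array: pairs() iterates a hash-keyed table in unspecified order, so the "must be last" intent on "&amp;" was never actually guaranteed, and decoding it too early double-decodes. An invented input showing the hazard:

    local util = require("util")
    local text = "a &amp;lt; b"  -- the literal text "a &lt; b", entity-encoded once
    -- If "&amp;" were replaced first: "a &lt; b", then "&lt;" fires: "a < b" (decoded twice).
    -- With the ordered array, "&lt;" finds no match and "&amp;" runs last:
    print(util.htmlEntitiesToUtf8(text))  -- "a &lt; b", decoded exactly once
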