local JSON = require("json")
local lfs = require("libs/libkoreader-lfs") -- used below for file attribute checks (assumed require path)
local logger = require("logger")
local util = require("ffi/util")
local _ = require("gettext")
local T = require("ffi/util").template
--[[
-- Query Wikipedia using the Wikimedia web API.
-- https://en.wikipedia.org/w/api.php?format=jsonfm&action=query&generator=search&gsrnamespace=0&gsrsearch=ereader&gsrlimit=10&prop=extracts&exintro&explaintext&exlimit=max
-- https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=jsonfm&explaintext=&redirects=&titles=E-reader
--
-- To get parsed HTML:
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection
-- https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
--]]
local Wikipedia = {
    wiki_server = "https://%s.wikipedia.org",
    wiki_path = "/w/api.php",
    wiki_params = {
        action = "query",
        prop = "extracts",
        format = "json",
        -- exintro = nil, -- get more than only the intro
        explaintext = "",
        redirects = "",
        -- title = nil, -- text to look up, will be added below
    },
    default_lang = "en",
    -- Search query for better results
    -- see https://www.mediawiki.org/wiki/API:Main_page
    wiki_search_params = {
        action = "query",
        generator = "search",
        gsrnamespace = "0",
        -- gsrsearch = nil, -- text to look up, will be added below
        gsrlimit = 20, -- maximum number of results to get
        exlimit = "max",
        prop = "extracts|info", -- 'extracts' to get text, 'info' to get full page length
        format = "json",
        explaintext = "",
        exintro = "",
        -- We have to use 'exintro=' to get extracts for ALL results
        -- (otherwise, we get the full text for only the first result,
        -- and no text at all for the others).
    },
    wiki_phtml_params = {
        action = "parse",
        format = "json",
        -- we only need the following information
        prop = "text|sections|displaytitle|revid",
        -- page = nil, -- text to look up, will be added below
        -- disabletoc = "", -- if we want to remove the TOC in the HTML
        disablelimitreport = "",
        disableeditsection = "",
    },
    -- allow for disabling prettifying full page text
    wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"),
}
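--[[
-- Illustrative sketch (commented out, not used by the module): loadPage()
-- below serializes one of the parameter tables above into a URL query string,
-- with the page/search text appended last:
--
--   local query = ""
--   for k, v in pairs(Wikipedia.wiki_params) do
--       query = string.format("%s%s=%s&", query, k, v)
--   end
--   -- query is now e.g. "action=query&prop=extracts&format=json&explaintext=&redirects=&"
--   -- (key order unspecified, as pairs() is used), to which "titles=<escaped text>" is appended.
--]]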
function Wikipedia:getWikiServer(lang)
    return string.format(self.wiki_server, lang or self.default_lang)
end
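-- e.g. Wikipedia:getWikiServer("fr") --> "https://fr.wikipedia.org"
--      Wikipedia:getWikiServer()     --> "https://en.wikipedia.org" (default_lang)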
-- Possible values for page_type parameter to loadPage()
local WIKIPEDIA_INTRO = 1
local WIKIPEDIA_FULL = 2
local WIKIPEDIA_PHTML = 3
--[[
-- return decoded JSON table from Wikipedia
--]]
function Wikipedia:loadPage(text, lang, page_type, plain)
    local socket = require('socket')
    local url = require('socket.url')
    local http = require('socket.http')
    local https = require('ssl.https')
    local ltn12 = require('ltn12')
    local request, sink = {}, {}
    local query = ""
    local parsed = url.parse(self:getWikiServer(lang))
    parsed.path = self.wiki_path
    if page_type == WIKIPEDIA_INTRO then -- search query
        self.wiki_search_params.explaintext = plain and "" or nil
        for k, v in pairs(self.wiki_search_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "gsrsearch=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_FULL then -- full page content
        self.wiki_params.explaintext = plain and "" or nil
        for k, v in pairs(self.wiki_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "titles=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_PHTML then -- parsed html page content
        for k, v in pairs(self.wiki_phtml_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "page=" .. url.escape(text)
    else
        return
    end
    -- HTTP request
    request['url'] = url.build(parsed)
    request['method'] = 'GET'
    request['sink'] = ltn12.sink.table(sink)
    http.TIMEOUT, https.TIMEOUT = 10, 10
    local httpRequest = parsed.scheme == 'http' and http.request or https.request
    -- first argument returned by skip is the code
    local _, headers, status = socket.skip(1, httpRequest(request))
    -- raise an error message when the network is unavailable
    if headers == nil then
        error("Network is unreachable")
    end
    if status ~= "HTTP/1.1 200 OK" then
        logger.warn("HTTP status not okay:", status)
        return
    end
    local content = table.concat(sink)
    if content ~= "" and string.sub(content, 1, 1) == "{" then
        local ok, result = pcall(JSON.decode, content)
        if ok and result then
            logger.dbg("wiki result", result)
            return result
        else
            logger.warn("wiki error:", result)
        end
    else
        logger.warn("not JSON from wiki response:", content)
    end
end
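--[[
-- Illustrative raw call (a minimal sketch; callers normally use the
-- wikintro()/wikifull()/wikiphtml() wrappers below):
--
--   local result = Wikipedia:loadPage("E-reader", "en", WIKIPEDIA_FULL, true)
--   if result and result.query then
--       for pageid, page in pairs(result.query.pages) do
--           print(page.title, page.extract)
--       end
--   end
--]]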
-- search wikipedia and get intros for results
function Wikipedia:wikintro(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_INTRO, true)
    if result then
        local query = result.query
        if query then
            return query.pages
        end
    end
end
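--[[
-- Illustrative use (assuming a working network connection):
--
--   local pages = Wikipedia:wikintro("ereader", "en")
--   if pages then
--       for pageid, page in pairs(pages) do
--           print(page.title, page.extract) -- plain text intro of each search result
--       end
--   end
--]]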
-- get full content of a wiki page
function Wikipedia:wikifull(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_FULL, true)
    if result then
        local query = result.query
        if query then
            if self.wiki_prettify then
                -- Prettification of the plain text full page
                for pageid, page in pairs(query.pages) do
                    if page.extract then
                        page.extract = self:prettifyText(page.extract)
                    end
                end
            end
            return query.pages
        end
    end
end
-- get parsed html content and other info of a wiki page
function Wikipedia:wikiphtml(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_PHTML, true)
    if result and result.parse then
        return result.parse
    end
    -- guard against result being nil (e.g. no network) before indexing it
    if result and result.error and result.error.info then
        error(result.error.info)
    end
end
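--[[
-- Illustrative shape of the wikiphtml() result (these are the fields actually
-- used by createEpub() below; see the sample API URL in the header comment):
--
--   local phtml = Wikipedia:wikiphtml("E-book", "en")
--   -- phtml.text["*"]    : the page HTML
--   -- phtml.displaytitle : page title, possibly with <sup> and other tags
--   -- phtml.sections     : the Wikipedia-provided table of contents
--   -- phtml.pageid / phtml.revid : used below to build a unique book id
--]]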
-- UTF8 of unicode geometrical shapes we can use to replace
-- the "=== title ===" of Wikipedia plaintext pages.
-- The chosen ones are available in most fonts (prettier symbols
-- exist in unicode, but are available in only a few fonts) and
-- have a quite consistent size/weight in all fonts.
local th1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?)
local th2_sym = "\xE2\x96\x89" -- big black square
local th3_sym = "\xC2\xA0\xE2\x97\x86" -- black diamond (indented, nicer)
local th4_sym = "\xE2\x97\xA4" -- black upper left triangle
local th5_sym = "\xE2\x9C\xBF" -- black florette
local th6_sym = "\xE2\x9D\x96" -- black diamond minus white x
-- Others available in most fonts
-- local thX_sym = "\xE2\x9C\x9A" -- heavy greek cross
-- local thX_sym = "\xE2\x97\xA2" -- black lower right triangle
-- local thX_sym = "\xE2\x97\x89" -- fish eye
-- local thX_sym = "\xE2\x96\x97" -- quadrant lower right
-- For optional prettification of the plain text full page
function Wikipedia:prettifyText(text)
    -- We use \a for an additional leading \n that we don't want shortened later
    text = text:gsub("\n= ", "\n\a"..th1_sym.." ") -- 2 empty lines before
    text = text:gsub("\n== ", "\n\a"..th2_sym.." ") -- 2 empty lines before
    text = text:gsub("\n=== ", "\n"..th3_sym.." ")
    text = text:gsub("\n==== ", "\n"..th4_sym.." ")
    text = text:gsub("\n===== ", "\n"..th5_sym.." ")
    text = text:gsub("\n====== ", "\n"..th6_sym.." ")
    text = text:gsub("Modifier ==", " ==") -- fr.wikipedia fix for some articles modified by clumsy editors
    text = text:gsub("==$", "==\n") -- for a </hN> at end of text to be matched by next gsub
    text = text:gsub(" ===?\n+", "\n\n") -- </h2> to </h3> : empty line after
    text = text:gsub(" ====+\n+", "\n") -- </h4> to </hN> : single \n, no empty line
    text = text:gsub("\n\n+\xE2\x80\x94", "\n\xE2\x80\x94") -- em dash, used for quote author, make it stick to prev text
    text = text:gsub("\n +\n", "\n") -- trim lines full of only spaces (often seen in math formulas)
    text = text:gsub("^\n*", "") -- trim new lines at start
    text = text:gsub("\n*$", "") -- trim new lines at end
    text = text:gsub("\n\n+", "\n\n") -- shorten multiple new lines
    text = text:gsub("\a", "\n") -- re-add our wished \n
    return text
end
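--[[
-- Illustrative input/output (assuming the plain text extract format):
--
--   Wikipedia:prettifyText("\n== History ==\nSome text.")
--   -- returns "\n"..th2_sym.." History\n\nSome text." : the "== ==" markers
--   -- are replaced by the level symbol, with an empty line after the header.
--]]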
local function getUrlContent(url, timeout)
    local socket = require('socket')
    local ltn12 = require('ltn12')
    local requester
    if url:sub(1, 7) == "http://" then
        requester = require('socket.http')
    elseif url:sub(1, 8) == "https://" then
        requester = require('ssl.https')
    else
        return false, "Unsupported protocol"
    end
    requester.TIMEOUT = timeout or 10
    local request = {}
    local sink = {}
    request['url'] = url
    request['method'] = 'GET'
    request['sink'] = ltn12.sink.table(sink)
    -- first argument returned by skip is the code
    local _, headers, status = socket.skip(1, requester.request(request))
    if headers == nil then
        logger.warn("No HTTP headers")
        return false, "Network unavailable"
    end
    if status ~= "HTTP/1.1 200 OK" then
        logger.warn("HTTP status not okay:", status)
        return false, "Network unavailable"
    end
    return true, table.concat(sink)
end
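-- e.g. local success, content = getUrlContent("https://upload.wikimedia.org/some/image.png")
--      success is true and content holds the body on success,
--      else success is false and content is an error string.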
-- UTF8 of unicode geometrical shapes we'll prepend to Wikipedia section headers,
-- to help identify hierarchy (otherwise, only the small font size differences help).
-- Best if identical to the ones used above for prettifying the full plain text page.
-- The chosen ones are available in most fonts (prettier symbols
-- exist in unicode, but are available in only a few fonts) and
-- have a quite consistent size/weight in all fonts.
local h1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?)
local h2_sym = "\xE2\x96\x89" -- big black square
local h3_sym = "\xE2\x97\x86" -- black diamond
local h4_sym = "\xE2\x97\xA4" -- black upper left triangle
local h5_sym = "\xE2\x9C\xBF" -- black florette
local h6_sym = "\xE2\x9D\x96" -- black diamond minus white x
-- Others available in most fonts
-- local hXsym = "\xE2\x9C\x9A" -- heavy greek cross
-- local hXsym = "\xE2\x97\xA2" -- black lower right triangle
-- local hXsym = "\xE2\x97\x89" -- fish eye
-- local hXsym = "\xE2\x96\x97" -- quadrant lower right
local ext_to_mimetype = {
    png = "image/png",
    jpg = "image/jpeg",
    jpeg = "image/jpeg",
    gif = "image/gif",
    svg = "image/svg+xml",
    html = "application/xhtml+xml",
    xhtml = "application/xhtml+xml",
    ncx = "application/x-dtbncx+xml",
    js = "text/javascript",
    css = "text/css",
    otf = "application/opentype",
    ttf = "application/truetype",
    woff = "application/font-woff",
}
-- Create an epub file (possibly with images)
function Wikipedia:createEpub(epub_path, page, lang, with_images)
    -- Use Trapper to display progress and ask questions through the UI.
    -- We need to have been Trapper.wrap()'ed for the UI to be used, otherwise
    -- Trapper:info() and Trapper:confirm() will just use logger.
    local UI = require("ui/trapper")
    UI:info(_("Fetching Wikipedia page…"))
    local ok, phtml = pcall(self.wikiphtml, self, page, lang)
    if not ok then
        UI:info(phtml) -- display error in InfoMessage
        -- Sleep a bit to make that error seen
        util.sleep(2)
        UI:reset()
        return false
    end
    -- We may need to build absolute urls for non-absolute link and image urls
    local wiki_base_url = self:getWikiServer(lang)
    -- Get info from the Wikipedia result
    -- (see example at https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection)
    local cancelled = false
    local html = phtml.text["*"] -- html content
    local page_cleaned = page:gsub("_", " ") -- page title
    local page_htmltitle = phtml.displaytitle -- page title with possible <sup> tags
    local sections = phtml.sections -- Wikipedia provided TOC
    local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
    -- Not sure if this bookid will ever be used by indexing software/calibre, but if it is:
    -- should it change when content is updated (as now, by including the Wikipedia revisionId),
    -- or should it stay the same even if revid changes (same book, updated content)?
    -- We need to find images in the HTML, to tell the user how many there are
    -- when asking whether they should be included.
    local images = {}
    local seen_images = {}
    local imagenum = 1
    local cover_imgid = "" -- best candidate for cover among our images
    local processImg = function(img_tag)
        local src = img_tag:match([[src="([^"]*)"]])
        if src == nil or src == "" then
            logger.info("no src found in ", img_tag)
            return nil
        end
        if src:sub(1, 2) == "//" then
            src = "https:" .. src -- Wikipedia redirects from http to https, so use https
        elseif src:sub(1, 1) == "/" then -- non absolute url
            src = wiki_base_url .. src
        end
        local cur_image
        if seen_images[src] then -- already seen
            cur_image = seen_images[src]
        else
            local ext = src:match(".*%.(%S+)")
            if ext == nil or ext == "" then -- we won't know what mimetype to use, ignore it
                logger.info("no file extension found in ", src)
                return nil
            end
            ext = ext:lower()
            local imgid = string.format("img%05d", imagenum)
            local imgpath = string.format("images/%s.%s", imgid, ext)
            local mimetype = ext_to_mimetype[ext] or ""
            local width = tonumber(img_tag:match([[width="([^"]*)"]]))
            local height = tonumber(img_tag:match([[height="([^"]*)"]]))
            -- Get the higher resolution (2x) image url
            local src2x = nil
            local srcset = img_tag:match([[srcset="([^"]*)"]])
            if srcset then
                srcset = " "..srcset..", " -- for next pattern to possibly match 1st or last item
                src2x = srcset:match([[ (%S+) 2x, ]])
                if src2x then
                    if src2x:sub(1, 2) == "//" then
                        src2x = "https:" .. src2x
                    elseif src2x:sub(1, 1) == "/" then -- non absolute url
                        src2x = wiki_base_url .. src2x
                    end
                end
            end
            cur_image = {
                imgid = imgid,
                imgpath = imgpath,
                src = src,
                src2x = src2x,
                mimetype = mimetype,
                width = width,
                height = height,
            }
            table.insert(images, cur_image)
            seen_images[src] = cur_image
            -- Use the first image of reasonable size (not an icon) and portrait-like as the cover image
            if cover_imgid == "" and width and width > 50 and height and height > 50 and height > width then
                cover_imgid = imgid
            end
            imagenum = imagenum + 1
        end
        -- crengine will NOT use the width and height attributes, but it will use
        -- those found in a style attribute.
        -- If we get src2x images, crengine will scale them down to the 1x image size
        -- (less space wasted by images while reading), but the 2x quality will be
        -- there when the image is viewed full screen with the ImageViewer widget.
        local style_props = {}
        if cur_image.width then
            table.insert(style_props, string.format("width: %spx", cur_image.width))
        end
        if cur_image.height then
            table.insert(style_props, string.format("height: %spx", cur_image.height))
        end
        local style = table.concat(style_props, "; ")
        return string.format([[<img src="%s" style="%s" alt=""/>]], cur_image.imgpath, style)
    end
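    -- Each <img ...> tag is passed to processImg(); returning a replacement
    -- string rewrites the tag, while returning nil keeps the original tag
    -- unchanged (gsub semantics).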
html = html:gsub("(<%s*img [^>]*>)", processImg)
logger.dbg("Images found in html:", images)
-- See what to do with images
local include_images = false
local use_img_2x = false
if with_images then
-- If no UI (Trapper:wrap() not called), UI:confirm() will answer true
if #images > 0 then
include_images = UI:confirm(T(_("The page contains %1 images.\nWould you like to download and include them in the generated EPUB file?"), #images), _("Don't include"), _("Include"))
if include_images then
use_img_2x = UI:confirm(_("Would you like to use slightly higher quality images? This will result in a bigger file size."), _("Standard quality"), _("Higher quality"))
end
else
UI:info(_("The page does not contain any images."))
util.sleep(1) -- Let the user see that
end
end
if not include_images then
-- Remove img tags to avoid little blank squares of missing images
html = html:gsub("<%s*img [^>]*>", "")
-- We could remove the whole image container <div class="thumb"...> ,
-- but it's a lot of nested <div> and not easy to do.
-- So the user will see the image legends and know a bit about
-- the images he chose to not get.
end
    UI:info(_("Building EPUB…"))
    -- Open the zip file (with .tmp for now, as crengine may still
    -- have a handle to the final epub_path, and we don't want to
    -- delete a good one if we fail/cancel later)
    local epub_path_tmp = epub_path .. ".tmp"
    local ZipWriter = require("ffi/zipwriter")
    local epub = ZipWriter:new{}
    if not epub:open(epub_path_tmp) then
        return false
    end
    -- We now create and add all the required epub files
    -- ----------------------------------------------------------------
    -- /mimetype : always "application/epub+zip"
    epub:add("mimetype", "application/epub+zip")
    -- ----------------------------------------------------------------
    -- /META-INF/container.xml : always the same content
    epub:add("META-INF/container.xml", [[
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>]])
    -- ----------------------------------------------------------------
    -- OEBPS/content.opf : metadata + list of other files (paths relative to the OEBPS/ directory)
    -- Other possible items in this file that are of no interest to crengine:
    -- In <manifest>:
    --   <item id="cover" href="title.html" media-type="application/xhtml+xml"/>
    --   <item id="cover-image" href="images/cover.png" media-type="image/png"/>
    -- (crengine only uses <meta name="cover" content="cover-image"/> to get the cover image)
    -- In <spine toc="ncx">:
    --   <itemref idref="cover" linear="no"/>
    -- And a <guide> section:
    --   <guide>
    --     <reference href="title.html" type="cover" title="Cover"/>
    --     <reference href="toc.html" type="toc" title="Table of Contents"/>
    --   </guide>
    local koreader_version = "KOReader"
    if lfs.attributes("git-rev", "mode") == "file" then
        koreader_version = "KOReader "..io.open("git-rev", "r"):read()
    end
    local content_opf_parts = {}
    -- head
    table.insert(content_opf_parts, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
        xmlns:dc="http://purl.org/dc/elements/1.1/"
        unique-identifier="bookid" version="2.0">
  <metadata>
    <dc:title>%s</dc:title>
    <dc:creator>Wikipedia %s</dc:creator>
    <dc:identifier id="bookid">%s</dc:identifier>
    <dc:language>%s</dc:language>
    <dc:publisher>%s</dc:publisher>
    <meta name="cover" content="%s"/>
  </metadata>
  <manifest>
    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
    <item id="content" href="content.html" media-type="application/xhtml+xml"/>
    <item id="css" href="stylesheet.css" media-type="text/css"/>
]], page_cleaned, lang:upper(), bookid, lang, koreader_version, cover_imgid))
    -- image files
    if include_images then
        for inum, img in ipairs(images) do
            table.insert(content_opf_parts, string.format([[    <item id="%s" href="%s" media-type="%s"/>%s]], img.imgid, img.imgpath, img.mimetype, "\n"))
        end
    end
    -- tail
    table.insert(content_opf_parts, [[
  </manifest>
  <spine toc="ncx">
    <itemref idref="content"/>
  </spine>
</package>
]])
    epub:add("OEBPS/content.opf", table.concat(content_opf_parts))
    -- ----------------------------------------------------------------
    -- OEBPS/stylesheet.css
    -- crengine will use its own data/epub.css, we just add/fix a few styles
    -- to look more like Wikipedia web pages (the user can ignore them
    -- by turning "Embedded Style" off)
    epub:add("OEBPS/stylesheet.css", [[
/* make section headers look left aligned and avoid some page breaks */
h1, h2 {
    text-align: left;
}
h3, h4, h5, h6, h7 {
    page-break-before: avoid;
    page-break-after: avoid;
    text-align: left;
}
/* avoid page breaks around our centered titles on first page */
h1.koreaderwikifrontpage, h5.koreaderwikifrontpage {
    page-break-before: avoid;
    page-break-inside: avoid;
    page-break-after: avoid;
    text-align: center;
    margin-top: 0em;
}
p.koreaderwikifrontpage {
    font-style: italic;
    font-size: 90%;
    margin-left: 2em;
    margin-right: 2em;
    margin-top: 1em;
    margin-bottom: 1em;
}
hr.koreaderwikifrontpage {
    margin-left: 20%;
    margin-right: 20%;
    margin-bottom: 1.2em;
}
/* So many links, make them look like normal text except for the underline */
a {
    display: inline;
    text-decoration: underline;
    color: black;
    font-weight: normal;
}
/* No underline for links whose href we removed */
a.newwikinonexistent {
    text-decoration: none;
}
/* show a box around image thumbnails */
div.thumb {
    width: 80%;
    border: dotted 1px black;
    margin-top: 0.5em;
    margin-bottom: 0.5em;
    margin-left: 2.5em;
    margin-right: 2.5em;
    padding-top: ]].. (include_images and "0.5em" or "0.15em") .. [[;
    padding-bottom: 0.2em;
    padding-left: 0.5em;
    padding-right: 0.5em;
    text-align: center;
    font-size: 90%;
}
/* don't waste left margin for notes and lists of pages */
ul, ol {
    margin-left: 0em;
}
/* helps crengine to not display them as block elements */
time, abbr, sup {
    display: inline;
}
]])
    -- ----------------------------------------------------------------
    -- OEBPS/toc.ncx : table of contents
    local toc_ncx_parts = {}
    local depth = 0
    local cur_level = 0
    local np_end = [[</navPoint>]]
    local num = 1
    -- Add our own first section for the first page, with the page name as title
    table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html"/>]], num, num, page_cleaned))
    table.insert(toc_ncx_parts, np_end)
    -- Wikipedia sections items seem to be already sorted by index, so no need to sort
    for isec, s in ipairs(sections) do
        num = num + 1
        local s_anchor = s.anchor
        local s_title = string.format("%s %s", s.number, s.line)
        s_title = (s_title:gsub("(%b<>)", "")) -- titles may include <i> and other html tags
        local s_level = s.toclevel
        if s_level > depth then
            depth = s_level -- max depth required in toc.ncx
        end
        if s_level == cur_level then
            table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint
        elseif s_level < cur_level then
            table.insert(toc_ncx_parts, np_end) -- close previous navPoint
            while s_level < cur_level do -- close all in-between navPoints
                table.insert(toc_ncx_parts, np_end)
                cur_level = cur_level - 1
            end
        elseif s_level > cur_level + 1 then
            -- a jump from level N to level N+2 or more... should not happen
            -- per the epub spec, but we don't know about wikipedia...
            -- so we create missing intermediate navPoints with the same anchor as the current section
            while s_level > cur_level + 1 do
                table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level))
                table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>-</text></navLabel><content src="content.html#%s"/>]], num, num, s_anchor))
                cur_level = cur_level + 1
                num = num + 1
            end
        -- elseif s_level == cur_level + 1 then
        --     sublevel, nothing to close, nothing to add
        end
        cur_level = s_level
        table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level)) -- indentation, in case a person looks at it
        table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html#%s"/>]], num, num, s_title, s_anchor))
    end
    -- close nested <navPoint>s
    while cur_level > 0 do
        table.insert(toc_ncx_parts, np_end)
        cur_level = cur_level - 1
    end
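    --[[
    -- Illustration (not part of the build): for sections with toclevels 1, 2, 2
    -- the loop above produces nesting like:
    --   <navPoint id="navpoint-2" playOrder="2">...
    --     <navPoint id="navpoint-3" playOrder="3">...</navPoint>
    --     <navPoint id="navpoint-4" playOrder="4">...</navPoint>
    --   </navPoint>
    --]]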
    -- Prepend the NCX head
    table.insert(toc_ncx_parts, 1, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
  <head>
    <meta name="dtb:uid" content="%s"/>
    <meta name="dtb:depth" content="%s"/>
    <meta name="dtb:totalPageCount" content="0"/>
    <meta name="dtb:maxPageNumber" content="0"/>
  </head>
  <docTitle>
    <text>%s</text>
  </docTitle>
  <navMap>
]], bookid, depth, page_cleaned))
    -- Append the NCX tail
    table.insert(toc_ncx_parts, [[
  </navMap>
</ncx>
]])
    epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts))
    -- ----------------------------------------------------------------
    -- OEBPS/content.html
    -- Some small fixes to the Wikipedia HTML to make crengine and the user happier
    -- Most images are in a link to the image info page, which is a useless
    -- external link for us, so let's remove this link.
    html = html:gsub("<a[^>]*>%s*(<%s*img [^>]*>)%s*</a>", "%1")
    -- For some <div class="thumb tright">, which include nested divs, although
    -- perfectly balanced, crengine seems to miss some closing </div> and we
    -- end up having our image bordered box include the remaining main wiki text.
    -- It looks like this code is supposed to deal with class= containing multiple
    -- class names:
    --   https://github.com/koreader/crengine/commit/0930ec7230e720c148fd6f231d69558832b4d53a
    -- and that it may stumble on some cases.
    -- It's all perfectly fine if we make all these divs have a single class name:
    --   html = html:gsub([[<div class="thumb [^"]*">]], [[<div class="thumb">]])
    --
    -- But we may as well make all class= attributes have a single name to avoid
    -- other problems (no real risk with that, as we don't define any style for
    -- Wikipedia class names, except div.thumb, which always appears first).
    html = html:gsub([[(<[^>]* class="[^ "]+)%s+[^"]*"]], [[%1"]])
    -- crengine seems to consider an unknown tag a 'block' element, so we may
    -- want to remove or replace those that should be considered 'inline' elements
    html = html:gsub("</?time[^>]*>", "")
    -- Fix internal wikipedia links with the full server url (including lang) so
    -- ReaderLink can notice them and deal with them with a LookupWikipedia event.
    --   html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]])
    --
    -- Also, crengine deals strangely with percent-encoded utf8:
    -- if the link in the html is <a href="http://fr.wikipedia.org/wiki/Fran%C3%A7oix">,
    -- we get from credocument:getLinkFromPosition(): http://fr.wikipedia.org/wiki/Fran____oix
    -- These are the bytes "\xc3\x83\xc2\xa7", that is, U+C3 and U+A7 encoded as UTF8,
    -- when we should have gotten "\xc3\xa7"...
    -- We can avoid that by putting plain unencoded UTF8 in the url.
    local hex_to_char = function(x) return string.char(tonumber(x, 16)) end
    local fixEncodedWikiPageTitle = function(wiki_page)
        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
        return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
    end
    html = html:gsub([[href="/wiki/([^"]*)"]], fixEncodedWikiPageTitle)
    -- Remove href from links to non-existent wiki pages so they are not clickable:
    -- <a href="/w/index.php?title=PageTitle&amp;action=edit&amp;redlink=1" class="new" title="PageTitle">PageTitle____on</a>
    -- (removing the href="" will make them non-clickable)
    html = html:gsub([[<a[^>]* class="new"[^>]*>]], [[<a class="newwikinonexistent">]])
    -- Fix some other protocol-less links to wikipedia (href="//fr.wikipedia.org/w/index.php..)
    html = html:gsub([[href="//]], [[href="https://]])
    -- crengine does not return a link if there are multiple class names in an <a>
    -- (<a class="external text" href="">). That would be no problem, as we can't
    -- follow them, but when the user taps on one, the tap is propagated to other
    -- widgets and a page change happens...
    --   html = html:gsub([[<a rel="nofollow" class="external text"]], [[<a rel="nofollow" class="externaltext"]])
    --   html = html:gsub([[<a class="external text"]], [[<a class="externaltext"]])
    -- Solved by our multiple class names suppression above.
    -- Avoid a link being clickable before <a> (if it starts a line) or after </a> (if it
    -- ends a line or a block) by wrapping it with U+200B ZERO WIDTH SPACE, which will
    -- make the DOM tree walking code that finds a link stop at it.
    --   html = html:gsub("(<[aA])", "\xE2\x80\x8B%1")
    --   html = html:gsub("(</[aA]>)", "%1\xE2\x80\x8B")
    -- Fixed in crengine lvtinydom.
    if self.wiki_prettify then
        -- Prepend some symbols to section titles for a better visual feeling of hierarchy
        html = html:gsub("<h1>", "<h1> "..h1_sym.." ")
        html = html:gsub("<h2>", "<h2> "..h2_sym.." ")
        html = html:gsub("<h3>", "<h3> "..h3_sym.." ")
        html = html:gsub("<h4>", "<h4> "..h4_sym.." ")
        html = html:gsub("<h5>", "<h5> "..h5_sym.." ")
        html = html:gsub("<h6>", "<h6> "..h6_sym.." ")
    end
    -- Note: in all the gsub patterns above, we used lowercase for tags and attributes
    -- because that's how they are in the Wikipedia HTML, and it keeps the patterns simple.
    -- If one day this changes, they'll have to be replaced with href => [Hh][Rr][Ee][Ff]...
    -- We can finally build the final HTML with a header of our own
    local saved_on = T(_("Saved on %1"), os.date("%b %d, %Y %H:%M:%S"))
    local online_version_htmllink = string.format([[<a href="%s/wiki/%s">%s</a>]], wiki_base_url, page:gsub(" ", "_"), _("online version"))
    local see_online_version = T(_("See %1 for up-to-date content"), online_version_htmllink)
    epub:add("OEBPS/content.html", string.format([[
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>%s</title>
  <link type="text/css" rel="stylesheet" href="stylesheet.css"/>
</head>
<body>
<h1 class="koreaderwikifrontpage">%s</h1>
<h5 class="koreaderwikifrontpage">Wikipedia %s</h5>
<p class="koreaderwikifrontpage">%s<br/>%s</p>
<hr class="koreaderwikifrontpage"/>
%s
</body>
</html>
]], page_cleaned, page_htmltitle, lang:upper(), saved_on, see_online_version, html))
    -- ----------------------------------------------------------------
    -- OEBPS/images/*
    if include_images then
        local nb_images = #images
        for inum, img in ipairs(images) do
            -- The process can be interrupted at this point, between each image
            -- download, by tapping while the InfoMessage is displayed
            local go_on = UI:info(T(_("Fetching image %1 / %2 …"), inum, nb_images))
            if not go_on then
                cancelled = true
                break
            end
            local src = img.src
            if use_img_2x and img.src2x then
                src = img.src2x
            end
            logger.dbg("Getting img ", src)
            local success, content = getUrlContent(src)
            -- success, content = getUrlContent(src..".unexistant") -- to simulate failure
            if success then
                logger.dbg("success, size:", #content)
            else
                logger.info("failed fetching:", src)
            end
            if success then
                epub:add("OEBPS/"..img.imgpath, content)
            else
                go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue"))
                if not go_on then
                    cancelled = true
                    break
                end
            end
        end
    end
    -- Done with adding files
    if cancelled then
        if UI:confirm(_("Download did not complete.\nDo you want to create an EPUB with the already downloaded images?"), _("Don't create"), _("Create")) then
            cancelled = false
        end
    end
    if cancelled then
        UI:info(_("Canceled. Cleaning up…"))
    else
        UI:info(_("Packing EPUB…"))
    end
    epub:close()
    -- This was nearly a no-op, so sleep a bit to make that progress step seen
    util.usleep(300000)
    UI:reset() -- close last InfoMessage
    if cancelled then
        -- Build was cancelled, remove the half-created .epub
        if lfs.attributes(epub_path_tmp, "mode") == "file" then
            os.remove(epub_path_tmp)
        end
        return false
    end
    -- Finally move the .tmp to the final file
    os.rename(epub_path_tmp, epub_path)
    logger.info("successfully created:", epub_path)
    return true
end
-- Wrap Wikipedia:createEpub() with UI progress info, provided
-- by the Trapper module.
function Wikipedia:createEpubWithUI(epub_path, page, lang, result_callback)
    -- To do any UI interaction while building the EPUB, we need
    -- to use a coroutine, so that our code can be suspended while waiting
    -- for user interaction, and resumed by UI widget callbacks.
    -- All this is hidden and done by Trapper with a simple API.
    local Trapper = require("ui/trapper")
    Trapper:wrap(function()
        Trapper:setPausedText("Download paused")
        -- If Wikipedia:createEpub() errored, the coroutine (used by
        -- Trapper) would just abort (no reader crash, no error logged).
        -- So we use pcall to catch any error, log it, and report
        -- the failure via result_callback.
        local ok, success = pcall(self.createEpub, self, epub_path, page, lang, true)
        if ok and success then
            result_callback(true)
        else
            Trapper:reset() -- close any last widget not cleaned up on error
            logger.warn("Wikipedia.createEpub pcall:", ok, success)
            result_callback(false)
        end
    end)
end
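--[[
-- Illustrative caller (hypothetical; KOReader invokes this from its Wikipedia
-- lookup UI, and the path and page below are made up):
--
--   local Wikipedia = require("ui/wikipedia")
--   Wikipedia:createEpubWithUI("/tmp/E-book.epub", "E-book", "en", function(success)
--       if success then logger.info("EPUB created") end
--   end)
--]]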
return Wikipedia