mirror of
https://github.com/koreader/koreader
synced 2024-10-31 21:20:20 +00:00
de54ef5ae6
This module provides methods for simple interaction with UI, without the need for explicit callbacks, for use by linear jobs between their steps. Uses coroutines, but their usage is hidden by a simple API. Factored out of Wikipedia:createEpubWithUI().
843 lines
35 KiB
Lua
843 lines
35 KiB
Lua
local JSON = require("json")
|
|
local logger = require("logger")
|
|
local util = require("ffi/util")
|
|
local _ = require("gettext")
|
|
local T = require("ffi/util").template
|
|
|
|
--[[
|
|
-- Query wikipedia using Wikimedia Web API.
|
|
-- https://en.wikipedia.org/w/api.php?format=jsonfm&action=query&generator=search&gsrnamespace=0&gsrsearch=ereader&gsrlimit=10&prop=extracts&exintro&explaintext&exlimit=max
|
|
-- https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=jsonfm&explaintext=&redirects=&titles=E-reader
|
|
--
|
|
-- To get parsed HTML :
|
|
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book
|
|
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection
|
|
-- https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
|
|
--]]
|
|
|
|
-- Module table: endpoint description and the fixed request parameters used
-- for each of the three kinds of lookups done by loadPage().
local Wikipedia = {
    -- "%s" is replaced by the language code (see getWikiServer())
    wiki_server = "https://%s.wikipedia.org",
    wiki_path = "/w/api.php",
    default_lang = "en",

    -- Full page as text extract (the looked up title is appended
    -- as "titles=" by loadPage())
    wiki_params = {
        action = "query",
        prop = "extracts",
        format = "json",
        -- exintro = nil, -- not set: we want more than only the intro
        explaintext = "",
        redirects = "",
    },

    -- Search request, giving better results than a plain title lookup
    -- (the searched text is appended as "gsrsearch=" by loadPage()).
    -- See https://www.mediawiki.org/wiki/API:Main_page
    wiki_search_params = {
        action = "query",
        generator = "search",
        gsrnamespace = "0",
        gsrlimit = 20, -- max number of results to get
        exlimit = "max",
        prop = "extracts|info", -- 'extracts' to get text, 'info' to get full page length
        format = "json",
        explaintext = "",
        -- 'exintro=' is required to get extracts for ALL results (without
        -- it, we get the full text for only the first result, and no text
        -- at all for the others)
        exintro = "",
    },

    -- Parsed HTML of a page (the page name is appended as "page=" by loadPage())
    wiki_phtml_params = {
        action = "parse",
        format = "json",
        -- we only need the following pieces of information
        prop = "text|sections|displaytitle|revid",
        -- disabletoc = "", -- if we wanted to remove the toc IN the html
        disablelimitreport = "",
        disableeditsection = "",
    },

    -- Prettifying of the full page text can be disabled via this setting
    wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"),
}
|
|
|
|
--- Build the base URL of the Wikipedia server for a language.
-- @param lang language code (falls back to self.default_lang when nil/false)
-- @return self.wiki_server with its "%s" placeholder filled in
function Wikipedia:getWikiServer(lang)
    local lang_code = lang or self.default_lang
    return self.wiki_server:format(lang_code)
end
|
|
|
|
-- Values accepted by loadPage() for its page_type parameter:
--   WIKIPEDIA_INTRO : search query
--   WIKIPEDIA_FULL  : full page content
--   WIKIPEDIA_PHTML : parsed html page content
local WIKIPEDIA_INTRO, WIKIPEDIA_FULL, WIKIPEDIA_PHTML = 1, 2, 3
|
|
|
|
--- Query the Wikimedia Web API and return the decoded JSON answer.
-- @param text the text to look up (search terms, page title or page name,
--   depending on page_type); it is url-escaped before being sent
-- @param lang language code of the wikipedia to query (nil: default_lang)
-- @param page_type one of WIKIPEDIA_INTRO (search), WIKIPEDIA_FULL (full
--   page extract) or WIKIPEDIA_PHTML (parsed html)
-- @param plain for WIKIPEDIA_INTRO and WIKIPEDIA_FULL: when true, request
--   plain text extracts ("explaintext="); ignored for WIKIPEDIA_PHTML
-- @return the decoded JSON table, or nothing when page_type is unknown, the
--   HTTP status is not 200, or the body is not decodable JSON
-- Raises "Network is unreachable" when the request gets no response headers.
function Wikipedia:loadPage(text, lang, page_type, plain)
    local socket = require('socket')
    local url = require('socket.url')
    local http = require('socket.http')
    local https = require('ssl.https')
    local ltn12 = require('ltn12')

    -- Select the fixed parameters and the name of the parameter carrying
    -- the looked up text
    local params, lookup_key, plain_aware
    if page_type == WIKIPEDIA_INTRO then -- search query
        params, lookup_key, plain_aware = self.wiki_search_params, "gsrsearch", true
    elseif page_type == WIKIPEDIA_FULL then -- full page content
        params, lookup_key, plain_aware = self.wiki_params, "titles", true
    elseif page_type == WIKIPEDIA_PHTML then -- parsed html page content
        params, lookup_key, plain_aware = self.wiki_phtml_params, "page", false
    else
        return
    end

    -- Build the query string. 'explaintext' is decided per call from 'plain'
    -- without modifying the shared parameter tables.
    local query_parts = {}
    for k, v in pairs(params) do
        if not (plain_aware and k == "explaintext") then
            table.insert(query_parts, string.format("%s=%s", k, v))
        end
    end
    if plain_aware and plain then
        table.insert(query_parts, "explaintext=")
    end
    table.insert(query_parts, lookup_key .. "=" .. url.escape(text))

    local parsed = url.parse(self:getWikiServer(lang))
    parsed.path = self.wiki_path
    parsed.query = table.concat(query_parts, "&")

    -- HTTP request
    local sink = {}
    local request = {
        url = url.build(parsed),
        method = 'GET',
        sink = ltn12.sink.table(sink),
    }
    http.TIMEOUT, https.TIMEOUT = 10, 10
    local httpRequest = parsed.scheme == 'http' and http.request or https.request
    -- skip the first returned value: what follows is code, headers, status
    local code, headers, status = socket.skip(1, httpRequest(request))

    -- raise error message when network is unavailable
    if headers == nil then
        error("Network is unreachable")
    end

    -- Check the numeric status code rather than the full status line, which
    -- also carries the protocol version and a free-form reason phrase.
    if code ~= 200 then
        logger.warn("HTTP status not okay:", status)
        return
    end

    local content = table.concat(sink)
    if content ~= "" and string.sub(content, 1, 1) == "{" then
        local ok, result = pcall(JSON.decode, content)
        if ok and result then
            logger.dbg("wiki result", result)
            return result
        else
            logger.warn("wiki error:", result)
        end
    else
        logger.warn("not JSON from wiki response:", content)
    end
end
|
|
|
|
--- Search wikipedia and get the intros of the results.
-- @param text text to search for
-- @param lang language code of the wikipedia to query
-- @return the 'query.pages' part of the answer, or nothing when there is
--   no answer or no 'query' in it
function Wikipedia:wikintro(text, lang)
    local answer = self:loadPage(text, lang, WIKIPEDIA_INTRO, true)
    if not answer then
        return
    end
    local found = answer.query
    if not found then
        return
    end
    return found.pages
end
|
|
|
|
--- Get the full (plain text) content of a wiki page.
-- When self.wiki_prettify is set, each page extract is passed through
-- prettifyText() before being returned.
-- @param text title of the page
-- @param lang language code of the wikipedia to query
-- @return the 'query.pages' part of the answer, or nothing when there is
--   no answer or no 'query' in it
function Wikipedia:wikifull(text, lang)
    local answer = self:loadPage(text, lang, WIKIPEDIA_FULL, true)
    if not answer then
        return
    end
    local found = answer.query
    if not found then
        return
    end
    if self.wiki_prettify then
        -- Prettification of the plain text full page
        for __, wiki_page in pairs(found.pages) do
            local extract = wiki_page.extract
            if extract then
                wiki_page.extract = self:prettifyText(extract)
            end
        end
    end
    return found.pages
end
|
|
|
|
--- Get the parsed html content and other infos of a wiki page.
-- @param text name of the page
-- @param lang language code of the wikipedia to query
-- @return the 'parse' part of the answer; nothing when the answer has
--   neither a 'parse' part nor an error description
-- Raises an error when loadPage() gave no answer at all, or with the
-- 'error.info' message when the API answered with an error.
function Wikipedia:wikiphtml(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_PHTML, true)
    if not result then
        -- loadPage() returns nothing on a bad HTTP status or a non-JSON
        -- body: report it explicitly instead of failing on indexing nil.
        error("No usable response from Wikipedia")
    end
    if result.parse then
        return result.parse
    end
    if result.error and result.error.info then
        error(result.error.info)
    end
end
|
|
|
|
-- UTF-8 encoded unicode geometrical shapes, used to replace the
-- "=== title ===" markers of wikipedia plain text pages.
-- The ones picked here are available in most fonts (prettier symbols exist
-- in unicode, but only a few fonts have them) and have a quite consistent
-- size/weight across fonts.
local th1_sym = "\xE2\x96\x88" -- U+2588 full block (big black rectangle) (never met, only for web page title?)
local th2_sym = "\xE2\x96\x89" -- U+2589 big black square
local th3_sym = "\xC2\xA0\xE2\x97\x86" -- U+00A0 U+25C6 black diamond (indented with a no-break space, nicer)
local th4_sym = "\xE2\x97\xA4" -- U+25E4 black upper left triangle
local th5_sym = "\xE2\x9C\xBF" -- U+273F black florette
local th6_sym = "\xE2\x9D\x96" -- U+2756 black diamond minus white x
-- Other candidates available in most fonts:
-- local thX_sym = "\xE2\x9C\x9A" -- heavy greek cross
-- local thX_sym = "\xE2\x97\xA2" -- black lower right triangle
-- local thX_sym = "\xE2\x97\x89" -- fish eye
-- local thX_sym = "\xE2\x96\x97" -- quadrant lower right
|
|
|
|
--- Optional prettification of the plain text of a full page.
-- Applies, in order, a fixed list of pattern substitutions that replace the
-- "== title ==" section markers with symbols and tidy up empty lines.
-- @param text plain text of the page
-- @return the prettified text
function Wikipedia:prettifyText(text)
    -- Each entry is { lua pattern, replacement }. Order matters.
    -- "\a" stands for an additional leading "\n" that must survive the
    -- shortening of multiple new lines done near the end.
    local substitutions = {
        { "\n= ", "\n\a"..th1_sym.." " }, -- 2 empty lines before
        { "\n== ", "\n\a"..th2_sym.." " }, -- 2 empty lines before
        { "\n=== ", "\n"..th3_sym.." " },
        { "\n==== ", "\n"..th4_sym.." " },
        { "\n===== ", "\n"..th5_sym.." " },
        { "\n====== ", "\n"..th6_sym.." " },
        { "Modifier ==", " ==" }, -- fr wikipedia fix for some articles modified by clumsy editors
        { "==$", "==\n" }, -- so a </hN> at end of text gets matched by the next ones
        { " ===?\n+", "\n\n" }, -- </h2> to </h3> : empty line after
        { " ====+\n+", "\n" }, -- </h4> to </hN> : single \n, no empty line
        { "\n\n+\xE2\x80\x94", "\n\xE2\x80\x94" }, -- em dash, used for quote author: stick it to previous text
        { "\n +\n", "\n" }, -- lines made of spaces only (often seen in math formulas)
        { "^\n*", "" }, -- new lines at start
        { "\n*$", "" }, -- new lines at end
        { "\n\n+", "\n\n" }, -- shorten multiple new lines
        { "\a", "\n" }, -- put back our wished \n
    }
    local pretty = text
    for i = 1, #substitutions do
        local rule = substitutions[i]
        pretty = pretty:gsub(rule[1], rule[2])
    end
    return pretty
end
|
|
|
|
|
|
--- Fetch the content at an http:// or https:// url with a GET request.
-- @param url the url to fetch
-- @param timeout timeout set on the requester module (defaults to 10)
-- @return true and the response body on success; false and an error
--   message otherwise
local function getUrlContent(url, timeout)
    local socket = require('socket')
    local ltn12 = require('ltn12')
    local requester
    if url:sub(1,7) == "http://" then
        requester = require('socket.http')
    elseif url:sub(1,8) == "https://" then
        requester = require('ssl.https')
    else
        return false, "Unsupported protocol"
    end
    requester.TIMEOUT = timeout or 10
    local sink = {}
    local request = {
        url = url,
        method = 'GET',
        sink = ltn12.sink.table(sink),
    }
    -- skip the first returned value: what follows is code, headers, status
    local code, headers, status = socket.skip(1, requester.request(request))

    if headers == nil then
        logger.warn("No HTTP headers")
        return false, "Network unavailable"
    end
    -- Check the numeric status code rather than the full status line, which
    -- also carries the protocol version and a free-form reason phrase.
    if code ~= 200 then
        logger.warn("HTTP status not okay:", status)
        return false, "Network unavailable"
    end

    return true, table.concat(sink)
end
|
|
|
|
-- UTF-8 encoded unicode geometrical shapes that get prepended to wikipedia
-- section headers, to help identifying the hierarchy (otherwise, only the
-- small font size differences help).
-- Best kept identical to the ones used above for prettifying the full plain
-- text page.
-- The ones picked here are available in most fonts (prettier symbols exist
-- in unicode, but only a few fonts have them) and have a quite consistent
-- size/weight across fonts.
local h1_sym = "\xE2\x96\x88" -- U+2588 full block (big black rectangle) (never met, only for web page title?)
local h2_sym = "\xE2\x96\x89" -- U+2589 big black square
local h3_sym = "\xE2\x97\x86" -- U+25C6 black diamond
local h4_sym = "\xE2\x97\xA4" -- U+25E4 black upper left triangle
local h5_sym = "\xE2\x9C\xBF" -- U+273F black florette
local h6_sym = "\xE2\x9D\x96" -- U+2756 black diamond minus white x
-- Other candidates available in most fonts:
-- local hXsym = "\xE2\x9C\x9A" -- heavy greek cross
-- local hXsym = "\xE2\x97\xA2" -- black lower right triangle
-- local hXsym = "\xE2\x97\x89" -- fish eye
-- local hXsym = "\xE2\x96\x97" -- quadrant lower right
|
|
|
|
-- Maps a lowercase file extension to the media-type declared for that
-- file in the EPUB manifest.
local ext_to_mimetype = {
    -- images
    png = "image/png",
    jpg = "image/jpeg",
    jpeg = "image/jpeg",
    gif = "image/gif",
    svg = "image/svg+xml",
    -- documents
    html = "application/xhtml+xml",
    xhtml = "application/xhtml+xml",
    ncx = "application/x-dtbncx+xml",
    -- scripts and styles
    js = "text/javascript",
    css = "text/css",
    -- fonts
    otf = "application/opentype",
    ttf = "application/truetype",
    woff = "application/font-woff",
}
|
|
|
|
|
|
--- Create an epub file (with possibly images) from a wikipedia page.
-- Progress is shown and questions are asked through the Trapper module.
-- @param epub_path path of the EPUB file to create (it is first built as
--   epub_path..".tmp", and renamed only when complete)
-- @param page name of the wikipedia page
-- @param lang language code of the wikipedia to fetch the page from
-- @param with_images when true, the user is asked whether images should be
--   downloaded and included
-- @return true when the EPUB has been created, false otherwise (page could
--   not be fetched, file could not be opened or renamed, or user cancelled)
function Wikipedia:createEpub(epub_path, page, lang, with_images)
    -- Use Trapper to display progress and ask questions through the UI.
    -- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
    -- Trapper:info() and Trapper:confirm() will just use logger.
    local UI = require("ui/trapper")

    UI:info(_("Fetching Wikipedia page…"))
    local ok, phtml = pcall(self.wikiphtml, self, page, lang)
    if not ok then
        UI:info(phtml) -- display error in InfoMessage
        -- Sleep a bit to make that error seen
        util.sleep(2)
        UI:reset()
        return false
    end

    -- We may need to build absolute urls for non-absolute links and images urls
    local wiki_base_url = self:getWikiServer(lang)

    -- Get infos from wikipedia result
    -- (see example at https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection)
    local cancelled = false
    local html = phtml.text["*"] -- html content
    local page_cleaned = page:gsub("_", " ") -- page title
    local page_htmltitle = phtml.displaytitle -- page title with possible <sup> tags
    local sections = phtml.sections -- Wikipedia provided TOC
    local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
    -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
    -- should it change if content is updated (as now, including the wikipedia revisionId),
    -- or should it stay the same even if revid changes (content of the same book updated).

    -- We need to find images in HTML to tell how many when asking user if they should be included
    local images = {}
    local seen_images = {}
    local imagenum = 1
    local cover_imgid = "" -- best candidate for cover among our images
    -- gsub() callback: registers the image of an <img> tag and returns the
    -- replacement tag (or nil to keep the original tag untouched)
    local processImg = function(img_tag)
        local src = img_tag:match([[src="([^"]*)"]])
        if src == nil or src == "" then
            logger.info("no src found in ", img_tag)
            return nil
        end
        if src:sub(1,2) == "//" then
            src = "https:" .. src -- Wikipedia redirects from http to https, so use https
        elseif src:sub(1,1) == "/" then -- non absolute url
            src = wiki_base_url .. src
        end
        local cur_image
        if seen_images[src] then -- already seen
            cur_image = seen_images[src]
        else
            local ext = src:match(".*%.(%S+)")
            if ext == nil or ext == "" then -- we won't know what mimetype to use, ignore it
                logger.info("no file extension found in ", src)
                return nil
            end
            ext = ext:lower()
            local imgid = string.format("img%05d", imagenum)
            local imgpath = string.format("images/%s.%s", imgid, ext)
            local mimetype = ext_to_mimetype[ext] or ""
            local width = tonumber(img_tag:match([[width="([^"]*)"]]))
            local height = tonumber(img_tag:match([[height="([^"]*)"]]))
            -- Get higher resolution (2x) image url
            local src2x = nil
            local srcset = img_tag:match([[srcset="([^"]*)"]])
            if srcset then
                srcset = " "..srcset.. ", " -- for next pattern to possibly match 1st or last item
                src2x = srcset:match([[ (%S+) 2x, ]])
                if src2x then
                    if src2x:sub(1,2) == "//" then
                        src2x = "https:" .. src2x
                    elseif src2x:sub(1,1) == "/" then -- non absolute url
                        src2x = wiki_base_url .. src2x
                    end
                end
            end
            cur_image = {
                imgid = imgid,
                imgpath = imgpath,
                src = src,
                src2x = src2x,
                mimetype = mimetype,
                width = width,
                height = height,
            }
            table.insert(images, cur_image)
            seen_images[src] = cur_image
            -- Use first image of reasonable size (not an icon) and portrait-like as cover-image
            if cover_imgid == "" and width and width > 50 and height and height > 50 and height > width then
                cover_imgid = imgid
            end
            imagenum = imagenum + 1
        end
        -- crengine will NOT use width and height attributes, but it will use
        -- those found in a style attribute.
        -- If we get src2x images, crengine will scale them down to the 1x image size
        -- (less space wasted by images while reading), but the 2x quality will be
        -- there when image is viewed full screen with ImageViewer widget.
        local style_props = {}
        if cur_image.width then
            table.insert(style_props, string.format("width: %spx", cur_image.width))
        end
        if cur_image.height then
            table.insert(style_props, string.format("height: %spx", cur_image.height))
        end
        local style = table.concat(style_props, "; ")
        return string.format([[<img src="%s" style="%s" alt=""/>]], cur_image.imgpath, style)
    end
    html = html:gsub("(<%s*img [^>]*>)", processImg)
    logger.dbg("Images found in html:", images)

    -- See what to do with images
    local include_images = false
    local use_img_2x = false
    if with_images then
        -- If no UI (Trapper:wrap() not called), UI:confirm() will answer true
        if #images > 0 then
            include_images = UI:confirm(T(_("The page contains %1 images.\nWould you like to download and include them in the generated EPUB file?"), #images), _("Don't include"), _("Include"))
            if include_images then
                use_img_2x = UI:confirm(_("Would you like to use slightly higher quality images? This will result in a bigger file size."), _("Standard quality"), _("Higher quality"))
            end
        else
            UI:info(_("The page does not contain any images."))
            util.sleep(1) -- Let the user see that
        end
    end
    if not include_images then
        -- Remove img tags to avoid little blank squares of missing images
        html = html:gsub("<%s*img [^>]*>", "")
        -- We could remove the whole image container <div class="thumb"...> ,
        -- but it's a lot of nested <div> and not easy to do.
        -- So the user will see the image legends and know a bit about
        -- the images he chose to not get.
    end

    UI:info(_("Building EPUB…"))
    -- Open the zip file (with .tmp for now, as crengine may still
    -- have a handle to the final epub_path, and we don't want to
    -- delete a good one if we fail/cancel later)
    local epub_path_tmp = epub_path .. ".tmp"
    local ZipWriter = require("ffi/zipwriter")
    local epub = ZipWriter:new{}
    if not epub:open(epub_path_tmp) then
        return false
    end

    -- We now create and add all the required epub files

    -- ----------------------------------------------------------------
    -- /mimetype : always "application/epub+zip"
    epub:add("mimetype", "application/epub+zip")

    -- ----------------------------------------------------------------
    -- /META-INF/container.xml : always the same content
    epub:add("META-INF/container.xml", [[
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>]])

    -- ----------------------------------------------------------------
    -- OEBPS/content.opf : metadata + list of other files (paths relative to OEBPS/ directory)
    -- Other possible items in this file that are of no interest to crengine :
    --   In <manifest> :
    --     <item id="cover" href="title.html" media-type="application/xhtml+xml"/>
    --     <item id="cover-image" href="images/cover.png" media-type="image/png"/>
    --   (crengine only uses <meta name="cover" content="cover-image" /> to get the cover image)
    --   In <spine toc="ncx"> :
    --     <itemref idref="cover" linear="no"/>
    --   And a <guide> section :
    --     <guide>
    --       <reference href="title.html" type="cover" title="Cover"/>
    --       <reference href="toc.html" type="toc" title="Table of Contents" href="toc.html" />
    --     </guide>
    local koreader_version = "KOReader"
    if lfs.attributes("git-rev", "mode") == "file" then
        -- Read the first line, making sure the handle gets closed, and
        -- coping with a file that can't be opened or is empty
        local rev_file = io.open("git-rev", "r")
        if rev_file then
            local rev = rev_file:read()
            rev_file:close()
            if rev then
                koreader_version = "KOReader "..rev
            end
        end
    end
    local content_opf_parts = {}
    -- head
    table.insert(content_opf_parts, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
        xmlns:dc="http://purl.org/dc/elements/1.1/"
        unique-identifier="bookid" version="2.0">
  <metadata>
    <dc:title>%s</dc:title>
    <dc:creator>Wikipedia %s</dc:creator>
    <dc:identifier id="bookid">%s</dc:identifier>
    <dc:language>%s</dc:language>
    <dc:publisher>%s</dc:publisher>
    <meta name="cover" content="%s"/>
  </metadata>
  <manifest>
    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
    <item id="content" href="content.html" media-type="application/xhtml+xml"/>
    <item id="css" href="stylesheet.css" media-type="text/css"/>
]], page_cleaned, lang:upper(), bookid, lang, koreader_version, cover_imgid))
    -- images files
    if include_images then
        for __, img in ipairs(images) do
            table.insert(content_opf_parts, string.format([[    <item id="%s" href="%s" media-type="%s"/>%s]], img.imgid, img.imgpath, img.mimetype, "\n"))
        end
    end
    -- tail
    table.insert(content_opf_parts, [[
  </manifest>
  <spine toc="ncx">
    <itemref idref="content"/>
  </spine>
</package>
]])
    epub:add("OEBPS/content.opf", table.concat(content_opf_parts))

    -- ----------------------------------------------------------------
    -- OEBPS/stylesheet.css
    -- crengine will use its own data/epub.css, we just add/fix a few styles
    -- to look more alike wikipedia web pages (that the user can ignore
    -- with "Embedded Style" off)
    epub:add("OEBPS/stylesheet.css", [[
/* make section headers looks left aligned and avoid some page breaks */
h1, h2 {
    text-align: left;
}
h3, h4, h5, h6, h7 {
    page-break-before: avoid;
    page-break-after: avoid;
    text-align: left;
}
/* avoid page breaks around our centered titles on first page */
h1.koreaderwikifrontpage, h5.koreaderwikifrontpage {
    page-break-before: avoid;
    page-break-inside: avoid;
    page-break-after: avoid;
    text-align: center;
    margin-top: 0em;
}
p.koreaderwikifrontpage {
    font-style: italic;
    font-size: 90%;
    margin-left: 2em;
    margin-right: 2em;
    margin-top: 1em;
    margin-bottom: 1em;
}
hr.koreaderwikifrontpage {
    margin-left: 20%;
    margin-right: 20%;
    margin-bottom: 1.2em;
}
/* So many links, make them look like normal text except for underline */
a {
    display:inline;
    text-decoration: underline;
    color: black;
    font-weight: normal;
}
/* No underline for links without their href that we removed */
a.newwikinonexistent {
    text-decoration: none;
}
/* show a box around image thumbnails */
div.thumb {
    width: 80%;
    border: dotted 1px black;
    margin-top: 0.5em;
    margin-bottom: 0.5em;
    margin-left: 2.5em;
    margin-right: 2.5em;
    padding-top: ]].. (include_images and "0.5em" or "0.15em") .. [[;
    padding-bottom: 0.2em;
    padding-left: 0.5em;
    padding-right: 0.5em;
    text-align: center;
    font-size: 90%;
}
/* don't waste left margin for notes and list of pages */
ul, ol {
    margin-left: 0em;
}
/* helps crengine to not display them as block elements */
time, abbr, sup {
    display: inline;
}
]])

    -- ----------------------------------------------------------------
    -- OEBPS/toc.ncx : table of content
    local toc_ncx_parts = {}
    local depth = 0
    local cur_level = 0
    local np_end = [[</navPoint>]]
    local num = 1
    -- Add our own first section for first page, with page name as title
    table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html"/>]], num, num, page_cleaned))
    table.insert(toc_ncx_parts, np_end)
    -- Wikipedia sections items seem to be already sorted by index, so no need to sort
    for __, s in ipairs(sections) do
        num = num + 1
        local s_anchor = s.anchor
        local s_title = string.format("%s %s", s.number, s.line)
        s_title = (s_title:gsub("(%b<>)", "")) -- titles may include <i> and other html tags
        local s_level = s.toclevel
        if s_level > depth then
            depth = s_level -- max depth required in toc.ncx
        end
        if s_level == cur_level then
            table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint
        elseif s_level < cur_level then
            table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint
            while s_level < cur_level do -- close all in-between navPoint
                table.insert(toc_ncx_parts, np_end)
                cur_level = cur_level - 1
            end
        elseif s_level > cur_level + 1 then
            -- a jump from level N to level N+2 or more ... should not happen
            -- per epub spec, but we don't know about wikipedia...
            -- so we create missing intermediate navPoints with same anchor as current section
            while s_level > cur_level + 1 do
                table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level))
                table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>-</text></navLabel><content src="content.html#%s"/>]], num, num, s_anchor))
                cur_level = cur_level + 1
                num = num + 1
            end
        -- elseif s_level == cur_level + 1 then
        --     sublevel, nothing to close, nothing to add
        end
        cur_level = s_level
        table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level)) -- indentation, in case a person looks at it
        table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html#%s"/>]], num, num, s_title, s_anchor))
    end
    -- close nested <navPoint>
    while cur_level > 0 do
        table.insert(toc_ncx_parts, np_end)
        cur_level = cur_level - 1
    end
    -- Prepend NCX head (done last, as it needs the computed depth)
    table.insert(toc_ncx_parts, 1, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
  <head>
    <meta name="dtb:uid" content="%s"/>
    <meta name="dtb:depth" content="%s"/>
    <meta name="dtb:totalPageCount" content="0"/>
    <meta name="dtb:maxPageNumber" content="0"/>
  </head>
  <docTitle>
    <text>%s</text>
  </docTitle>
  <navMap>
]], bookid, depth, page_cleaned))
    -- Append NCX tail
    table.insert(toc_ncx_parts, [[
  </navMap>
</ncx>
]])
    epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts))

    -- ----------------------------------------------------------------
    -- OEBPS/content.html
    -- Some small fixes to Wikipedia HTML to make crengine and the user happier

    -- Most images are in a link to the image info page, which is a useless
    -- external link for us, so let's remove this link.
    html = html:gsub("<a[^>]*>%s*(<%s*img [^>]*>)%s*</a>", "%1")

    -- For some <div class="thumb tright"> , which include nested divs, although
    -- perfectly balanced, crengine seems to miss some closing </div> and we
    -- end up having our image bordered box including the remaining main wiki text.
    -- It looks like this code is supposed to deal with class= containing multiple
    -- class names :
    --   https://github.com/koreader/crengine/commit/0930ec7230e720c148fd6f231d69558832b4d53a
    -- and that it may stumble on some cases.
    -- It's all perfectly fine if we make all these div with a single class name
    --   html = html:gsub([[<div class="thumb [^"]*">]], [[<div class="thumb">]])
    --
    -- But we may as well make all class= have a single name to avoid other problems
    -- (no real risk with that, as we don't define any style for wikipedia class names,
    -- except div.thumb that always appears first).
    html = html:gsub([[(<[^>]* class="[^ "]+)%s+[^"]*"]], [[%1"]])

    -- crengine seems to consider unknown tag as 'block' elements, so we may
    -- want to remove or replace those that should be considered 'inline' elements
    html = html:gsub("</?time[^>]*>", "")

    -- Fix internal wikipedia links with full server url (including lang) so
    -- ReaderLink can notice them and deal with them with a LookupWikipedia event.
    --   html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]])
    --
    -- Also, crengine deals strangely with percent encoded utf8 :
    -- if the link in the html is : <a href="http://fr.wikipedia.org/wiki/Fran%C3%A7oix">
    -- we get from credocument:getLinkFromPosition() : http://fr.wikipedia.org/wiki/Fran____oix
    -- These are bytes "\xc3\x83\xc2\xa7", that is U+C3 and U+A7 encoded as UTF8,
    -- when we should have get "\xc3\xa7" ...
    -- We can avoid that by putting in the url plain unencoded UTF8
    local hex_to_char = function(x) return string.char(tonumber(x, 16)) end
    local fixEncodedWikiPageTitle = function(wiki_page)
        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
        return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
    end
    html = html:gsub([[href="/wiki/([^"]*)"]], fixEncodedWikiPageTitle)

    -- Remove href from links to non existant wiki page so they are not clickable :
    -- <a href="/w/index.php?title=PageTitle&action=edit&redlink=1" class="new" title="PageTitle">PageTitle____on</a>
    -- (removal of the href="" will make them non clickable)
    html = html:gsub([[<a[^>]* class="new"[^>]*>]], [[<a class="newwikinonexistent">]])

    -- Fix some other protocol-less links to wikipedia (href="//fr.wikipedia.org/w/index.php..)
    html = html:gsub([[href="//]], [[href="https://]])

    -- crengine does not return link if multiple class names in <a> (<a class="external text" href="">)
    -- it would be no problem as we can't follow them, but when the user tap
    -- on it, the tap is propagated to other widgets and page change happen...
    --   html = html:gsub([[<a rel="nofollow" class="external text"]], [[<a rel="nofollow" class="externaltext"]])
    --   html = html:gsub([[<a class="external text"]], [[<a class="externaltext"]])
    -- Solved by our multiple class names suppression above

    -- Avoid link being clickable before <a> (if it starts a line) or after </a> (if it
    -- ends a line or a block) by wrapping it with U+200B ZERO WIDTH SPACE which will
    -- make the DOM tree walking code to find a link stop at it.
    --   html = html:gsub("(<[aA])", "\xE2\x80\x8B%1")
    --   html = html:gsub("(</[aA]>)", "%1\xE2\x80\x8B")
    -- Fixed in crengine lvtinydom.

    if self.wiki_prettify then
        -- Prepend some symbols to section titles for a better visual feeling of hierarchy
        html = html:gsub("<h1>", "<h1> "..h1_sym.." ")
        html = html:gsub("<h2>", "<h2> "..h2_sym.." ")
        html = html:gsub("<h3>", "<h3> "..h3_sym.." ")
        html = html:gsub("<h4>", "<h4> "..h4_sym.." ")
        html = html:gsub("<h5>", "<h5> "..h5_sym.." ")
        html = html:gsub("<h6>", "<h6> "..h6_sym.." ")
    end

    -- Note: in all the gsub patterns above, we used lowercase for tags and attributes
    -- because it's how they are in wikipedia HTML and it makes the pattern simple.
    -- If one day this changes, they'll have to be replaced with href => [Hh][Rr][Ee][Ff] ...

    -- We can finally build the final HTML with some header of our own
    local saved_on = T(_("Saved on %1"), os.date("%b %d, %Y %H:%M:%S"))
    local online_version_htmllink = string.format([[<a href="%s/wiki/%s">%s</a>]], wiki_base_url, page:gsub(" ", "_"), _("online version"))
    local see_online_version = T(_("See %1 for up-to-date content"), online_version_htmllink)
    epub:add("OEBPS/content.html", string.format([[
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>%s</title>
  <link type="text/css" rel="stylesheet" href="stylesheet.css"/>
</head>
<body>
<h1 class="koreaderwikifrontpage">%s</h1>
<h5 class="koreaderwikifrontpage">Wikipedia %s</h5>
<p class="koreaderwikifrontpage">%s<br/>%s</p>
<hr class="koreaderwikifrontpage"/>
%s
</body>
</html>
]], page_cleaned, page_htmltitle, lang:upper(), saved_on, see_online_version, html))

    -- ----------------------------------------------------------------
    -- OEBPS/images/*
    if include_images then
        local nb_images = #images
        for inum, img in ipairs(images) do
            -- Process can be interrupted at this point between each image download
            -- by tapping while the InfoMessage is displayed
            local go_on = UI:info(T(_("Fetching image %1 / %2 …"), inum, nb_images))
            if not go_on then
                cancelled = true
                break
            end
            local src = img.src
            if use_img_2x and img.src2x then
                src = img.src2x
            end
            logger.dbg("Getting img ", src)
            local success, content = getUrlContent(src)
            -- success, content = getUrlContent(src..".unexistant") -- to simulate failure
            if success then
                logger.dbg("success, size:", #content)
                epub:add("OEBPS/"..img.imgpath, content)
            else
                logger.info("failed fetching:", src)
                go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue"))
                if not go_on then
                    cancelled = true
                    break
                end
            end
        end
    end

    -- Done with adding files
    if cancelled then
        if UI:confirm(_("Download did not complete.\nDo you want to create an EPUB with the already downloaded images?"), _("Don't create"), _("Create")) then
            cancelled = false
        end
    end
    if cancelled then
        UI:info(_("Canceled. Cleaning up…"))
    else
        UI:info(_("Packing EPUB…"))
    end
    epub:close()
    -- This was nearly a no-op, so sleep a bit to make that progress step seen
    util.usleep(300000)
    UI:reset() -- close last InfoMessage

    if cancelled then
        -- Build was cancelled, remove half created .epub
        if lfs.attributes(epub_path_tmp, "mode") == "file" then
            os.remove(epub_path_tmp)
        end
        return false
    end

    -- Finally move the .tmp to the final file, and don't report a success
    -- if that failed (there would be no new file at epub_path)
    local renamed, rename_err = os.rename(epub_path_tmp, epub_path)
    if not renamed then
        logger.warn("failed renaming", epub_path_tmp, "to", epub_path, ":", rename_err)
        os.remove(epub_path_tmp)
        return false
    end
    logger.info("successfully created:", epub_path)
    return true
end
|
|
|
|
|
|
--- Run Wikipedia:createEpub() with UI progress info and questions, both
-- provided by the Trapper module.
-- @param epub_path path of the EPUB file to create
-- @param page name of the wikipedia page
-- @param lang language code of the wikipedia to fetch the page from
-- @param result_callback function called with true when the EPUB has been
--   created, and with false otherwise
function Wikipedia:createEpubWithUI(epub_path, page, lang, result_callback)
    -- Any UI interaction while building the EPUB requires a coroutine, so
    -- that our code can be suspended while waiting for user interaction,
    -- and resumed by UI widgets callbacks.
    -- Trapper hides all of that behind a simple API.
    local Trapper = require("ui/trapper")

    local function buildEpub()
        Trapper:setPausedText("Download paused")
        -- An error raised in Wikipedia:createEpub() would just abort the
        -- coroutine used by Trapper (no reader crash, no error logged).
        -- So it is caught with pcall, logged, and the failure is reported
        -- via result_callback.
        local ok, success = pcall(self.createEpub, self, epub_path, page, lang, true)
        if not (ok and success) then
            Trapper:reset() -- close any last widget not cleaned if error
            logger.warn("Wikipedia.createEpub pcall:", ok, success)
            result_callback(false)
            return
        end
        result_callback(true)
    end

    Trapper:wrap(buildEpub)
end
|
|
|
|
return Wikipedia
|