local JSON = require("json")
local RenderImage = require("ui/renderimage")
local Screen = require("device").screen
local ffiutil = require("ffi/util")
local logger = require("logger")
local util = require("util")
local _ = require("gettext")
local T = ffiutil.template

--[[
-- Query wikipedia using Wikimedia Web API.
-- https://en.wikipedia.org/w/api.php?format=jsonfm&action=query&generator=search&gsrnamespace=0&gsrsearch=ereader&gsrlimit=10&prop=extracts&exintro&explaintext&exlimit=max
-- https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=jsonfm&explaintext=&redirects=&titles=E-reader
--
-- To get parsed HTML:
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection
-- https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
--]]

local Wikipedia = {
    wiki_server = "https://%s.wikipedia.org",
    wiki_path = "/w/api.php",
    default_lang = "en",
    -- See https://www.mediawiki.org/wiki/API:Main_page for details.
    -- Search query, returns introductory texts (+ main thumbnail image)
    wiki_search_params = {
        action = "query",
        generator = "search",
        gsrnamespace = "0",
        -- gsrsearch = nil, -- text to look up, will be added below
        gsrlimit = 20, -- max number of results to get
        exlimit = "max",
        prop = "extracts|info|pageimages", -- 'extracts' to get text, 'info' to get full page length
        format = "json",
        explaintext = "",
        exintro = "",
        -- We have to use 'exintro=' to get extracts for ALL results
        -- (otherwise, we get the full text for only the first result, and
        -- no text at all for the others)
    },
    -- Full article, parsed to output text (+ main thumbnail image)
    wiki_full_params = {
        action = "query",
        prop = "extracts|pageimages",
        format = "json",
        -- exintro = nil, -- get more than only the intro
        explaintext = "",
        redirects = "",
        -- title = nil, -- text to look up, will be added below
    },
    -- Full article, parsed to output HTML, for Save as EPUB
    wiki_phtml_params = {
        action = "parse",
        format = "json",
        -- we only need the following information
        prop = "text|sections|displaytitle|revid",
        -- page = nil, -- text to look up, will be added below
        -- disabletoc = "", -- if we want to remove the toc IN html
        disablelimitreport = "",
        disableeditsection = "",
    },
    -- Full article, parsed to output HTML, for images extraction
    -- (used with full article as text, if "show more images" enabled)
    wiki_images_params = { -- same as previous one, with just text html
        action = "parse",
        format = "json",
        -- we only need the following information
        prop = "text",
        -- page = nil, -- text to look up, will be added below
        redirects = "",
        disabletoc = "", -- remove toc in html
        disablelimitreport = "",
        disableeditsection = "",
    },
    -- There is an alternative for obtaining a page's images:
    -- prop=imageinfo&action=query&iiprop=url|dimensions|mime|extmetadata&generator=images&pageids=49448&iiurlwidth=100&iiextmetadatafilter=ImageDescription
    -- but it gives all images (including wikipedia icons) in any order, without
    -- any score or information that would help considering if they matter or not
    --

    -- Allow for disabling prettifying full page text
    wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"),

    -- Can be set so HTTP requests will be done under Trapper and
    -- be interruptible
    trap_widget = nil,
    -- For actions done with Trapper:dismissable methods, we may throw
    -- an error() with this code. We make the value of this error
    -- accessible here so that the caller can know it's a user dismiss.
    dismissed_error_code = "Interrupted by user",
}

function Wikipedia:getWikiServer(lang)
    return string.format(self.wiki_server, lang or self.default_lang)
end
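-- Illustrative results: Wikipedia:getWikiServer("fr") --> "https://fr.wikipedia.org";
-- with no lang given it falls back to default_lang: "https://en.wikipedia.org".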

-- Codes that getUrlContent may get from requester.request()
local TIMEOUT_CODE = "timeout" -- from socket.lua
local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime

-- Sink that stores into a table, aborting if maxtime has elapsed
local function sink_table_with_maxtime(t, maxtime)
    -- Start counting as soon as this sink is created
    local start_secs, start_usecs = ffiutil.gettime()
    local starttime = start_secs + start_usecs/1000000
    t = t or {}
    local f = function(chunk, err)
        local secs, usecs = ffiutil.gettime()
        if secs + usecs/1000000 - starttime > maxtime then
            return nil, MAXTIME_CODE
        end
        if chunk then table.insert(t, chunk) end
        return 1
    end
    return f, t
end

-- Get URL content
local function getUrlContent(url, timeout, maxtime)
    local socket = require('socket')
    local ltn12 = require('ltn12')
    local http = require('socket.http')
    local https = require('ssl.https')

    local requester
    if url:sub(1,7) == "http://" then
        requester = http
    elseif url:sub(1,8) == "https://" then
        requester = https
    else
        return false, "Unsupported protocol"
    end
    if not timeout then timeout = 10 end
    -- timeout needs to be set on 'http', even if we use 'https'
    http.TIMEOUT, https.TIMEOUT = timeout, timeout

    local request = {}
    local sink = {}
    request['url'] = url
    request['method'] = 'GET'
    -- 'timeout' delay works on the socket, and is triggered when
    -- that time has passed trying to connect, or after connection
    -- when no data has been read for this time.
    -- On a slow connection, it may not be triggered (as we could read
    -- 1 byte every 1 second, not triggering any timeout).
    -- 'maxtime' can be provided to overcome that, and we start counting
    -- as soon as the first content byte is received (but it is checked
    -- for only when data is received).
    -- Setting 'maxtime' and 'timeout' gives more chances to abort the request when
    -- it takes too much time (in the worst case: in timeout+maxtime seconds).
    -- But time taken by DNS lookup cannot easily be accounted for, so
    -- a request may (when dns lookup takes time) exceed timeout and maxtime...
    if maxtime then
        request['sink'] = sink_table_with_maxtime(sink, maxtime)
    else
        request['sink'] = ltn12.sink.table(sink)
    end

    local code, headers, status = socket.skip(1, requester.request(request))
    local content = table.concat(sink) -- empty or content accumulated till now
    -- logger.dbg("code:", code)
    -- logger.dbg("headers:", headers)
    -- logger.dbg("status:", status)
    -- logger.dbg("#content:", #content)

    if code == TIMEOUT_CODE or code == MAXTIME_CODE then
        logger.warn("request interrupted:", code)
        return false, code
    end
    if headers == nil then
        logger.warn("No HTTP headers:", code, status)
        return false, "Network or remote server unavailable"
    end
    if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK
        logger.warn("HTTP status not okay:", code, status)
        return false, "Remote server error or unavailable"
    end
    if headers and headers["content-length"] then
        -- Check we really got the announced content size
        local content_length = tonumber(headers["content-length"])
        if #content ~= content_length then
            return false, "Incomplete content received"
        end
    end
    return true, content
end
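
--[[ Minimal usage sketch for getUrlContent() (illustrative only; the URL is made up,
-- and real callers below pass their own timeout/maxtime values):
--
--   local ok, content = getUrlContent("https://en.wikipedia.org/w/api.php?action=parse&page=E-book&format=json",
--       10,  -- timeout: abort when the connection stalls for this many seconds
--       60)  -- maxtime: abort when the whole transfer takes longer than this
--   if ok then
--       logger.dbg("fetched", #content, "bytes")
--   else
--       logger.warn("fetch failed:", content) -- on failure, 'content' holds the reason
--   end
--]]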

function Wikipedia:setTrapWidget(trap_widget)
    self.trap_widget = trap_widget
end

function Wikipedia:resetTrapWidget()
    self.trap_widget = nil
end

-- Possible values for page_type parameter to loadPage()
local WIKIPEDIA_INTRO = 1
local WIKIPEDIA_FULL = 2
local WIKIPEDIA_PHTML = 3
local WIKIPEDIA_IMAGES = 4

--[[
-- return decoded JSON table from Wikipedia
--]]
function Wikipedia:loadPage(text, lang, page_type, plain)
    local url = require('socket.url')
    local query = ""
    local parsed = url.parse(self:getWikiServer(lang))
    parsed.path = self.wiki_path
    if page_type == WIKIPEDIA_INTRO then -- search query
        self.wiki_search_params.explaintext = plain and "" or nil
        for k,v in pairs(self.wiki_search_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "gsrsearch=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_FULL then -- full page content
        self.wiki_full_params.explaintext = plain and "" or nil
        for k,v in pairs(self.wiki_full_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "titles=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_PHTML then -- parsed html page content
        for k,v in pairs(self.wiki_phtml_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "page=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_IMAGES then -- images found in page html
        for k,v in pairs(self.wiki_images_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "page=" .. url.escape(text)
    else
        return
    end

    local built_url = url.build(parsed)
    local completed, success, content
    if self.trap_widget then -- if previously set with Wikipedia:setTrapWidget()
        local Trapper = require("ui/trapper")
        local timeout, maxtime = 30, 60
        -- We use dismissableRunInSubprocess with complex return values:
        completed, success, content = Trapper:dismissableRunInSubprocess(function()
            return getUrlContent(built_url, timeout, maxtime)
        end, self.trap_widget)
        if not completed then
            error(self.dismissed_error_code) -- "Interrupted by user"
        end
    else
        local timeout, maxtime = 10, 60
        success, content = getUrlContent(built_url, timeout, maxtime)
    end
    if not success then
        error(content)
    end

    if content ~= "" and string.sub(content, 1,1) == "{" then
        local ok, result = pcall(JSON.decode, content)
        if ok and result then
            logger.dbg("wiki result json:", result)
            return result
        else
            logger.warn("wiki result json decoding error:", result)
            error("Failed decoding JSON")
        end
    else
        logger.warn("wiki response is not json:", content)
        error("Response is not JSON")
    end
end

-- search wikipedia and get intros for results
function Wikipedia:searchAndGetIntros(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_INTRO, true)
    if result then
        local query = result.query
        if query then
            local show_image = G_reader_settings:nilOrTrue("wikipedia_show_image")
            -- Scale wikipedia normalized (we hope) thumbnail by 2 (adjusted
            -- to screen size/dpi) for intros (and x8 more for highres image)
            local image_size_factor = Screen:scaleBySize(200)/100.0
            if show_image then
                for pageid, page in pairs(query.pages) do
                    self:addImages(page, lang, false, image_size_factor, 8)
                end
            end
            return query.pages
        end
    end
end
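
--[[ Illustrative caller sketch. Each returned page carries the fields provided by the
-- MediaWiki 'extracts' generator (e.g. 'title' and 'extract') -- treat those field
-- names as an assumption here, they are not enforced by this module:
--
--   local pages = Wikipedia:searchAndGetIntros("ereader", "en")
--   if pages then
--       for _, p in pairs(pages) do
--           logger.dbg("found:", p.title, "intro length:", p.extract and #p.extract or 0)
--       end
--   end
--]]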

-- get full content of a wiki page
function Wikipedia:getFullPage(wiki_title, lang)
    local result = self:loadPage(wiki_title, lang, WIKIPEDIA_FULL, true)
    if result then
        local query = result.query
        if query then
            local show_image = G_reader_settings:nilOrTrue("wikipedia_show_image")
            local show_more_images = G_reader_settings:nilOrTrue("wikipedia_show_more_images")
            -- Scale wikipedia normalized (we hope) thumbnails by 4 (adjusted
            -- to screen size/dpi) for full page (and this *4 for highres image)
            local image_size_factor = Screen:scaleBySize(400)/100.0
            if self.wiki_prettify or show_image then
                for pageid, page in pairs(query.pages) do
                    if self.wiki_prettify and page.extract then
                        -- Prettification of the plain text full page
                        page.extract = self:prettifyText(page.extract)
                    end
                    if show_image then
                        self:addImages(page, lang, show_more_images, image_size_factor, 4)
                    end
                end
            end
            return query.pages
        end
    end
end

-- get parsed html content and other info of a wiki page
function Wikipedia:getFullPageHtml(wiki_title, lang)
    local result = self:loadPage(wiki_title, lang, WIKIPEDIA_PHTML, true)
    if result and result.parse then
        return result.parse
    end
    if result.error and result.error.info then
        error(result.error.info)
    end
end

-- get images extracted from parsed html
function Wikipedia:getFullPageImages(wiki_title, lang)
    local images = {} -- will be returned, each in a format similar to page.thumbnail
    local result = self:loadPage(wiki_title, lang, WIKIPEDIA_IMAGES, true)
    if result and result.parse and result.parse.text and result.parse.text["*"] then
        local html = result.parse.text["*"] -- html content
        local url = require('socket.url')
        local wiki_base_url = self:getWikiServer(lang)

        local thumbs = {} -- bits of HTML containing an image
        -- We first try to catch images in <div class=thumbinner>, which should exclude
        -- wikipedia icons, flags... These seem to all end with a double </div>.
        for thtml in html:gmatch([[<div class="thumbinner".-</div>%s*</div>]]) do
            table.insert(thumbs, thtml)
        end
        -- We then also try to catch images in galleries (which often are less
        -- interesting than those in thumbinner) as a 2nd set.
        for thtml in html:gmatch([[<li class="gallerybox".-<div class="thumb".-</div>%s*</div>%s*<div class="gallerytext">.-</div>%s*</div>]]) do
            table.insert(thumbs, thtml)
        end
        -- We may miss some interesting images in the page's top right table, but
        -- there's no easy way to distinguish them from icons/flags in this table...

        for _, thtml in ipairs(thumbs) do
            -- We get <a href="/wiki/File:real_file_name.jpg (or /wiki/Fichier:real_file_name.jpg
            -- depending on Wikipedia lang)
            local filename = thtml:match([[<a href="/wiki/[^:]*:([^"]*)" class="image"]])
            if filename then
                filename = url.unescape(filename)
            end
            logger.dbg("found image with filename:", filename)
            -- logger.dbg(thtml)
            local timg, tremain = thtml:match([[(<img .->)(.*)]])
            if timg and tremain then
                -- (Should we discard those without a caption?)
                local caption = tremain and util.htmlToPlainText(tremain)
                if caption == "" then caption = nil end
                logger.dbg(" caption:", caption)
                -- logger.dbg(timg)
                local src = timg:match([[src="([^"]*)"]])
                if src and src ~= "" then
                    if src:sub(1,2) == "//" then
                        src = "https:" .. src
                    elseif src:sub(1,1) == "/" then -- non absolute url
                        src = wiki_base_url .. src
                    end
                    local width = tonumber(timg:match([[width="([^"]*)"]]))
                    local height = tonumber(timg:match([[height="([^"]*)"]]))
                    -- Ignore img without width and height, which should exclude
                    -- javascript maps and other unsupported stuff
                    if width and height then
                        -- Images in the html we got seem to be x4.5 the size of
                        -- the thumbnail we get with searchAndGetIntros() or
                        -- getFullPage(). Normalize them to the size of the thumbnail,
                        -- so we can resize them all later with the same rules.
                        width = math.ceil(width/4.5)
                        height = math.ceil(height/4.5)
                        -- No need to adjust width in src url here, as it will be
                        -- done in addImages() anyway
                        -- src = src:gsub("(.*/)%d+(px-[^/]*)", "%1"..width.."%2")
                        logger.dbg(" size:", width, "x", height, "url:", src)
                        table.insert(images, {
                            source = src,
                            width = width,
                            height = height,
                            filename = filename,
                            caption = caption,
                        })
                    end
                end
            end
        end
    end
    return images
end

-- Function wrapped and plugged to image objects returned by :addImages()
local function image_load_bb_func(image, highres)
    local source, trap_widget
    if not highres then
        -- We use an invisible widget that will resend the dismiss event,
        -- so that image loading in TextBoxWidget is unobtrusive and
        -- interruptible
        trap_widget = false
        source = image.source
    else
        -- We need to let the user know image loading is happening,
        -- with a discreet TrapWidget
        trap_widget = _("Loading high-res image… (tap to cancel)")
        source = image.hi_source
    end
    -- Image may be big or take some time to be resized on wikipedia servers.
    -- As we use dismissableRunInSubprocess and can interrupt this loading,
    -- we can use quite high timeouts
    local timeout, maxtime = 60, 120

    logger.dbg("fetching", source)
    local Trapper = require("ui/trapper")
    -- We use dismissableRunInSubprocess with a simple string return value to
    -- avoid dump()/load() of a long string of image bytes
    local completed, data = Trapper:dismissableRunInSubprocess(function()
        local success, data = getUrlContent(source, timeout, maxtime)
        -- With a simple string value, we're not able to return the failure
        -- reason, so log it here
        if not success then
            logger.warn("failed fetching image from", source, ":", data)
        end
        return success and data or nil
    end, trap_widget, true) -- task_returns_simple_string=true

    local success = data and true or false -- guess success from data

    if not completed then
        logger.dbg("image fetching interrupted by user")
        return true -- let caller know it was interrupted
    end
    if not success then
        -- log it again (on Android, logging from the sub-process seems to not work)
        logger.warn("failed fetching image from", source)
        return
    end
    logger.dbg(" fetched", #data)

    local bb
    if not highres then
        -- For low-res, we should ensure the image we got from wikipedia is
        -- the right size, so it does not overflow our reserved area
        -- (TextBoxWidget may have adjusted image.width and height)
        -- We don't get animated GIF multiple frames to keep TextBoxWidget
        -- simple: they will be available when viewed in highres
        bb = RenderImage:renderImageData(data, #data, false, image.width, image.height)
    else
        -- We provide want_frames=true for highres images, so ImageViewer
        -- can display animated GIF
        -- No need for width and height for high-res
        bb = RenderImage:renderImageData(data, #data, true)
    end
    if not bb then
        logger.warn("failed building image from", source)
        return
    end
    if not highres then
        image.bb = bb
    else
        image.hi_bb = bb
    end
end

function Wikipedia:addImages(page, lang, more_images, image_size_factor, hi_image_size_factor)
    -- List of images, table with keys as expected by TextBoxWidget
    page.images = {}
    -- List of wikipedia images data structures (page.thumbnail and images
    -- extracted from html) made to have the same keys for common processing
    local wimages = {}

    -- We got what Wikipedia scored as the most interesting image for this
    -- page in page.thumbnail, and its filename in page.pageimage, i.e.:
    --   "thumbnail": {
    --       "source": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/45/Reading_on_the_bus_train_or_transit.jpg/37px-Reading_on_the_bus_train_or_transit.jpg",
    --       "width": 37,
    --       "height": 50
    --   },
    --   "pageimage": "Reading_on_the_bus_train_or_transit.jpg"
    --
    local first_image_filename = nil
    if page.thumbnail and page.thumbnail.source then
        page.thumbnail.filename = page.pageimage
        first_image_filename = page.pageimage
        table.insert(wimages, page.thumbnail)
    end
    -- To get more images, we need to make a second request to wikipedia
    if more_images then
        local ok, images_or_err = pcall(Wikipedia.getFullPageImages, Wikipedia, page.title, lang)
        if not ok then
            logger.warn("error getting more images", images_or_err)
        else
            for _, wimage in ipairs(images_or_err) do
                if first_image_filename and wimage.filename == first_image_filename then
                    -- We got the same image as the thumbnail one, but it may have
                    -- a caption: replace the thumbnail one with this one
                    table.remove(wimages, 1)
                    table.insert(wimages, 1, wimage)
                else
                    table.insert(wimages, wimage)
                end
            end
        end
    end

    -- All our wimages now have the keys: source, width, height, filename, caption
    for _, wimage in ipairs(wimages) do
        -- We trust wikipedia, and our x4.5 factor in :getFullPageImages(), for adequate
        -- and homogeneous image sizes. We'll just scale them according to the
        -- provided 'image_size_factor' (which should account for screen size/DPI)
        local width = wimage.width or 100 -- in case we don't get any width or height
        local height = wimage.height or 100
        -- Give a little boost in size to thin images
        if width < height / 2 or height < width / 2 then
            width = width * 1.3
            height = height * 1.3
        end
        width = math.ceil(width * image_size_factor)
        height = math.ceil(height * image_size_factor)
        -- All wikipedia image urls like .../wikipedia/commons/A/BC/<filename>
        -- or .../wikipedia/commons/thumb/A/BC/<filename>/<width>px-<filename>
        -- can be transformed to another url with a requested new_width with the form:
        --   /wikipedia/commons/thumb/A/BC/<filename>/<new_width>px-<filename>
        -- (Additionally, the image format can be changed by appending .png,
        -- .jpg or .gif to it)
        -- The resize is thus done on Wikipedia servers from the source image for
        -- the best quality.
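        -- Illustrative example, reusing the sample thumbnail URL above with a
        -- hypothetical computed width of 330:
        --   .../Reading_on_the_bus_train_or_transit.jpg/37px-Reading_on_the_bus_train_or_transit.jpg
        -- is rewritten by the gsub below into
        --   .../Reading_on_the_bus_train_or_transit.jpg/330px-Reading_on_the_bus_train_or_transit.jpg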
        local source = wimage.source:gsub("(.*/)%d+(px-[^/]*)", "%1"..width.."%2")
        -- We build values for a high resolution version of the image, to be displayed
        -- with ImageViewer (x4 by default)
        local hi_width = width * (hi_image_size_factor or 4)
        local hi_height = height * (hi_image_size_factor or 4)
        local hi_source = wimage.source:gsub("(.*/)%d+(px-[^/]*)", "%1"..hi_width.."%2")
        local title = wimage.filename
        if title then
            title = title:gsub("_", " ")
        end
        local image = {
            -- As expected by TextBoxWidget (with additional source and
            -- hi_source, that will be used by load_bb_func)
            title = title,
            caption = wimage.caption,
            source = source,
            width = width,
            height = height,
            bb = nil, -- will be loaded and built only if needed
            hi_source = hi_source,
            hi_width = hi_width,
            hi_height = hi_height,
            hi_bb = nil, -- will be loaded and built only if needed
        }
        -- If bb or hi_bb is nil, TextBoxWidget will call a method named "load_bb_func"
        image.load_bb_func = function(highres)
            return image_load_bb_func(image, highres)
        end
        table.insert(page.images, image)
    end
end

-- UTF8 of unicode geometrical shapes we can use to replace
-- the "=== title ===" of Wikipedia plaintext pages.
-- These chosen ones are available in most fonts (prettier symbols
-- exist in unicode, but are available in a few fonts only) and
-- have a quite consistent size/weight in all fonts.
local th1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?)
local th2_sym = "\xE2\x96\x89" -- big black square
local th3_sym = "\xC2\xA0\xE2\x97\xA4" -- black upper left triangle (indented, nicer)
local th4_sym = "\xE2\x97\x86" -- black diamond
local th5_sym = "\xE2\x9C\xBF" -- black florette
local th6_sym = "\xE2\x9D\x96" -- black diamond minus white x
-- Others available in most fonts
-- local thX_sym = "\xE2\x9C\x9A" -- heavy greek cross
-- local thX_sym = "\xE2\x97\xA2" -- black lower right triangle
-- local thX_sym = "\xE2\x97\x89" -- fish eye
-- local thX_sym = "\xE2\x96\x97" -- quadrant lower right

-- For optional prettification of the plain text full page
function Wikipedia:prettifyText(text)
    -- We use \a for an additional leading \n that we don't want shortened later
    text = text:gsub("\n= ", "\n\a"..th1_sym.." ") -- 2 empty lines before
    text = text:gsub("\n== ", "\n\a"..th2_sym.." ") -- 2 empty lines before
    text = text:gsub("\n=== ", "\n"..th3_sym.." ")
    text = text:gsub("\n==== ", "\n"..th4_sym.." ")
    text = text:gsub("\n===== ", "\n"..th5_sym.." ")
    text = text:gsub("\n====== ", "\n"..th6_sym.." ")
    text = text:gsub("Modifier ==", " ==") -- fr wikipedia fix for some articles modified by clumsy editors
    text = text:gsub("==$", "==\n") -- for a </hN> at end of text to be matched by next gsub
    text = text:gsub(" ===?\n+", "\n\n") -- </h2> to </h3> : empty line after
    text = text:gsub(" ====+\n+", "\n") -- </h4> to </hN> : single \n, no empty line
    text = text:gsub("\n\n+\xE2\x80\x94", "\n\xE2\x80\x94") -- em dash, used for quote author, make it stick to prev text
    text = text:gsub("\n +\n", "\n") -- trim lines full of only spaces (often seen in math formulas)
    text = text:gsub("^\n*", "") -- trim new lines at start
    text = text:gsub("\n*$", "") -- trim new lines at end
    text = text:gsub("\n\n+", "\n\n") -- shorten multiple new lines
    text = text:gsub("\a", "\n") -- re-add our wished \n
    return text
end
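
--[[ Illustrative before/after for prettifyText() (th2_sym shown here as ■):
--
--   "\n== History ==\nSome paragraph."
-- becomes
--   "\n\n■ History\n\nSome paragraph."
--
-- i.e. "== ... ==" heading markers are replaced by a per-level symbol and the
-- surrounding blank lines are normalized.
--]]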

-- UTF8 of unicode geometrical shapes we'll prepend to wikipedia section headers,
-- to help identifying hierarchy (otherwise, only the small font size differences help).
-- Best if identical to the ones used above for prettifying the full plain text page.
-- These chosen ones are available in most fonts (prettier symbols
-- exist in unicode, but are available in a few fonts only) and
-- have a quite consistent size/weight in all fonts.
local h1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?)
local h2_sym = "\xE2\x96\x89" -- big black square
local h3_sym = "\xE2\x97\xA4" -- black upper left triangle
local h4_sym = "\xE2\x97\x86" -- black diamond
local h5_sym = "\xE2\x9C\xBF" -- black florette
local h6_sym = "\xE2\x9D\x96" -- black diamond minus white x
-- Other ones available in most fonts
-- local hXsym = "\xE2\x9C\x9A" -- heavy greek cross
-- local hXsym = "\xE2\x97\xA2" -- black lower right triangle
-- local hXsym = "\xE2\x97\x89" -- fish eye
-- local hXsym = "\xE2\x96\x97" -- quadrant lower right

local ext_to_mimetype = {
    png = "image/png",
    jpg = "image/jpeg",
    jpeg = "image/jpeg",
    gif = "image/gif",
    svg = "image/svg+xml",
    html = "application/xhtml+xml",
    xhtml = "application/xhtml+xml",
    ncx = "application/x-dtbncx+xml",
    js = "text/javascript",
    css = "text/css",
    otf = "application/opentype",
    ttf = "application/truetype",
    woff = "application/font-woff",
}

-- Create an epub file (with possibly images)
function Wikipedia:createEpub(epub_path, page, lang, with_images)
    -- Use Trapper to display progress and ask questions through the UI.
    -- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
    -- Trapper:info() and Trapper:confirm() will just use logger.
    local UI = require("ui/trapper")

    UI:info(_("Retrieving Wikipedia article…"))
    local ok, phtml = pcall(self.getFullPageHtml, self, page, lang)
    if not ok then
        UI:info(phtml) -- display error in InfoMessage
        -- Sleep a bit to make that error seen
        ffiutil.sleep(2)
        UI:reset()
        return false
    end

    -- We may need to build absolute urls for non-absolute link and image urls
    local wiki_base_url = self:getWikiServer(lang)

    -- Get info from the wikipedia result
    -- (see example at https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection)
    local cancelled = false
    local html = phtml.text["*"] -- html content
    local page_cleaned = page:gsub("_", " ") -- page title
    local page_htmltitle = phtml.displaytitle -- page title with possible <sup> tags
    -- We need to encode plain '&' in those so we can put them in XML/HTML.
    -- We wouldn't need to escape as many as util.htmlEntitiesToUtf8() does, but
    -- we need to, so we don't mess up existing entities ('&nbsp;' may happen) with our
    -- '&amp;' encodes. (We don't escape < or > as these JSON strings may contain HTML tags)
    page_cleaned = util.htmlEntitiesToUtf8(page_cleaned):gsub("&", "&amp;")
    page_htmltitle = util.htmlEntitiesToUtf8(page_htmltitle):gsub("&", "&amp;")
    local sections = phtml.sections -- Wikipedia provided TOC
    local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
    -- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
    -- should it change if content is updated (as now, including the wikipedia revisionId),
    -- or should it stay the same even if revid changes (content of the same book updated)?

    -- We need to find images in the HTML to tell how many there are when asking the user if they should be included
    local images = {}
    local seen_images = {}
    local imagenum = 1
    local cover_imgid = nil -- best candidate for cover among our images
    local processImg = function(img_tag)
        local src = img_tag:match([[src="([^"]*)"]])
        if src == nil or src == "" then
            logger.info("no src found in ", img_tag)
            return nil
        end
        if src:sub(1,2) == "//" then
            src = "https:" .. src -- Wikipedia redirects from http to https, so use https
        elseif src:sub(1,1) == "/" then -- non absolute url
            src = wiki_base_url .. src
        end
        -- Some SVG urls don't have any extension, like:
        --   "/api/rest_v1/media/math/render/svg/154a342afea5a9f13caf1a5bb6acd5c4e69733b6"
        -- Furthermore, as of early 2018, it looks like most (all?) mathematical SVG
        -- obtained from such urls use features not supported by crengine's nanosvg
        -- renderer (so, they are displayed as a blank square).
        -- But we can get a PNG version of it thanks to wikipedia APIs:
        --   https://wikimedia.org/api/rest_v1/#!/Math/get_media_math_render_format_hash
        -- We tweak the url now (and fix the mimetype below), before checking for
        -- duplicates in seen_images.
        -- Think about disabling that when nanosvg gets better!
        if src:find("/math/render/svg/") then
            src = src:gsub("/math/render/svg/", "/math/render/png/")
        end
        local cur_image
        if seen_images[src] then -- already seen
            cur_image = seen_images[src]
        else
            local src_ext = src
            if src_ext:find("?") then -- "/w/extensions/wikihiero/img/hiero_D22.png?0b8f1"
                src_ext = src_ext:match("(.-)%?") -- remove ?blah
            end
            local ext = src_ext:match(".*%.(%S%S%S?%S?%S?)$") -- extensions are only 2 to 5 chars
            if ext == nil or ext == "" then
                if src_ext:find("/math/render/png/") then -- tweaked above
                    ext = "png"
                else
                    -- we won't know what mimetype to use, ignore it
                    logger.info("no file extension found in ", src)
                    return nil
                end
            end
            ext = ext:lower()
            local imgid = string.format("img%05d", imagenum)
            local imgpath = string.format("images/%s.%s", imgid, ext)
            local mimetype = ext_to_mimetype[ext] or ""
            local width = tonumber(img_tag:match([[width="([^"]*)"]]))
            local height = tonumber(img_tag:match([[height="([^"]*)"]]))
            -- Get a higher resolution (2x) image url
            local src2x = nil
            local srcset = img_tag:match([[srcset="([^"]*)"]])
            if srcset then
                srcset = " "..srcset..", " -- for next pattern to possibly match 1st or last item
                src2x = srcset:match([[ (%S+) 2x, ]])
                if src2x then
                    if src2x:sub(1,2) == "//" then
                        src2x = "https:" .. src2x
                    elseif src2x:sub(1,1) == "/" then -- non absolute url
                        src2x = wiki_base_url .. src2x
                    end
                end
            end
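            -- Illustrative (made-up) attribute: srcset="//upload.x/300px-Foo.jpg 1.5x, //upload.x/400px-Foo.jpg 2x"
            -- with the padding added above, the pattern picks "//upload.x/400px-Foo.jpg",
            -- which then gets its "https:" prefix and is kept as src2x.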
            cur_image = {
                imgid = imgid,
                imgpath = imgpath,
                src = src,
                src2x = src2x,
                mimetype = mimetype,
                width = width,
                height = height,
            }
            table.insert(images, cur_image)
            seen_images[src] = cur_image
            -- Use first image of reasonable size (not an icon) and portrait-like as cover-image
            if not cover_imgid and width and width > 50 and height and height > 50 and height > width then
                cover_imgid = imgid
            end
            imagenum = imagenum + 1
        end
        -- crengine will NOT use width and height attributes, but it will use
        -- those found in a style attribute.
        -- If we get src2x images, crengine will scale them down to the 1x image size
        -- (less space wasted by images while reading), but the 2x quality will be
        -- there when image is viewed full screen with ImageViewer widget.
        local style_props = {}
        if cur_image.width then
            table.insert(style_props, string.format("width: %spx", cur_image.width))
        end
        if cur_image.height then
            table.insert(style_props, string.format("height: %spx", cur_image.height))
        end
        local style = table.concat(style_props, "; ")
        return string.format([[<img src="%s" style="%s" alt=""/>]], cur_image.imgpath, style)
    end
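    -- Illustrative replacement produced by processImg() (the numbers are made up):
    --   <img src="images/img00001.jpg" style="width: 220px; height: 165px" alt=""/>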
    html = html:gsub("(<%s*img [^>]*>)", processImg)
    logger.dbg("Images found in html:", images)

    -- See what to do with images
    local include_images = false
    local use_img_2x = false
    if with_images then
        -- If no UI (Trapper:wrap() not called), UI:confirm() will answer true
        if #images > 0 then
            include_images = UI:confirm(T(_("This article contains %1 images.\nWould you like to download and include them in the generated EPUB file?"), #images), _("Don't include"), _("Include"))
            if include_images then
                use_img_2x = UI:confirm(_("Would you like to use slightly higher quality images? This will result in a bigger file size."), _("Standard quality"), _("Higher quality"))
            end
        else
            UI:info(_("This article does not contain any images."))
            ffiutil.sleep(1) -- Let the user see that
        end
    end
    if not include_images then
        -- Remove img tags to avoid little blank squares of missing images
        html = html:gsub("<%s*img [^>]*>", "")
        -- We could remove the whole image container <div class="thumb"...>,
        -- but it's a lot of nested <div> and not easy to do.
        -- So the user will see the image legends and know a bit about
        -- the images they chose not to include.
    end

    UI:info(_("Building EPUB…"))
    -- Open the zip file (with .tmp for now, as crengine may still
    -- have a handle to the final epub_path, and we don't want to
    -- delete a good one if we fail/cancel later)
    local epub_path_tmp = epub_path .. ".tmp"
    local ZipWriter = require("ffi/zipwriter")
    local epub = ZipWriter:new{}
    if not epub:open(epub_path_tmp) then
        return false
    end

    -- We now create and add all the required epub files

    -- ----------------------------------------------------------------
    -- /mimetype : always "application/epub+zip"
    epub:add("mimetype", "application/epub+zip")

    -- ----------------------------------------------------------------
    -- /META-INF/container.xml : always the same content
    epub:add("META-INF/container.xml", [[
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>]])

    -- ----------------------------------------------------------------
    -- OEBPS/content.opf : metadata + list of other files (paths relative to the OEBPS/ directory)
    -- Other possible items in this file that are of no interest to crengine:
    -- In <manifest>:
    --   <item id="cover" href="title.html" media-type="application/xhtml+xml"/>
    --   <item id="cover-image" href="images/cover.png" media-type="image/png"/>
    -- (crengine only uses <meta name="cover" content="cover-image" /> to get the cover image)
    -- In <spine toc="ncx">:
    --   <itemref idref="cover" linear="no"/>
    -- And a <guide> section:
    --   <guide>
    --     <reference href="title.html" type="cover" title="Cover"/>
    --     <reference href="toc.html" type="toc" title="Table of Contents"/>
    --   </guide>
    local koreader_version = "KOReader"
    if lfs.attributes("git-rev", "mode") == "file" then
        koreader_version = "KOReader "..io.open("git-rev", "r"):read()
    end
    local content_opf_parts = {}
    -- head
    local meta_cover = "<!-- no cover image -->"
    if include_images and cover_imgid then
        meta_cover = string.format([[<meta name="cover" content="%s"/>]], cover_imgid)
    end
    table.insert(content_opf_parts, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
xmlns:dc="http://purl.org/dc/elements/1.1/"
unique-identifier="bookid" version="2.0">
<metadata>
<dc:title>%s</dc:title>
<dc:creator>Wikipedia %s</dc:creator>
<dc:identifier id="bookid">%s</dc:identifier>
<dc:language>%s</dc:language>
<dc:publisher>%s</dc:publisher>
%s
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="content" href="content.html" media-type="application/xhtml+xml"/>
<item id="css" href="stylesheet.css" media-type="text/css"/>
]], page_cleaned, lang:upper(), bookid, lang, koreader_version, meta_cover))
    -- images files
    if include_images then
        for inum, img in ipairs(images) do
            table.insert(content_opf_parts, string.format([[ <item id="%s" href="%s" media-type="%s"/>%s]], img.imgid, img.imgpath, img.mimetype, "\n"))
        end
    end
    -- tail
    table.insert(content_opf_parts, [[
</manifest>
<spine toc="ncx">
<itemref idref="content"/>
</spine>
</package>
]])
    epub:add("OEBPS/content.opf", table.concat(content_opf_parts))

    -- ----------------------------------------------------------------
    -- OEBPS/stylesheet.css
    -- crengine will use its own data/epub.css, we just add/fix a few styles
    -- to look more like the Wikipedia web pages (which the user can ignore
    -- with "Embedded Style" off)
    epub:add("OEBPS/stylesheet.css", [[
/* make section headers look left aligned and avoid some page breaks */
h1, h2 {
page-break-before: always;
page-break-after: avoid;
text-align: left;
}
h3, h4, h5, h6 {
page-break-before: auto;
page-break-after: avoid;
text-align: left;
}
/* avoid page breaks around our centered titles on first page */
h1.koreaderwikifrontpage, h5.koreaderwikifrontpage {
page-break-before: avoid;
page-break-inside: avoid;
page-break-after: avoid;
text-align: center;
margin-top: 0em;
}
p.koreaderwikifrontpage {
font-style: italic;
font-size: 90%;
margin: 1em 2em 1em 2em;
}
hr.koreaderwikifrontpage {
margin-left: 20%;
margin-right: 20%;
margin-bottom: 1.2em;
}
/* So many links, make them look like normal text except for underline */
a {
display:inline;
text-decoration: underline;
color: black;
font-weight: normal;
}
/* No underline for links without their href that we removed */
a.newwikinonexistent {
text-decoration: none;
}
/* don't waste left margin for notes and list of pages */
ul, ol {
margin-left: 0em;
}
/* OL in Wikipedia pages may inherit their style-type from a wrapping div,
* ensure they fallback to decimal with inheritance */
body {
list-style-type: decimal;
}
ol.references {
list-style-type: inherit;
}
/* show a box around image thumbnails */
div.thumb {
border: dotted 1px black;
margin: 0.5em 2.5em 0.5em 2.5em;
padding: 0.5em 0.5em 0.2em 0.5em;
padding-top: ]].. (include_images and "0.5em" or "0.15em") .. [[;
text-align: center;
font-size: 90%;
page-break-inside: avoid;
}
/* these are contained in div.thumb, avoid page break in between them */
div.thumbcaption, div.magnify {
page-break-before: avoid;
}
/* show a box around images in a gallery list (li.gallerybox
* is set up a bit differently than div.thumb - we try
* to make them look the same) */
li.gallerybox {
list-style-type: none;
border: dotted 1px black;
margin: 0.5em 2.5em 0.5em 2.5em;
padding: 0.5em 0.5em 0.2em 0.5em;
padding-top: ]].. (include_images and "0.5em" or "0.15em") .. [[;
text-align: center;
font-size: 90%;
}
li.gallerybox div.thumb {
border: solid 1px white;
margin: 0;
padding: 0;
page-break-after: avoid;
}
/* override this one often set in style="" with various values */
li.gallerybox div.thumb div {
margin: 0 !important;
}
/* avoid page break between gallery image and text */
div.gallerytext {
page-break-before: avoid;
page-break-inside: avoid;
}
li.gallerybox div.gallerytext p {
text-align: center;
font-size: 90%;
}
.citation {
font-style: italic;
}
/* make tables full-width - Wikipedia tables are often set as float
* elements and have a fixed width, often in em (22em), which would
* make them quite small with blank space on their right, as we don't
* support float */
table {
width: 100% !important;
}
/* hide some view/edit/discuss short links displayed as "v m d" */
.nv-view, .nv-edit, .nv-talk {
display: none;
}
/* hiding .noprint may discard some interesting links */
]])

    -- ----------------------------------------------------------------
    -- OEBPS/toc.ncx : table of contents
    local toc_ncx_parts = {}
    local depth = 0
    local cur_level = 0
    local np_end = [[</navPoint>]]
    local num = 1
    -- Add our own first section for the first page, with the page name as title
    table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html"/>]], num, num, page_cleaned))
    table.insert(toc_ncx_parts, np_end)
    -- Wikipedia section items seem to be already sorted by index, so no need to sort
    for isec, s in ipairs(sections) do
        num = num + 1
        -- Some chars in headings are converted to html entities in the
        -- wikipedia-generated HTML. We need to do the same in TOC links
        -- for the links to be valid.
        local s_anchor = s.anchor:gsub("&", "&amp;"):gsub('"', "&quot;"):gsub(">", "&gt;"):gsub("<", "&lt;")
        local s_title = string.format("%s %s", s.number, s.line)
        -- Titles may include <i> and other html tags: let's remove them as
        -- our TOC can only display text
        s_title = (s_title:gsub("(%b<>)", ""))
        -- We need to do as for page_htmltitle above. But headings can contain
        -- html entities for < and > that we need to put back as html entities
        s_title = util.htmlEntitiesToUtf8(s_title):gsub("&", "&amp;"):gsub(">", "&gt;"):gsub("<", "&lt;")
        local s_level = s.toclevel
        if s_level > depth then
            depth = s_level -- max depth required in toc.ncx
        end
        if s_level == cur_level then
            table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint
        elseif s_level < cur_level then
            table.insert(toc_ncx_parts, np_end) -- close same-level previous navPoint
            while s_level < cur_level do -- close all in-between navPoint
                table.insert(toc_ncx_parts, np_end)
                cur_level = cur_level - 1
            end
        elseif s_level > cur_level + 1 then
            -- a jump from level N to level N+2 or more ... should not happen
            -- per the epub spec, but we don't know about wikipedia...
            -- so we create missing intermediate navPoints with the same anchor as the current section
            while s_level > cur_level + 1 do
                table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level))
                table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>-</text></navLabel><content src="content.html#%s"/>]], num, num, s_anchor))
                cur_level = cur_level + 1
                num = num + 1
            end
        -- elseif s_level == cur_level + 1 then
        --     sublevel, nothing to close, nothing to add
        end
        cur_level = s_level
        table.insert(toc_ncx_parts, "\n"..(" "):rep(cur_level)) -- indentation, in case a person looks at it
        table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html#%s"/>]], num, num, s_title, s_anchor))
    end
    -- close nested <navPoint>
    while cur_level > 0 do
        table.insert(toc_ncx_parts, np_end)
        cur_level = cur_level - 1
    end
    -- Prepend NCX head
    table.insert(toc_ncx_parts, 1, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="%s"/>
<meta name="dtb:depth" content="%s"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
]], bookid, depth, page_cleaned))
    -- Append NCX tail
    table.insert(toc_ncx_parts, [[
</navMap>
</ncx>
]])
    epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts))

    -- ----------------------------------------------------------------
    -- OEBPS/content.html
    -- Some small fixes to the Wikipedia HTML to make crengine and the user happier

    -- Most images are in a link to the image info page, which is a useless
    -- external link for us, so let's remove this link.
    html = html:gsub("<a[^>]*>%s*(<%s*img [^>]*>)%s*</a>", "%1")

    -- TODO: do something for <li class="gallerybox"...> so they are no longer
    -- a <li> (crengine displays them one above the other) and can be displayed
    -- side by side

    -- For some <div class="thumb tright">, which include nested divs, although
    -- perfectly balanced, crengine seems to miss some closing </div> and we
    -- end up having our image bordered box including the remaining main wiki text.
    -- It looks like this code is supposed to deal with class= containing multiple
    -- class names:
    --   https://github.com/koreader/crengine/commit/0930ec7230e720c148fd6f231d69558832b4d53a
    -- and that it may stumble on some cases.
    -- It's all perfectly fine if we make all these divs have a single class name:
    -- html = html:gsub([[<div class="thumb [^"]*">]], [[<div class="thumb">]])
    --
    -- But we may as well make all class= have a single name to avoid other problems
    -- (no real risk with that, as we don't define any style for wikipedia class names,
    -- except div.thumb that always appears first).
    html = html:gsub([[(<[^>]* class="[^ "]+)%s+[^"]*"]], [[%1"]])
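    -- Illustrative effect of the gsub above: class="thumb tright" becomes class="thumb",
    -- and class="external text" becomes class="external".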

    -- crengine seems to consider unknown tags as 'block' elements, so we may
    -- want to remove or replace those that should be considered 'inline' elements
    html = html:gsub("</?time[^>]*>", "")

    -- crengine does not support the <math> family of tags for displaying formulas,
    -- which results in lots of space taken by each individual character in the formula,
    -- each on a single line...
    -- Also, usually, these <math> tags are followed by an <img> tag pointing to an
    -- SVG version of the formula, whose url we took care earlier to change to
    -- point to a PNG version of the formula (which is still not perfect, as it does
    -- not adjust to the current html font size, but it is at least readable).
    -- So, remove the whole <math>...</math> content
    html = html:gsub([[<math xmlns="http://www.w3.org/1998/Math/MathML".-</math>]], "")

    -- Fix internal wikipedia links with a full server url (including lang) so
    -- ReaderLink can notice them and deal with them with a LookupWikipedia event.
    -- We need to remove any "?somekey=somevalue" from the url (a real "?" part of the
    -- wiki_page word would be encoded as %3F, but ReaderLink would get it decoded and
    -- would not be able to distinguish them).
    -- Do that first (it needs to be done first) for full links to other language wikipedias
    local cleanOtherLangWikiPageTitle = function(wiki_lang, wiki_page)
        wiki_page = wiki_page:gsub("%?.*", "")
        return string.format([[href="https://%s.wikipedia.org/wiki/%s"]], wiki_lang, wiki_page)
    end
    html = html:gsub([[href="https?://([^%.]+).wikipedia.org/wiki/([^"]*)"]], cleanOtherLangWikiPageTitle)
    -- Now, do it for same-wikipedia short urls
    local cleanWikiPageTitle = function(wiki_page)
        wiki_page = wiki_page:gsub("%?.*", "")
        return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
    end
    html = html:gsub([[href="/wiki/([^"]*)"]], cleanWikiPageTitle)
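    -- Illustrative: on the "en" wikipedia, href="/wiki/E-book" becomes
    -- href="https://en.wikipedia.org/wiki/E-book" (and any trailing "?key=value" is dropped).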

    -- Remove href from links to nonexistent wiki pages so they are not clickable:
    --   <a href="/w/index.php?title=PageTitle&action=edit&redlink=1" class="new" title="PageTitle">PageTitle____on</a>
    -- (removal of the href="" will make them non-clickable)
    html = html:gsub([[<a[^>]* class="new"[^>]*>]], [[<a class="newwikinonexistent">]])

    -- Fix some other protocol-less links to wikipedia (href="//fr.wikipedia.org/w/index.php..)
    html = html:gsub([[href="//]], [[href="https://]])

    -- crengine does not return a link if there are multiple class names in <a> (<a class="external text" href="">)
    -- it would be no problem as we can't follow them, but when the user taps
    -- on one, the tap is propagated to other widgets and page changes happen...
    -- html = html:gsub([[<a rel="nofollow" class="external text"]], [[<a rel="nofollow" class="externaltext"]])
    -- html = html:gsub([[<a class="external text"]], [[<a class="externaltext"]])
    -- Solved by our multiple class names suppression above

    -- Avoid a link being clickable before <a> (if it starts a line) or after </a> (if it
    -- ends a line or a block) by wrapping it with U+200B ZERO WIDTH SPACE, which will
    -- make the DOM tree walking code that looks for a link stop at it.
    -- html = html:gsub("(<[aA])", "\xE2\x80\x8B%1")
    -- html = html:gsub("(</[aA]>)", "%1\xE2\x80\x8B")
    -- Fixed in crengine lvtinydom.

    if self.wiki_prettify then
        -- Prepend some symbols to section titles for a better visual feeling of hierarchy
        html = html:gsub("<h1>", "<h1> "..h1_sym.." ")
        html = html:gsub("<h2>", "<h2> "..h2_sym.." ")
        html = html:gsub("<h3>", "<h3> "..h3_sym.." ")
        html = html:gsub("<h4>", "<h4> "..h4_sym.." ")
        html = html:gsub("<h5>", "<h5> "..h5_sym.." ")
        html = html:gsub("<h6>", "<h6> "..h6_sym.." ")
    end

    -- Note: in all the gsub patterns above, we used lowercase for tags and attributes
    -- because that's how they are in wikipedia HTML and it keeps the patterns simple.
    -- If one day this changes, they'll have to be replaced with href => [Hh][Rr][Ee][Ff] ...

    -- We can finally build the final HTML with some header of our own
    local saved_on = T(_("Saved on %1"), os.date("%b %d, %Y %H:%M:%S"))
    local online_version_htmllink = string.format([[<a href="%s/wiki/%s">%s</a>]], wiki_base_url, page:gsub(" ", "_"), _("online version"))
    local see_online_version = T(_("See %1 for up-to-date content"), online_version_htmllink)
    epub:add("OEBPS/content.html", string.format([[
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
<link type="text/css" rel="stylesheet" href="stylesheet.css"/>
</head>
<body>
<h1 class="koreaderwikifrontpage">%s</h1>
<h5 class="koreaderwikifrontpage">Wikipedia %s</h5>
<p class="koreaderwikifrontpage">%s<br/>%s</p>
<hr class="koreaderwikifrontpage"/>
%s
</body>
</html>
]], page_cleaned, page_htmltitle, lang:upper(), saved_on, see_online_version, html))

    -- Force a GC to free the memory we used till now (the second call may
    -- help reclaim more memory).
    collectgarbage()
    collectgarbage()

    -- ----------------------------------------------------------------
    -- OEBPS/images/*
    if include_images then
        local nb_images = #images
        for inum, img in ipairs(images) do
            -- The process can be interrupted at this point between each image download
            -- by tapping while the InfoMessage is displayed.
            -- We use the fast_refresh option from image #2 for a quicker download
            local go_on = UI:info(T(_("Retrieving image %1 / %2 …"), inum, nb_images), inum >= 2)
            if not go_on then
                cancelled = true
                break
            end
            local src = img.src
            if use_img_2x and img.src2x then
                src = img.src2x
            end
            logger.dbg("Getting img ", src)
            local success, content = getUrlContent(src)
            -- success, content = getUrlContent(src..".unexistant") -- to simulate failure
            if success then
                logger.dbg("success, size:", #content)
            else
                logger.info("failed fetching:", src)
            end
            if success then
                -- Images do not need to be compressed, so spare some cpu cycles
                local no_compression = true
                if img.mimetype == "image/svg+xml" then -- except for SVG images (which are XML text)
                    no_compression = false
                end
                epub:add("OEBPS/"..img.imgpath, content, no_compression)
            else
                go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue"))
                if not go_on then
                    cancelled = true
                    break
                end
            end
        end
    end

    -- Done with adding files
    if cancelled then
        if UI:confirm(_("Download did not complete.\nDo you want to create an EPUB with the already downloaded images?"), _("Don't create"), _("Create")) then
            cancelled = false
        end
    end
    if cancelled then
        UI:info(_("Canceled. Cleaning up…"))
    else
        UI:info(_("Packing EPUB…"))
    end
    epub:close()
    -- This was nearly a no-op, so sleep a bit to make that progress step seen
    ffiutil.usleep(300000)
    UI:reset() -- close last InfoMessage

    if cancelled then
        -- Build was cancelled, remove the half-created .epub
        if lfs.attributes(epub_path_tmp, "mode") == "file" then
            os.remove(epub_path_tmp)
        end
        return false
    end

    -- Finally move the .tmp to the final file
    os.rename(epub_path_tmp, epub_path)
    logger.info("successfully created:", epub_path)

    -- Force a GC to free the memory we used (the second call may help
    -- reclaim more memory).
    collectgarbage()
    collectgarbage()
    return true
end

-- Wrap Wikipedia:createEpub() with UI progress info, provided
-- by the Trapper module.
function Wikipedia:createEpubWithUI(epub_path, page, lang, result_callback)
    -- To do any UI interaction while building the EPUB, we need
    -- to use a coroutine, so that our code can be suspended while waiting
    -- for user interaction, and resumed by UI widget callbacks.
    -- All this is hidden and done by Trapper with a simple API.
    local Trapper = require("ui/trapper")
    Trapper:wrap(function()
        Trapper:setPausedText("Download paused")
        -- If Wikipedia:createEpub() errors, the coroutine (used by
        -- Trapper) would just abort (no reader crash, no error logged).
        -- So we use pcall to catch any error, log it, and report
        -- the failure via result_callback.
        local ok, success = pcall(self.createEpub, self, epub_path, page, lang, true)
        if ok and success then
            result_callback(true)
        else
            Trapper:reset() -- close any last widget not cleaned up on error
            logger.warn("Wikipedia.createEpub pcall:", ok, success)
            result_callback(false)
        end
    end)
end
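
--[[ Illustrative caller sketch (the path is made up; the real callers live in the
-- Wikipedia lookup UI code):
--
--   Wikipedia:createEpubWithUI("/tmp/E-book.en.epub", "E-book", "en", function(success)
--       if success then
--           logger.dbg("EPUB created")
--       else
--           logger.warn("EPUB creation failed or was cancelled")
--       end
--   end)
--]]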

return Wikipedia