local JSON = require("json")
local lfs = require("libs/libkoreader-lfs") -- assumed KOReader require path; lfs.attributes() is used below
local logger = require("logger")
local util = require("ffi/util")
local _ = require("gettext")
local T = require("ffi/util").template

--[[
-- Query wikipedia using Wikimedia Web API.
-- https://en.wikipedia.org/w/api.php?format=jsonfm&action=query&generator=search&gsrnamespace=0&gsrsearch=ereader&gsrlimit=10&prop=extracts&exintro&explaintext&exlimit=max
-- https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=jsonfm&explaintext=&redirects=&titles=E-reader
--
-- To get parsed HTML:
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection
-- https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
--]]

local Wikipedia = {
    wiki_server = "https://%s.wikipedia.org",
    wiki_path = "/w/api.php",
    wiki_params = {
        action = "query",
        prop = "extracts",
        format = "json",
        -- exintro = nil, -- get more than only the intro
        explaintext = "",
        redirects = "",
        -- title = nil, -- text to lookup, will be added below
    },
    default_lang = "en",
    -- Search query for better results
    -- see https://www.mediawiki.org/wiki/API:Main_page
    wiki_search_params = {
        action = "query",
        generator = "search",
        gsrnamespace = "0",
        -- gsrsearch = nil, -- text to lookup, will be added below
        gsrlimit = 20, -- max number of results to get
        exlimit = "max",
        prop = "extracts|info", -- 'extracts' to get text, 'info' to get full page length
        format = "json",
        explaintext = "",
        exintro = "",
        -- We have to use 'exintro=' to get extracts for ALL results
        -- (otherwise, we get the full text for only the first result,
        -- and no text at all for the others).
    },
    wiki_phtml_params = {
        action = "parse",
        format = "json",
        -- we only need the following information
        prop = "text|sections|displaytitle|revid",
        -- page = nil, -- text to lookup, will be added below
        -- disabletoc = "", -- if we want to remove the TOC IN the html
        disablelimitreport = "",
        disableeditsection = "",
    },
    -- allow for disabling the prettifying of the full page text
    wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"),
}

function Wikipedia:getWikiServer(lang)
    return string.format(self.wiki_server, lang or self.default_lang)
end
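
-- For instance, with the defaults above:
--   Wikipedia:getWikiServer()     --> "https://en.wikipedia.org"
--   Wikipedia:getWikiServer("fr") --> "https://fr.wikipedia.org"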

-- Possible values for the page_type parameter to loadPage()
local WIKIPEDIA_INTRO = 1
local WIKIPEDIA_FULL = 2
local WIKIPEDIA_PHTML = 3

--[[
-- Return a decoded JSON table from Wikipedia.
--]]
function Wikipedia:loadPage(text, lang, page_type, plain)
    local socket = require('socket')
    local url = require('socket.url')
    local http = require('socket.http')
    local https = require('ssl.https')
    local ltn12 = require('ltn12')

    local request, sink = {}, {}
    local query = ""

    local parsed = url.parse(self:getWikiServer(lang))
    parsed.path = self.wiki_path
    if page_type == WIKIPEDIA_INTRO then -- search query
        self.wiki_search_params.explaintext = plain and "" or nil
        for k, v in pairs(self.wiki_search_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "gsrsearch=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_FULL then -- full page content
        self.wiki_params.explaintext = plain and "" or nil
        for k, v in pairs(self.wiki_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "titles=" .. url.escape(text)
    elseif page_type == WIKIPEDIA_PHTML then -- parsed html page content
        for k, v in pairs(self.wiki_phtml_params) do
            query = string.format("%s%s=%s&", query, k, v)
        end
        parsed.query = query .. "page=" .. url.escape(text)
    else
        return
    end
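
    -- As an illustration, for WIKIPEDIA_FULL and text="E-book", parsed.query
    -- ends up something like (pairs() iteration order is undefined, so the
    -- actual parameter order may vary):
    --   action=query&prop=extracts&format=json&explaintext=&redirects=&titles=E-book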

    -- HTTP request
    request['url'] = url.build(parsed)
    request['method'] = 'GET'
    request['sink'] = ltn12.sink.table(sink)
    http.TIMEOUT, https.TIMEOUT = 10, 10
    local httpRequest = parsed.scheme == 'http' and http.request or https.request
    -- first argument returned by skip is the code
    local _, headers, status = socket.skip(1, httpRequest(request))

    -- raise an error message when the network is unavailable
    if headers == nil then
        error("Network is unreachable")
    end

    if status ~= "HTTP/1.1 200 OK" then
        logger.warn("HTTP status not okay:", status)
        return
    end

    local content = table.concat(sink)
    if content ~= "" and string.sub(content, 1, 1) == "{" then
        local ok, result = pcall(JSON.decode, content)
        if ok and result then
            logger.dbg("wiki result", result)
            return result
        else
            logger.warn("wiki error:", result)
        end
    else
        logger.warn("not JSON from wiki response:", content)
    end
end

-- search wikipedia and get intros for results
function Wikipedia:wikintro(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_INTRO, true)
    if result then
        local query = result.query
        if query then
            return query.pages
        end
    end
end
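
-- Note: the returned 'pages' table is keyed by pageid; each entry carries,
-- among other fields, a .title and a .extract with the plain text intro.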

-- get full content of a wiki page
function Wikipedia:wikifull(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_FULL, true)
    if result then
        local query = result.query
        if query then
            if self.wiki_prettify then
                -- Prettification of the plain text full page
                for pageid, page in pairs(query.pages) do
                    if page.extract then
                        page.extract = self:prettifyText(page.extract)
                    end
                end
            end
            return query.pages
        end
    end
end

-- get parsed html content and other infos of a wiki page
function Wikipedia:wikiphtml(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_PHTML, true)
    if result and result.parse then
        return result.parse
    end
    -- Raise a readable error: loadPage() may have returned nil (indexing it
    -- directly would throw a bare "index a nil value" error), or the API may
    -- have returned an error message
    if result and result.error and result.error.info then
        error(result.error.info)
    end
    error("Failed to get Wikipedia page")
end

-- UTF8 of unicode geometrical shapes we can use to replace
-- the "=== title ===" of wikipedia plaintext pages.
-- These chosen ones are available in most fonts (prettier symbols
-- exist in unicode, but are available in a few fonts only) and
-- have a quite consistent size/weight in all fonts.
local th1_sym = "\xE2\x96\x88"         -- full block (big black rectangle) (never met, only for web page title?)
local th2_sym = "\xE2\x96\x89"         -- big black square
local th3_sym = "\xC2\xA0\xE2\x97\x86" -- black diamond (indented, nicer)
local th4_sym = "\xE2\x97\xA4"         -- black upper left triangle
local th5_sym = "\xE2\x9C\xBF"         -- black florette
local th6_sym = "\xE2\x9D\x96"         -- black diamond minus white x
-- Others available in most fonts:
-- local thX_sym = "\xE2\x9C\x9A" -- heavy greek cross
-- local thX_sym = "\xE2\x97\xA2" -- black lower right triangle
-- local thX_sym = "\xE2\x97\x89" -- fish eye
-- local thX_sym = "\xE2\x96\x97" -- quadrant lower right

-- For optional prettification of the plain text full page
function Wikipedia:prettifyText(text)
    -- We use \a for an additional leading \n that we don't want shortened later
    text = text:gsub("\n= ",      "\n\a"..th1_sym.." ") -- 2 empty lines before
    text = text:gsub("\n== ",     "\n\a"..th2_sym.." ") -- 2 empty lines before
    text = text:gsub("\n=== ",    "\n"..th3_sym.." ")
    text = text:gsub("\n==== ",   "\n"..th4_sym.." ")
    text = text:gsub("\n===== ",  "\n"..th5_sym.." ")
    text = text:gsub("\n====== ", "\n"..th6_sym.." ")
    text = text:gsub("Modifier ==", "==") -- fr wikipedia fix for some articles modified by clumsy editors
    text = text:gsub("==$", "==\n")       -- for a </hN> at end of text to be matched by the next gsub
    text = text:gsub(" ===?\n+", "\n\n")  -- </h2> and </h3> : empty line after
    text = text:gsub(" ====+\n+", "\n")   -- </h4> to </hN> : single \n, no empty line
    text = text:gsub("\n\n+\xE2\x80\x94", "\n\xE2\x80\x94") -- em dash, used for quote author, make it stick to the previous text
    text = text:gsub("\n +\n", "\n")  -- trim lines made only of spaces (often seen in math formulas)
    text = text:gsub("^\n*", "")      -- trim new lines at start
    text = text:gsub("\n*$", "")      -- trim new lines at end
    text = text:gsub("\n\n+", "\n\n") -- shorten multiple new lines
    text = text:gsub("\a", "\n")      -- re-add our wished \n
    return text
end
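
-- For instance, a plain text extract containing "\n== History ==\n" ends up,
-- after the rules above, as th2_sym.." History" preceded by two empty lines
-- and followed by one (a sketch of the intent, not an exhaustive trace).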

local function getUrlContent(url, timeout)
    local socket = require('socket')
    local ltn12 = require('ltn12')
    local requester
    if url:sub(1, 7) == "http://" then
        requester = require('socket.http')
    elseif url:sub(1, 8) == "https://" then
        requester = require('ssl.https')
    else
        return false, "Unsupported protocol"
    end
    requester.TIMEOUT = timeout or 10
    local request = {}
    local sink = {}
    request['url'] = url
    request['method'] = 'GET'
    request['sink'] = ltn12.sink.table(sink)
    -- first argument returned by skip is the code
    local _, headers, status = socket.skip(1, requester.request(request))
    if headers == nil then
        logger.warn("No HTTP headers")
        return false, "Network unavailable"
    end
    if status ~= "HTTP/1.1 200 OK" then
        logger.warn("HTTP status not okay:", status)
        return false, "Network unavailable"
    end
    return true, table.concat(sink)
end
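
-- A minimal usage sketch (hypothetical url and handler): check the success
-- flag first, as the second return value is the body on success but an error
-- message on failure:
--   local success, content = getUrlContent("https://example.org/img.png")
--   if success then handleBody(content) end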

-- UTF8 of unicode geometrical shapes we'll prepend to wikipedia section headers,
-- to help identify the hierarchy (otherwise, only the small font size
-- differences help).
-- Best if identical to the ones used above for prettifying the full plain text page.
-- These chosen ones are available in most fonts (prettier symbols
-- exist in unicode, but are available in a few fonts only) and
-- have a quite consistent size/weight in all fonts.
local h1_sym = "\xE2\x96\x88" -- full block (big black rectangle) (never met, only for web page title?)
local h2_sym = "\xE2\x96\x89" -- big black square
local h3_sym = "\xE2\x97\x86" -- black diamond
local h4_sym = "\xE2\x97\xA4" -- black upper left triangle
local h5_sym = "\xE2\x9C\xBF" -- black florette
local h6_sym = "\xE2\x9D\x96" -- black diamond minus white x
-- Others available in most fonts:
-- local hX_sym = "\xE2\x9C\x9A" -- heavy greek cross
-- local hX_sym = "\xE2\x97\xA2" -- black lower right triangle
-- local hX_sym = "\xE2\x97\x89" -- fish eye
-- local hX_sym = "\xE2\x96\x97" -- quadrant lower right

local ext_to_mimetype = {
    png = "image/png",
    jpg = "image/jpeg",
    jpeg = "image/jpeg",
    gif = "image/gif",
    svg = "image/svg+xml",
    html = "application/xhtml+xml",
    xhtml = "application/xhtml+xml",
    ncx = "application/x-dtbncx+xml",
    js = "text/javascript",
    css = "text/css",
    otf = "application/opentype",
    ttf = "application/truetype",
    woff = "application/font-woff",
}

-- Create an epub file (possibly with images)
function Wikipedia:createEpub(epub_path, page, lang, with_images)
    -- Use Trapper to display progress and ask questions through the UI.
    -- We need to have been Trapper.wrap()'ed for the UI to be used, otherwise
    -- Trapper:info() and Trapper:confirm() will just use logger.
    local UI = require("ui/trapper")

    UI:info(_("Fetching Wikipedia page…"))
    local ok, phtml = pcall(self.wikiphtml, self, page, lang)
    if not ok then
        UI:info(phtml) -- display the error in an InfoMessage
        -- Sleep a bit to make sure that error can be seen
        util.sleep(2)
        UI:reset()
        return false
    end

    -- We may need to build absolute urls for non-absolute link and image urls
    local wiki_base_url = self:getWikiServer(lang)

    -- Get infos from the wikipedia result
    -- (see example at https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection)
    local cancelled = false
    local html = phtml.text["*"] -- html content
    local page_cleaned = page:gsub("_", " ") -- page title
    local page_htmltitle = phtml.displaytitle -- page title with possible <sup> tags
    local sections = phtml.sections -- Wikipedia-provided TOC
    local bookid = string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
    -- Not sure whether this bookid will ever be used by indexing software/calibre,
    -- but if it is: should it change when the content is updated (as now, since it
    -- includes the wikipedia revision id), or should it stay the same even when
    -- revid changes (same book, updated content)?

    -- We need to find the images in the HTML, to tell the user how many there
    -- are when asking whether they should be included
    local images = {}
    local seen_images = {}
    local imagenum = 1
    local cover_imgid = "" -- best candidate for cover among our images
    local processImg = function(img_tag)
        local src = img_tag:match([[src="([^"]*)"]])
        if src == nil or src == "" then
            logger.info("no src found in", img_tag)
            return nil
        end
        if src:sub(1, 2) == "//" then
            src = "https:" .. src -- Wikipedia redirects from http to https, so use https
        elseif src:sub(1, 1) == "/" then -- non-absolute url
            src = wiki_base_url .. src
        end
        local cur_image
        if seen_images[src] then -- already seen
            cur_image = seen_images[src]
        else
            local ext = src:match(".*%.(%S+)")
            if ext == nil or ext == "" then -- we won't know what mimetype to use, ignore it
                logger.info("no file extension found in", src)
                return nil
            end
            ext = ext:lower()
            local imgid = string.format("img%05d", imagenum)
            local imgpath = string.format("images/%s.%s", imgid, ext)
            local mimetype = ext_to_mimetype[ext] or ""
            local width = tonumber(img_tag:match([[width="([^"]*)"]]))
            local height = tonumber(img_tag:match([[height="([^"]*)"]]))
            -- Get the higher resolution (2x) image url
            local src2x = nil
            local srcset = img_tag:match([[srcset="([^"]*)"]])
            if srcset then
                srcset = " " .. srcset .. ", " -- so the next pattern can match the 1st or last item
                src2x = srcset:match([[ (%S+) 2x, ]])
                if src2x then
                    if src2x:sub(1, 2) == "//" then
                        src2x = "https:" .. src2x
                    elseif src2x:sub(1, 1) == "/" then -- non-absolute url
                        src2x = wiki_base_url .. src2x
                    end
                end
            end
            cur_image = {
                imgid = imgid,
                imgpath = imgpath,
                src = src,
                src2x = src2x,
                mimetype = mimetype,
                width = width,
                height = height,
            }
            table.insert(images, cur_image)
            seen_images[src] = cur_image
            -- Use the first image of reasonable size (not an icon) and portrait-like as the cover image
            if cover_imgid == "" and width and width > 50 and height and height > 50 and height > width then
                cover_imgid = imgid
            end
            imagenum = imagenum + 1
        end
        -- crengine will NOT use the width and height attributes, but it will use
        -- those found in a style attribute.
        -- If we get src2x images, crengine will scale them down to the 1x image size
        -- (so images waste less space while reading), but the 2x quality will be
        -- there when the image is viewed full screen with the ImageViewer widget.
        local style_props = {}
        if cur_image.width then
            table.insert(style_props, string.format("width: %spx", cur_image.width))
        end
        if cur_image.height then
            table.insert(style_props, string.format("height: %spx", cur_image.height))
        end
        local style = table.concat(style_props, "; ")
        return string.format([[<img src="%s" style="%s" alt=""/>]], cur_image.imgpath, style)
    end
    html = html:gsub("(<%s*img [^>]*>)", processImg)
    logger.dbg("Images found in html:", images)
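
    -- As an illustration (hypothetical tag), an input like:
    --   <img src="//upload.wikimedia.org/foo/Bar.jpg" width="220" height="330">
    -- is recorded in 'images' and rewritten to:
    --   <img src="images/img00001.jpg" style="width: 220px; height: 330px" alt=""/>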

    -- See what to do with images
    local include_images = false
    local use_img_2x = false
    if with_images then
        -- If there is no UI (Trapper:wrap() not called), UI:confirm() will answer true
        if #images > 0 then
            include_images = UI:confirm(T(_("The page contains %1 images.\nWould you like to download and include them in the generated EPUB file?"), #images), _("Don't include"), _("Include"))
            if include_images then
                use_img_2x = UI:confirm(_("Would you like to use slightly higher quality images? This will result in a bigger file size."), _("Standard quality"), _("Higher quality"))
            end
        else
            UI:info(_("The page does not contain any images."))
            util.sleep(1) -- Let the user see that
        end
    end
    if not include_images then
        -- Remove img tags to avoid little blank squares for missing images
        html = html:gsub("<%s*img [^>]*>", "")
        -- We could remove the whole image container <div class="thumb"...>,
        -- but it's a lot of nested <div>s and not easy to do.
        -- So the user will see the image legends and know a bit about
        -- the images they chose not to get.
    end

    UI:info(_("Building EPUB…"))
    -- Open the zip file (as .tmp for now, as crengine may still
    -- have a handle to the final epub_path, and we don't want to
    -- delete a good one if we fail/cancel later)
    local epub_path_tmp = epub_path .. ".tmp"
    local ZipWriter = require("ffi/zipwriter")
    local epub = ZipWriter:new{}
    if not epub:open(epub_path_tmp) then
        return false
    end

    -- We now create and add all the required epub files

    -- ----------------------------------------------------------------
    -- /mimetype : always "application/epub+zip"
    epub:add("mimetype", "application/epub+zip")

    -- ----------------------------------------------------------------
    -- /META-INF/container.xml : always the same content
    epub:add("META-INF/container.xml", [[
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>]])

    -- ----------------------------------------------------------------
    -- OEBPS/content.opf : metadata + list of other files (paths relative to the OEBPS/ directory)
    -- Other possible items in this file that are of no interest to crengine:
    --   In <manifest>:
    --     <item id="cover" href="title.html" media-type="application/xhtml+xml"/>
    --     <item id="cover-image" href="images/cover.png" media-type="image/png"/>
    --   (crengine only uses <meta name="cover" content="cover-image"/> to get the cover image)
    --   In <spine toc="ncx">:
    --     <itemref idref="cover" linear="no"/>
    --   And a <guide> section:
    --     <guide>
    --       <reference href="title.html" type="cover" title="Cover"/>
    --       <reference href="toc.html" type="toc" title="Table of Contents"/>
    --     </guide>
    local koreader_version = "KOReader"
    if lfs.attributes("git-rev", "mode") == "file" then
        koreader_version = "KOReader " .. io.open("git-rev", "r"):read()
    end
    local content_opf_parts = {}
    -- head
    table.insert(content_opf_parts, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
        xmlns:dc="http://purl.org/dc/elements/1.1/"
        unique-identifier="bookid" version="2.0">
  <metadata>
    <dc:title>%s</dc:title>
    <dc:creator>Wikipedia %s</dc:creator>
    <dc:identifier id="bookid">%s</dc:identifier>
    <dc:language>%s</dc:language>
    <dc:publisher>%s</dc:publisher>
    <meta name="cover" content="%s"/>
  </metadata>
  <manifest>
    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
    <item id="content" href="content.html" media-type="application/xhtml+xml"/>
    <item id="css" href="stylesheet.css" media-type="text/css"/>
]], page_cleaned, lang:upper(), bookid, lang, koreader_version, cover_imgid))
    -- images files
    if include_images then
        for inum, img in ipairs(images) do
            table.insert(content_opf_parts, string.format([[    <item id="%s" href="%s" media-type="%s"/>%s]], img.imgid, img.imgpath, img.mimetype, "\n"))
        end
    end
    -- tail
    table.insert(content_opf_parts, [[
  </manifest>
  <spine toc="ncx">
    <itemref idref="content"/>
  </spine>
</package>
]])
    epub:add("OEBPS/content.opf", table.concat(content_opf_parts))

    -- ----------------------------------------------------------------
    -- OEBPS/stylesheet.css
    -- crengine will use its own data/epub.css; we just add/fix a few styles
    -- to look more like the wikipedia web pages (the user can ignore them
    -- by turning "Embedded Style" off)
    epub:add("OEBPS/stylesheet.css", [[
/* make section headers look left-aligned and avoid some page breaks */
h1, h2 {
    text-align: left;
}
h3, h4, h5, h6, h7 {
    page-break-before: avoid;
    page-break-after: avoid;
    text-align: left;
}
/* avoid page breaks around our centered titles on the first page */
h1.koreaderwikifrontpage, h5.koreaderwikifrontpage {
    page-break-before: avoid;
    page-break-inside: avoid;
    page-break-after: avoid;
    text-align: center;
    margin-top: 0em;
}
p.koreaderwikifrontpage {
    font-style: italic;
    font-size: 90%;
    margin-left: 2em;
    margin-right: 2em;
    margin-top: 1em;
    margin-bottom: 1em;
}
hr.koreaderwikifrontpage {
    margin-left: 20%;
    margin-right: 20%;
    margin-bottom: 1.2em;
}
/* so many links, make them look like normal text except for the underline */
a {
    display: inline;
    text-decoration: underline;
    color: black;
    font-weight: normal;
}
/* no underline for links whose href we removed */
a.newwikinonexistent {
    text-decoration: none;
}
/* show a box around image thumbnails */
div.thumb {
    width: 80%;
    border: dotted 1px black;
    margin-top: 0.5em;
    margin-bottom: 0.5em;
    margin-left: 2.5em;
    margin-right: 2.5em;
    padding-top: ]] .. (include_images and "0.5em" or "0.15em") .. [[;
    padding-bottom: 0.2em;
    padding-left: 0.5em;
    padding-right: 0.5em;
    text-align: center;
    font-size: 90%;
}
/* don't waste left margin for notes and lists of pages */
ul, ol {
    margin-left: 0em;
}
/* helps crengine to not display these as block elements */
time, abbr, sup {
    display: inline;
}
]])

    -- ----------------------------------------------------------------
    -- OEBPS/toc.ncx : table of contents
    local toc_ncx_parts = {}
    local depth = 0
    local cur_level = 0
    local np_end = [[</navPoint>]]
    local num = 1
    -- Add our own first section for the first page, with the page name as title
    table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html"/>]], num, num, page_cleaned))
    table.insert(toc_ncx_parts, np_end)
    -- Wikipedia sections items seem to be already sorted by index, so no need to sort them
    for isec, s in ipairs(sections) do
        num = num + 1
        local s_anchor = s.anchor
        local s_title = string.format("%s %s", s.number, s.line)
        s_title = (s_title:gsub("(%b<>)", "")) -- titles may include <i> and other html tags
        local s_level = s.toclevel
        if s_level > depth then
            depth = s_level -- max depth required in toc.ncx
        end
        if s_level == cur_level then
            table.insert(toc_ncx_parts, np_end) -- close the previous same-level navPoint
        elseif s_level < cur_level then
            table.insert(toc_ncx_parts, np_end) -- close the previous same-level navPoint
            while s_level < cur_level do -- close all in-between navPoints
                table.insert(toc_ncx_parts, np_end)
                cur_level = cur_level - 1
            end
        elseif s_level > cur_level + 1 then
            -- A jump from level N to level N+2 or more... should not happen
            -- per the epub spec, but we don't know about wikipedia...
            -- so we create the missing intermediate navPoints with the same
            -- anchor as the current section
            while s_level > cur_level + 1 do
                table.insert(toc_ncx_parts, "\n" .. (" "):rep(cur_level))
                table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>-</text></navLabel><content src="content.html#%s"/>]], num, num, s_anchor))
                cur_level = cur_level + 1
                num = num + 1
            end
        -- elseif s_level == cur_level + 1 then
        --     sublevel: nothing to close, nothing to add
        end
        cur_level = s_level
        table.insert(toc_ncx_parts, "\n" .. (" "):rep(cur_level)) -- indentation, in case a person looks at it
        table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html#%s"/>]], num, num, s_title, s_anchor))
    end
    -- Close the nested <navPoint>s
    while cur_level > 0 do
        table.insert(toc_ncx_parts, np_end)
        cur_level = cur_level - 1
    end
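    -- As an illustration (hypothetical sections), toclevels 1, 2, 2, 1 produce,
    -- ignoring the leading page-name navPoint and the indentation inserts:
    --   <navPoint A><navPoint B></navPoint><navPoint C></navPoint></navPoint>
    --   <navPoint D></navPoint>
    -- i.e. B and C nested inside A, and D a sibling of A closed by the loop above.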
    -- Prepend the NCX head
    table.insert(toc_ncx_parts, 1, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
  <head>
    <meta name="dtb:uid" content="%s"/>
    <meta name="dtb:depth" content="%s"/>
    <meta name="dtb:totalPageCount" content="0"/>
    <meta name="dtb:maxPageNumber" content="0"/>
  </head>
  <docTitle>
    <text>%s</text>
  </docTitle>
  <navMap>
]], bookid, depth, page_cleaned))
    -- Append the NCX tail
    table.insert(toc_ncx_parts, [[
  </navMap>
</ncx>
]])
    epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts))

    -- ----------------------------------------------------------------
    -- OEBPS/content.html
    -- Some small fixes to the Wikipedia HTML to make crengine and the user happier

    -- Most images are in a link to the image info page, which is a useless
    -- external link for us, so let's remove that link
    html = html:gsub("<a[^>]*>%s*(<%s*img [^>]*>)%s*</a>", "%1")

    -- For some <div class="thumb tright">, which include nested divs, although
    -- perfectly balanced, crengine seems to miss some closing </div> and we
    -- end up having our image bordered box include the remaining main wiki text.
    -- It looks like this code is supposed to deal with class= attributes
    -- containing multiple class names:
    --   https://github.com/koreader/crengine/commit/0930ec7230e720c148fd6f231d69558832b4d53a
    -- and that it may stumble on some cases.
    -- It would all be perfectly fine if we gave all these divs a single class name:
    --   html = html:gsub([[<div class="thumb [^"]*">]], [[<div class="thumb">]])
    --
    -- But we may as well give every class= attribute a single name to avoid other
    -- problems (no real risk with that, as we don't define any style for wikipedia
    -- class names, except div.thumb, whose class always appears first).
    html = html:gsub([[(<[^>]* class="[^ "]+)%s+[^"]*"]], [[%1"]])

    -- crengine seems to consider unknown tags as 'block' elements, so we may
    -- want to remove or replace those that should be considered 'inline' elements
    html = html:gsub("</?time[^>]*>", "")

    -- Fix internal wikipedia links with the full server url (including lang) so
    -- ReaderLink can notice them and deal with them with a LookupWikipedia event:
    --   html = html:gsub([[href="/wiki/]], [[href="]]..wiki_base_url..[[/wiki/]])
    --
    -- Also, crengine deals strangely with percent-encoded utf8:
    -- if the link in the html is <a href="http://fr.wikipedia.org/wiki/Fran%C3%A7oix">,
    -- we get from credocument:getLinkFromPosition(): http://fr.wikipedia.org/wiki/FranÃ§oix
    -- These are the bytes "\xc3\x83\xc2\xa7", that is, U+00C3 and U+00A7 encoded
    -- as UTF8, when we should have gotten "\xc3\xa7"...
    -- We can avoid that by putting plain unencoded UTF8 in the url.
    local hex_to_char = function(x) return string.char(tonumber(x, 16)) end
    local fixEncodedWikiPageTitle = function(wiki_page)
        wiki_page = wiki_page:gsub("%%(%x%x)", hex_to_char)
        return string.format([[href="%s/wiki/%s"]], wiki_base_url, wiki_page)
    end
    html = html:gsub([[href="/wiki/([^"]*)"]], fixEncodedWikiPageTitle)
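    -- e.g. href="/wiki/Fran%C3%A7ois" (hypothetical page) becomes
    -- href="<wiki_base_url>/wiki/François", with plain unencoded UTF8 in the url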

    -- Remove the href from links to non-existent wiki pages so they are not clickable:
    -- <a href="/w/index.php?title=PageTitle&action=edit&redlink=1" class="new" title="PageTitle">PageTitle____on</a>
    -- (removing the href="" makes them non-clickable)
    html = html:gsub([[<a[^>]* class="new"[^>]*>]], [[<a class="newwikinonexistent">]])

    -- Fix some other protocol-less links to wikipedia (href="//fr.wikipedia.org/w/index.php...)
    html = html:gsub([[href="//]], [[href="https://]])

    -- crengine does not return a link when there are multiple class names in an <a>
    -- (e.g. <a class="external text" href="">). That would be no problem, as we
    -- can't follow them anyway, but when the user taps on one, the tap is
    -- propagated to other widgets and a page change happens...
    --   html = html:gsub([[<a rel="nofollow" class="external text"]], [[<a rel="nofollow" class="externaltext"]])
    --   html = html:gsub([[<a class="external text"]], [[<a class="externaltext"]])
    -- Solved by our multiple class names suppression above.

    -- Avoid a link being clickable before <a> (if it starts a line) or after </a>
    -- (if it ends a line or a block) by wrapping it with U+200B ZERO WIDTH SPACE,
    -- which makes the DOM tree walking code that looks for a link stop at it:
    --   html = html:gsub("(<[aA])", "\xE2\x80\x8B%1")
    --   html = html:gsub("(</[aA]>)", "%1\xE2\x80\x8B")
    -- Fixed in crengine lvtinydom.

    if self.wiki_prettify then
        -- Prepend some symbols to section titles for a better visual feeling of hierarchy
        html = html:gsub("<h1>", "<h1>" .. h1_sym .. " ")
        html = html:gsub("<h2>", "<h2>" .. h2_sym .. " ")
        html = html:gsub("<h3>", "<h3>" .. h3_sym .. " ")
        html = html:gsub("<h4>", "<h4>" .. h4_sym .. " ")
        html = html:gsub("<h5>", "<h5>" .. h5_sym .. " ")
        html = html:gsub("<h6>", "<h6>" .. h6_sym .. " ")
    end

    -- Note: in all the gsub patterns above, we used lowercase for tags and attributes
    -- because that's how they are in the wikipedia HTML and it keeps the patterns
    -- simple. If this ever changes, they'll have to be replaced with
    -- href => [Hh][Rr][Ee][Ff] ...

    -- We can finally build the final HTML with a header of our own
    local saved_on = T(_("Saved on %1"), os.date("%b %d, %Y %H:%M:%S"))
    local online_version_htmllink = string.format([[<a href="%s/wiki/%s">%s</a>]], wiki_base_url, page:gsub(" ", "_"), _("online version"))
    local see_online_version = T(_("See %1 for up-to-date content"), online_version_htmllink)
    epub:add("OEBPS/content.html", string.format([[
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>%s</title>
  <link type="text/css" rel="stylesheet" href="stylesheet.css"/>
</head>
<body>
<h1 class="koreaderwikifrontpage">%s</h1>
<h5 class="koreaderwikifrontpage">Wikipedia %s</h5>
<p class="koreaderwikifrontpage">%s<br/>%s</p>
<hr class="koreaderwikifrontpage"/>
%s
</body>
</html>
]], page_cleaned, page_htmltitle, lang:upper(), saved_on, see_online_version, html))

    -- ----------------------------------------------------------------
    -- OEBPS/images/*
    if include_images then
        local nb_images = #images
        for inum, img in ipairs(images) do
            -- The process can be interrupted at this point between each image
            -- download by tapping while the InfoMessage is displayed
            local go_on = UI:info(T(_("Fetching image %1 / %2 …"), inum, nb_images))
            if not go_on then
                cancelled = true
                break
            end
            local src = img.src
            if use_img_2x and img.src2x then
                src = img.src2x
            end
            logger.dbg("Getting img", src)
            local success, content = getUrlContent(src)
            -- success, content = getUrlContent(src..".unexistant") -- to simulate failure
            if success then
                logger.dbg("success, size:", #content)
            else
                logger.info("failed fetching:", src)
            end
            if success then
                epub:add("OEBPS/" .. img.imgpath, content)
            else
                go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue"))
                if not go_on then
                    cancelled = true
                    break
                end
            end
        end
    end

    -- Done with adding files
    if cancelled then
        if UI:confirm(_("Download did not complete.\nDo you want to create an EPUB with the already downloaded images?"), _("Don't create"), _("Create")) then
            cancelled = false
        end
    end
    if cancelled then
        UI:info(_("Canceled. Cleaning up…"))
    else
        UI:info(_("Packing EPUB…"))
    end
    epub:close()
    -- This was nearly a no-op, so sleep a bit to make that progress step seen
    util.usleep(300000)
    UI:reset() -- close the last InfoMessage

    if cancelled then
        -- Build was cancelled, remove the half-created .epub
        if lfs.attributes(epub_path_tmp, "mode") == "file" then
            os.remove(epub_path_tmp)
        end
        return false
    end

    -- Finally move the .tmp to the final file
    os.rename(epub_path_tmp, epub_path)
    logger.info("successfully created:", epub_path)
    return true
end

-- Wrap Wikipedia:createEpub() with UI progress info, provided
-- by the Trapper module.
function Wikipedia:createEpubWithUI(epub_path, page, lang, result_callback)
    -- To do any UI interaction while building the EPUB, we need to use
    -- a coroutine, so that our code can be suspended while waiting for
    -- user interaction, and resumed by UI widget callbacks.
    -- All this is hidden and done by Trapper with a simple API.
    local Trapper = require("ui/trapper")
    Trapper:wrap(function()
        Trapper:setPausedText("Download paused")
        -- If errors occur in Wikipedia:createEpub(), the coroutine (used by
        -- Trapper) would just abort (no reader crash, no error logged).
        -- So we use pcall to catch any error, log it, and report the
        -- failure via result_callback.
        local ok, success = pcall(self.createEpub, self, epub_path, page, lang, true)
        if ok and success then
            result_callback(true)
        else
            Trapper:reset() -- close any last widget not cleaned up on error
            logger.warn("Wikipedia.createEpub pcall:", ok, success)
            result_callback(false)
        end
    end)
end
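
-- A minimal usage sketch (hypothetical path and callback; in KOReader this is
-- driven by the Wikipedia lookup UI):
--   Wikipedia:createEpubWithUI("/tmp/E-book.en.epub", "E-book", "en",
--       function(success) logger.info("epub created:", success) end)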
return Wikipedia