local JSON = require("json")
local RenderImage = require("ui/renderimage")
local Screen = require("device").screen
local ffiutil = require("ffi/util")
local logger = require("logger")
local util = require("util")
local _ = require("gettext")
local T = ffiutil.template
--[[
-- Query wikipedia using Wikimedia Web API.
-- https://en.wikipedia.org/w/api.php?format=jsonfm&action=query&generator=search&gsrnamespace=0&gsrsearch=ereader&gsrlimit=10&prop=extracts&exintro&explaintext&exlimit=max
-- https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=jsonfm&explaintext=&redirects=&titles=E-reader
--
-- To get parsed HTML :
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book
-- https://en.wikipedia.org/w/api.php?action=parse&page=E-book&prop=text|sections|displaytitle|revid&disablelimitreport=&disableeditsection
-- https://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
--]]
-- Module table: Wikimedia API endpoint settings, the sets of query
-- parameters used by loadPage() for each kind of request, and a few
-- runtime options.
local Wikipedia = {
-- URL template of the server: %s is replaced by the language code
-- (see getWikiServer())
wiki_server = "https://%s.wikipedia.org",
-- Path of the API endpoint on that server
wiki_path = "/w/api.php",
-- Language used by getWikiServer() when none is provided
default_lang = "en",
-- See https://www.mediawiki.org/wiki/API:Main_page for details.
-- Search query, returns introductory texts (+ main thumbnail image)
wiki_search_params = {
action = "query",
generator = "search",
gsrnamespace = "0",
-- gsrsearch = nil, -- text to lookup, will be appended by loadPage()
gsrlimit = 20, -- max number of results to get
exlimit = "max",
prop = "extracts|info|pageimages", -- 'extracts' to get text, 'info' to get full page length
format = "json",
-- NOTE: loadPage() overwrites 'explaintext' on each call, according
-- to its 'plain' argument (set to "" or removed)
explaintext = "",
exintro = "",
-- We have to use 'exintro=' to get extracts for ALL results
-- (otherwise, we get the full text for only the first result, and
-- no text at all for the others)
},
-- Full article, parsed to output text (+ main thumbnail image)
wiki_full_params = {
action = "query",
prop = "extracts|pageimages",
format = "json",
-- exintro = nil, -- get more than only the intro
-- NOTE: loadPage() overwrites 'explaintext' on each call, according
-- to its 'plain' argument (set to "" or removed)
explaintext = "",
redirects = "",
-- titles = nil, -- text to lookup, will be appended by loadPage()
},
-- Full article, parsed to output HTML, for Save as EPUB
wiki_phtml_params = {
action = "parse",
format = "json",
-- we only need the following information
prop = "text|sections|displaytitle|revid",
-- page = nil, -- text to lookup, will be appended by loadPage()
-- disabletoc = "", -- if we want to remove toc IN html
disablelimitreport = "",
disableeditsection = "",
},
-- Full article, parsed to output HTML, for images extraction
-- (used with full article as text, if "show more images" enabled)
wiki_images_params = { -- same as previous one, with just text html
action = "parse",
format = "json",
-- we only need the following information
prop = "text",
-- page = nil, -- text to lookup, will be appended by loadPage()
redirects = "",
disabletoc = "", -- remove toc in html
disablelimitreport = "",
disableeditsection = "",
},
-- There is an alternative for obtaining page's images:
-- prop=imageinfo&action=query&iiprop=url|dimensions|mime|extmetadata&generator=images&pageids=49448&iiurlwidth=100&iiextmetadatafilter=ImageDescription
-- but it gives all images (including wikipedia icons) in any order, without
-- any score or information that would help considering if they matter or not
--
-- Allow for disabling prettifying full page text
-- (read once from the "wikipedia_prettify" setting when this table is built)
wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"),
-- Can be set (with setTrapWidget()) so HTTP requests will be done under
-- Trapper and be interruptible
trap_widget = nil,
-- For actions done with Trapper:dismissable methods, we may throw
-- an error() with this code. We make the value of this error
-- accessible here so that the caller can know it's a user dismiss.
dismissed_error_code = "Interrupted by user",
}
-- Build the base URL of the Wikipedia server for the given language,
-- by filling the language code into the self.wiki_server template.
-- Falls back to self.default_lang when no language is provided.
function Wikipedia:getWikiServer(lang)
    local wiki_lang = lang
    if not wiki_lang then
        wiki_lang = self.default_lang
    end
    return string.format(self.wiki_server, wiki_lang)
end
-- Say who we are to Wikipedia (see https://meta.wikimedia.org/wiki/User-Agent_policy)
local USER_AGENT
do
    -- The version is the first line of the "git-rev" file when there is
    -- one, and "devel" otherwise (file missing, unreadable or empty).
    local version
    if lfs.attributes("git-rev", "mode") == "file" then
        local rev_file = io.open("git-rev", "r")
        if rev_file then -- io.open() returns nil when the file can't be opened
            version = rev_file:read()
            -- Close the handle explicitly instead of leaving it open
            -- until it gets garbage collected
            rev_file:close()
        end
    end
    -- gsub() returns (string, count): the enclosing parentheses keep
    -- only the string, so the count is not passed on to T()
    local http_agent = (require('socket.http').USERAGENT:gsub(" ", "/"))
    USER_AGENT = T("KOReader/%1 (https://koreader.rocks/) %2", version or "devel", http_agent)
end
-- Codes that getUrlContent may get from requester.request()
-- when a request has been interrupted (getUrlContent then returns
-- false and that code)
local TIMEOUT_CODE = "timeout" -- from socket.lua
local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime
-- Sink that appends each received chunk to a table, and aborts (by
-- returning nil and MAXTIME_CODE) once more than 'maxtime' seconds have
-- elapsed since the sink was created.
-- 't' is the table to fill (a new one is created when not provided).
-- Returns the sink function and the table being filled.
local function sink_table_with_maxtime(t, maxtime)
    -- Current time, as a number of seconds with a fractional part
    local function now()
        local secs, usecs = ffiutil.gettime()
        return secs + usecs/1000000
    end
    -- The clock starts right now, at sink creation
    local created_at = now()
    local chunks = t or {}
    local function sink(chunk, err)
        -- Elapsed time is only checked when the sink gets called
        if now() - created_at > maxtime then
            return nil, MAXTIME_CODE
        end
        if chunk then
            table.insert(chunks, chunk)
        end
        return 1
    end
    return sink, chunks
end
-- Get URL content with a GET request.
-- 'url' must start with http:// or https://.
-- 'timeout' (seconds, defaults to 10) is the socket timeout.
-- 'maxtime' (seconds, optional) is a cap on the overall duration.
-- Returns true and the content on success, or false and an error
-- message (or TIMEOUT_CODE / MAXTIME_CODE when interrupted).
local function getUrlContent(url, timeout, maxtime)
    local socket = require('socket')
    local ltn12 = require('ltn12')
    local http = require('socket.http')
    local https = require('ssl.https')

    -- Pick the module matching the URL scheme
    local requester
    if url:find("http://", 1, true) == 1 then
        requester = http
    elseif url:find("https://", 1, true) == 1 then
        requester = https
    else
        return false, "Unsupported protocol"
    end

    timeout = timeout or 10
    -- timeout needs to be set to 'http', even if we use 'https'
    http.TIMEOUT, https.TIMEOUT = timeout, timeout

    -- 'timeout' works on the socket: it is triggered when that time has
    -- passed trying to connect, or, once connected, when no data has been
    -- read for that time. On a slow connection, it may never be triggered
    -- (reading 1 byte every second does not trigger any timeout).
    -- 'maxtime' can be provided to overcome that: the sink starts counting
    -- when it is created (just below), but elapsed time is checked only
    -- when some data is received.
    -- Setting both gives more chance to abort a request that takes too
    -- long (in the worst case: in timeout+maxtime seconds).
    -- Time taken by DNS lookup cannot easily be accounted for, so a
    -- request may still (when DNS lookup is slow) exceed both delays.
    local chunks = {}
    local body_sink
    if maxtime then
        body_sink = sink_table_with_maxtime(chunks, maxtime)
    else
        body_sink = ltn12.sink.table(chunks)
    end

    local request = {
        url = url,
        method = 'GET',
        headers = {
            ["User-Agent"] = USER_AGENT,
        },
        sink = body_sink,
    }

    local code, headers, status = socket.skip(1, requester.request(request))
    local content = table.concat(chunks) -- empty, or what was received so far
    -- logger.dbg("code:", code)
    -- logger.dbg("headers:", headers)
    -- logger.dbg("status:", status)
    -- logger.dbg("#content:", #content)

    if code == TIMEOUT_CODE or code == MAXTIME_CODE then
        logger.warn("request interrupted:", code)
        return false, code
    end
    if headers == nil then
        logger.warn("No HTTP headers:", code, status)
        return false, "Network or remote server unavailable"
    end
    -- All 200..299 HTTP codes are OK (string.sub() also accepts a number)
    local status_ok = code and string.sub(code, 1, 1) == "2"
    if not status_ok then
        logger.warn("HTTP status not okay:", code, status)
        return false, "Remote server error or unavailable"
    end
    -- When a size was announced, check we really received that much
    local announced_length = headers and headers["content-length"]
    if announced_length and #content ~= tonumber(announced_length) then
        return false, "Incomplete content received"
    end
    return true, content
end
-- Set the widget used by loadPage() to run its HTTP request through
-- Trapper:dismissableRunInSubprocess(), so that it can be interrupted.
-- Stays in effect until resetTrapWidget() is called.
function Wikipedia:setTrapWidget(trap_widget)
self.trap_widget = trap_widget
end
-- Forget any widget previously set with setTrapWidget(): loadPage()
-- will then do its HTTP requests directly (blocking).
function Wikipedia:resetTrapWidget()
    self:setTrapWidget(nil)
end
-- Possible values for page_type parameter to loadPage()
-- (each one selects one of the Wikipedia.wiki_*_params tables)
local WIKIPEDIA_INTRO = 1 -- search query (wiki_search_params)
local WIKIPEDIA_FULL = 2 -- full page content (wiki_full_params)
local WIKIPEDIA_PHTML = 3 -- parsed html page content (wiki_phtml_params)
local WIKIPEDIA_IMAGES = 4 -- images found in page html (wiki_images_params)
--[[
-- Query Wikipedia and return the decoded JSON table.
--
-- text: what to look up (search terms, or page title)
-- lang: language code given to getWikiServer()
-- page_type: one of the WIKIPEDIA_* constants; nothing is returned
--     when it is none of them
-- plain: for WIKIPEDIA_INTRO and WIKIPEDIA_FULL only, whether the
--     'explaintext' parameter is sent (note: this updates the shared
--     parameters table)
--
-- Throws an error() when the request has been dismissed
-- (self.dismissed_error_code), when it failed (message provided by
-- getUrlContent), or when the response is not JSON or can't be decoded.
--]]
function Wikipedia:loadPage(text, lang, page_type, plain)
    local url = require('socket.url')
    local parsed = url.parse(self:getWikiServer(lang))
    parsed.path = self.wiki_path

    -- Select the set of parameters, and the name of the parameter
    -- that will carry the text to look up
    local params, lookup_param
    if page_type == WIKIPEDIA_INTRO then -- search query
        params, lookup_param = self.wiki_search_params, "gsrsearch"
    elseif page_type == WIKIPEDIA_FULL then -- full page content
        params, lookup_param = self.wiki_full_params, "titles"
    elseif page_type == WIKIPEDIA_PHTML then -- parsed html page content
        params, lookup_param = self.wiki_phtml_params, "page"
    elseif page_type == WIKIPEDIA_IMAGES then -- images found in page html
        params, lookup_param = self.wiki_images_params, "page"
    else
        return
    end
    if page_type == WIKIPEDIA_INTRO or page_type == WIKIPEDIA_FULL then
        params.explaintext = plain and "" or nil
    end

    -- Build the query string: all parameters, then the text to look up
    local pieces = {}
    for k, v in pairs(params) do
        pieces[#pieces + 1] = string.format("%s=%s&", k, v)
    end
    parsed.query = table.concat(pieces) .. lookup_param .. "=" .. url.escape(text)
    local built_url = url.build(parsed)

    local success, content
    if self.trap_widget then -- if previously set with Wikipedia:setTrapWidget()
        local Trapper = require("ui/trapper")
        local timeout, maxtime = 30, 60
        -- We use dismissableRunInSubprocess with complex return values:
        local completed
        completed, success, content = Trapper:dismissableRunInSubprocess(function()
            return getUrlContent(built_url, timeout, maxtime)
        end, self.trap_widget)
        if not completed then
            error(self.dismissed_error_code) -- "Interrupted by user"
        end
    else
        -- Smaller timeout than when we have a trap_widget because we are
        -- blocking without one (but 20s may be needed to fetch the main HTML
        -- page of big articles when making an EPUB).
        success, content = getUrlContent(built_url, 20, 60)
    end
    if not success then
        error(content)
    end

    -- We expect a JSON object
    if content == "" or string.sub(content, 1, 1) ~= "{" then
        logger.warn("wiki response is not json:", content)
        error("Response is not JSON")
    end
    local ok, result = pcall(JSON.decode, content)
    if not (ok and result) then
        logger.warn("wiki result json decoding error:", result)
        error("Failed decoding JSON")
    end
    logger.dbg("wiki result json:", result)
    return result
end
-- Search wikipedia for 'text', and return the pages found (the
-- 'query.pages' table of the response) with their intro text.
-- Unless disabled by the "wikipedia_show_image" setting, addImages()
-- is called on each page.
-- Returns nothing when the response has no 'query'.
function Wikipedia:searchAndGetIntros(text, lang)
    local result = self:loadPage(text, lang, WIKIPEDIA_INTRO, true)
    local query = result and result.query
    if not query then
        return
    end
    local show_image = G_reader_settings:nilOrTrue("wikipedia_show_image")
    -- Scale wikipedia normalized (we hope) thumbnail by 2 (adjusted
    -- to screen size/dpi) for intros (and x8 more for highres image)
    local image_size_factor = Screen:scaleBySize(200)/100.0
    if show_image then
        for page_id, page in pairs(query.pages) do
            self:addImages(page, lang, false, image_size_factor, 8)
        end
    end
    return query.pages
end
-- Get the full content of a wiki page, as the 'query.pages' table
-- of the response.
-- When self.wiki_prettify is set, each page's text goes through
-- prettifyText(). Unless disabled by the "wikipedia_show_image" setting,
-- addImages() is called on each page (the "wikipedia_show_more_images"
-- setting is forwarded to it).
-- Returns nothing when the response has no 'query'.
function Wikipedia:getFullPage(wiki_title, lang)
    local result = self:loadPage(wiki_title, lang, WIKIPEDIA_FULL, true)
    local query = result and result.query
    if not query then
        return
    end
    local show_image = G_reader_settings:nilOrTrue("wikipedia_show_image")
    local show_more_images = G_reader_settings:nilOrTrue("wikipedia_show_more_images")
    -- Scale wikipedia normalized (we hope) thumbnails by 4 (adjusted
    -- to screen size/dpi) for full page (and this *4 for highres image)
    local image_size_factor = Screen:scaleBySize(400)/100.0
    if not (self.wiki_prettify or show_image) then
        -- Nothing to do on pages
        return query.pages
    end
    for page_id, page in pairs(query.pages) do
        if self.wiki_prettify and page.extract then
            -- Prettification of the plain text full page
            page.extract = self:prettifyText(page.extract)
        end
        if show_image then
            self:addImages(page, lang, show_more_images, image_size_factor, 4)
        end
    end
    return query.pages
end
-- Get parsed html content and other infos of a wiki page: returns the
-- 'parse' table of the response.
-- Throws an error() with the message provided by the API when the
-- response carries an 'error.info' instead.
-- Returns nothing when the response has neither of them.
function Wikipedia:getFullPageHtml(wiki_title, lang)
    local result = self:loadPage(wiki_title, lang, WIKIPEDIA_PHTML, true)
    if not result then
        -- Nothing to look into (and don't try to index a nil value below)
        return
    end
    if result.parse then
        return result.parse
    end
    if result.error and result.error.info then
        error(result.error.info)
    end
end
-- get images extracted from parsed html
function Wikipedia:getFullPageImages(wiki_title, lang)
local images = {} -- will be returned, each in a format similar to page.thumbnail
local result = self:loadPage(wiki_title, lang, WIKIPEDIA_IMAGES, true)
if result and result.parse and result.parse.text and result.parse.text["*"] then
local html = result.parse.text["*"] -- html content
local url = require('socket.url')
local wiki_base_url = self:getWikiServer(lang)
local thumbs = {} -- bits of HTML containing an image
-- We first try to catch images in
, which should exclude
-- wikipedia icons, flags... These seem to all end with a double
.
for thtml in html:gmatch([[
%s*
]]) do
table.insert(thumbs, thtml)
end
-- We then also try to catch images in galleries (which often are less
-- interesting than those in thumbinner) as a 2nd set.
for thtml in html:gmatch([[
(crengine displays them one above the other) and can be displayed
-- side by side
-- For some
, which include nested divs, although
-- perfectly balanced, crengine seems to miss some closing
and we
-- end up having our image bordered box including the remaining main wiki text.
-- It looks like this code is supposed to deal with class= containing multiple
-- class names :
-- https://github.com/koreader/crengine/commit/0930ec7230e720c148fd6f231d69558832b4d53a
-- and that it may stumble on some cases.
-- It's all perfectly fine if we make all these div with a single class name
-- html = html:gsub([[
]], [[
]])
--
-- But we may as well make all class= have a single name to avoid other problems
-- (no real risk with that, as we don't define any style for wikipedia class names,
-- except div.thumb that always appears first).
html = html:gsub([[(<[^>]* class="[^ "]+)%s+[^"]*"]], [[%1"]])
-- crengine seems to consider unknown tag as 'block' elements, so we may
-- want to remove or replace those that should be considered 'inline' elements
html = html:gsub("?time[^>]*>", "")
-- crengine does not support the