diff --git a/frontend/apps/reader/modules/readerwikipedia.lua b/frontend/apps/reader/modules/readerwikipedia.lua index d42733d8c..ad2305281 100644 --- a/frontend/apps/reader/modules/readerwikipedia.lua +++ b/frontend/apps/reader/modules/readerwikipedia.lua @@ -6,6 +6,7 @@ local KeyValuePage = require("ui/widget/keyvaluepage") local LuaData = require("luadata") local NetworkMgr = require("ui/network/manager") local ReaderDictionary = require("apps/reader/modules/readerdictionary") +local Trapper = require("ui/trapper") local Translator = require("ui/translator") local UIManager = require("ui/uimanager") local Wikipedia = require("ui/wikipedia") @@ -21,7 +22,6 @@ local ReaderWikipedia = ReaderDictionary:extend{ -- identify itself is_wiki = true, wiki_languages = {}, - no_page = _("No wiki page found."), disable_history = G_reader_settings:isTrue("wikipedia_disable_history"), } @@ -271,7 +271,29 @@ function ReaderWikipedia:addToMainMenu(menu_items) end, }) end, - } + separator = true, + }, + { -- setting used in wikipedia.lua + text = _("Show image in search results"), + checked_func = function() + return G_reader_settings:nilOrTrue("wikipedia_show_image") + end, + callback = function() + G_reader_settings:flipNilOrTrue("wikipedia_show_image") + end, + }, + { -- setting used in wikipedia.lua + text = _("Show more images in full article"), + enabled_func = function() + return G_reader_settings:nilOrTrue("wikipedia_show_image") + end, + checked_func = function() + return G_reader_settings:nilOrTrue("wikipedia_show_more_images") and G_reader_settings:nilOrTrue("wikipedia_show_image") + end, + callback = function() + G_reader_settings:flipNilOrTrue("wikipedia_show_more_images") + end, + }, } } end @@ -319,6 +341,14 @@ function ReaderWikipedia:initLanguages(word) end function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage, forced_lang) + -- Wrapped through Trapper, as we may be using Trapper:dismissableRunInSubprocess() in it + Trapper:wrap(function() + 
self:lookupWikipedia(word, box, get_fullpage, forced_lang) + end) + return true +end + +function ReaderWikipedia:lookupWikipedia(word, box, get_fullpage, forced_lang) if not NetworkMgr:isOnline() then NetworkMgr:promptWifiOn() return @@ -358,19 +388,35 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage, forced_lang) }) end - -- Fix lookup message to include lang + -- Fix lookup message to include lang and set appropriate error texts + local no_result_text, req_failure_text if get_fullpage then - self.lookup_msg = T(_("Getting Wikipedia %2 page:\n%1"), "%1", lang:upper()) + self.lookup_msg = T(_("Retrieving Wikipedia %2 article:\n%1"), "%1", lang:upper()) + req_failure_text = _("Failed to retrieve Wikipedia article.") + no_result_text = _("Wikipedia article not found.") else self.lookup_msg = T(_("Searching Wikipedia %2 for:\n%1"), "%1", lang:upper()) + req_failure_text = _("Failed searching Wikipedia.") + no_result_text = _("No Wikipedia articles matching search term.") end self:showLookupInfo(display_word) + local results = {} local ok, pages + local lookup_cancelled = false + Wikipedia:setTrapWidget(self.lookup_progress_msg) if get_fullpage then - ok, pages = pcall(Wikipedia.wikifull, Wikipedia, word, lang) + ok, pages = pcall(Wikipedia.getFullPage, Wikipedia, word, lang) else - ok, pages = pcall(Wikipedia.wikintro, Wikipedia, word, lang) + ok, pages = pcall(Wikipedia.searchAndGetIntros, Wikipedia, word, lang) + end + Wikipedia:resetTrapWidget() + if not ok and pages and string.find(pages, Wikipedia.dismissed_error_code) then + -- So we can display an alternate dummy result + lookup_cancelled = true + -- Or we could just not show anything with: + -- self:dismissLookupInfo() + -- return end if ok and pages then -- sort pages according to 'index' attribute if present (not present @@ -387,14 +433,14 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage, forced_lang) pages = sorted_pages end for pageid, page in pairs(pages) do - 
local definition = page.extract or self.no_page + local definition = page.extract or no_result_text if page.length then -- we get 'length' only for intro results -- let's append it to definition so we know -- how big/valuable the full page is local fullkb = math.ceil(page.length/1024) local more_factor = math.ceil( page.length / (1+definition:len()) ) -- +1 just in case len()=0 - definition = definition .. "\n" .. T(_("(full page : %1 kB, = %2 x this intro length)"), fullkb, more_factor) + definition = definition .. "\n" .. T(_("(full article : %1 kB, = %2 x this intro length)"), fullkb, more_factor) end local result = { dict = T(_("Wikipedia %1"), lang:upper()), @@ -402,18 +448,27 @@ function ReaderWikipedia:onLookupWikipedia(word, box, get_fullpage, forced_lang) definition = definition, is_fullpage = get_fullpage, lang = lang, + images = page.images, } table.insert(results, result) end -- logger.dbg of results will be done by ReaderDictionary:showDict() else - logger.dbg("error:", pages) -- dummy results + local definition + if lookup_cancelled then + definition = _("Wikipedia request canceled.") + elseif ok then + definition = no_result_text + else + definition = req_failure_text + logger.dbg("error:", pages) + end results = { { dict = T(_("Wikipedia %1"), lang:upper()), word = word, - definition = self.no_page, + definition = definition, is_fullpage = get_fullpage, lang = lang, } diff --git a/frontend/ui/wikipedia.lua b/frontend/ui/wikipedia.lua index 09e0141bb..4d8253098 100644 --- a/frontend/ui/wikipedia.lua +++ b/frontend/ui/wikipedia.lua @@ -1,8 +1,10 @@ local JSON = require("json") +local Screen = require("device").screen +local ffiutil = require("ffi/util") local logger = require("logger") -local util = require("ffi/util") +local util = require("util") local _ = require("gettext") -local T = require("ffi/util").template +local T = ffiutil.template --[[ -- Query wikipedia using Wikimedia Web API. 
@@ -18,18 +20,9 @@ local T = require("ffi/util").template local Wikipedia = { wiki_server = "https://%s.wikipedia.org", wiki_path = "/w/api.php", - wiki_params = { - action = "query", - prop = "extracts", - format = "json", - -- exintro = nil, -- get more than only the intro - explaintext = "", - redirects = "", - -- title = nil, -- text to lookup, will be added below - }, default_lang = "en", - -- Search query for better results - -- see https://www.mediawiki.org/wiki/API:Main_page + -- See https://www.mediawiki.org/wiki/API:Main_page for details. + -- Search query, returns introductory texts (+ main thumbnail image) wiki_search_params = { action = "query", generator = "search", @@ -37,7 +30,7 @@ local Wikipedia = { -- gsrsearch = nil, -- text to lookup, will be added below gsrlimit = 20, -- max nb of results to get exlimit = "max", - prop = "extracts|info", -- 'extracts' to get text, 'info' to get full page length + prop = "extracts|info|pageimages", -- 'extracts' to get text, 'info' to get full page length format = "json", explaintext = "", exintro = "", @@ -45,6 +38,17 @@ local Wikipedia = { -- (otherwise, we get the full text for only the first result, and -- no text at all for the others }, + -- Full article, parsed to output text (+ main thumbnail image) + wiki_full_params = { + action = "query", + prop = "extracts|pageimages", + format = "json", + -- exintro = nil, -- get more than only the intro + explaintext = "", + redirects = "", + -- title = nil, -- text to lookup, will be added below + }, + -- Full article, parsed to output HTML, for Save as EPUB wiki_phtml_params = { action = "parse", format = "json", @@ -55,32 +59,152 @@ local Wikipedia = { disablelimitreport = "", disableeditsection = "", }, - -- allow for disabling prettifying full page text + -- Full article, parsed to output HTML, for images extraction + -- (used with full article as text, if "show more images" enabled) + wiki_images_params = { -- same as previous one, with just text html + 
action = "parse", + format = "json", + -- we only need the following information + prop = "text", + -- page = nil, -- text to lookup, will be added below + redirects = "", + disabletoc = "", -- remove toc in html + disablelimitreport = "", + disableeditsection = "", + }, + -- There is an alternative for obtaining page's images: + -- prop=imageinfo&action=query&iiprop=url|dimensions|mime|extmetadata&generator=images&pageids=49448&iiurlwidth=100&iiextmetadatafilter=ImageDescription + -- but it gives all images (including wikipedia icons) in any order, without + -- any score or information that would help considering if they matter or not + -- + + -- Allow for disabling prettifying full page text wiki_prettify = G_reader_settings:nilOrTrue("wikipedia_prettify"), + + -- Can be set so HTTP requests will be done under Trapper and + -- be interruptible + trap_widget = nil, + -- For actions done with Trapper:dismissable methods, we may throw + -- an error() with this code. We make the value of this error + -- accessible here so that the caller can know it's a user dismiss. 
+ dismissed_error_code = "Interrupted by user", } function Wikipedia:getWikiServer(lang) return string.format(self.wiki_server, lang or self.default_lang) end +-- Codes that getUrlContent may get from requester.request() +local TIMEOUT_CODE = "timeout" -- from socket.lua +local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime + +-- Sink that stores into a table, aborting if maxtime has elapsed +local function sink_table_with_maxtime(t, maxtime) + -- Start counting as soon as this sink is created + local start_secs, start_usecs = ffiutil.gettime() + local starttime = start_secs + start_usecs/1000000 + t = t or {} + local f = function(chunk, err) + local secs, usecs = ffiutil.gettime() + if secs + usecs/1000000 - starttime > maxtime then + return nil, MAXTIME_CODE + end + if chunk then table.insert(t, chunk) end + return 1 + end + return f, t +end + +-- Get URL content +local function getUrlContent(url, timeout, maxtime) + local socket = require('socket') + local ltn12 = require('ltn12') + local http = require('socket.http') + local https = require('ssl.https') + + local requester + if url:sub(1,7) == "http://" then + requester = http + elseif url:sub(1,8) == "https://" then + requester = https + else + return false, "Unsupported protocol" + end + if not timeout then timeout = 10 end + -- timeout needs to be set to 'http', even if we use 'https' + http.TIMEOUT, https.TIMEOUT = timeout, timeout + + local request = {} + local sink = {} + request['url'] = url + request['method'] = 'GET' + -- 'timeout' delay works on socket, and is triggered when + -- that time has passed trying to connect, or after connection + -- when no data has been read for this time. + -- On a slow connection, it may not be triggered (as we could read + -- 1 byte every 1 second, not triggering any timeout). + -- 'maxtime' can be provided to overcome that, and we start counting + -- as soon as the first content byte is received (but it is checked + -- for only when data is received). 
+ -- Setting 'maxtime' and 'timeout' gives more chance to abort the request when + -- it takes too much time (in the worst case: in timeout+maxtime seconds). + -- But time taken by DNS lookup cannot easily be accounted for, so + -- a request may (when dns lookup takes time) exceed timeout and maxtime... + if maxtime then + request['sink'] = sink_table_with_maxtime(sink, maxtime) + else + request['sink'] = ltn12.sink.table(sink) + end + + local code, headers, status = socket.skip(1, requester.request(request)) + local content = table.concat(sink) -- empty or content accumulated till now + -- logger.dbg("code:", code) + -- logger.dbg("headers:", headers) + -- logger.dbg("status:", status) + -- logger.dbg("#content:", #content) + + if code == TIMEOUT_CODE or code == MAXTIME_CODE then + logger.warn("request interrupted:", code) + return false, code + end + if headers == nil then + logger.warn("No HTTP headers:", code, status) + return false, "Network or remote server unavailable" + end + if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK + logger.warn("HTTP status not okay:", code, status) + return false, "Remote server error or unavailable" + end + if headers and headers["content-length"] then + -- Check we really got the announced content size + local content_length = tonumber(headers["content-length"]) + if #content ~= content_length then + return false, "Incomplete content received" + end + end + return true, content +end + +function Wikipedia:setTrapWidget(trap_widget) + self.trap_widget = trap_widget +end + +function Wikipedia:resetTrapWidget() + self.trap_widget = nil +end + -- Possible values for page_type parameter to loadPage() local WIKIPEDIA_INTRO = 1 local WIKIPEDIA_FULL = 2 local WIKIPEDIA_PHTML = 3 +local WIKIPEDIA_IMAGES = 4 --[[ -- return decoded JSON table from Wikipedia --]] function Wikipedia:loadPage(text, lang, page_type, plain) - local socket = require('socket') local url = require('socket.url') - local http = 
require('socket.http') - local https = require('ssl.https') - local ltn12 = require('ltn12') - - local request, sink = {}, {} local query = "" - local parsed = url.parse(self:getWikiServer(lang)) parsed.path = self.wiki_path if page_type == WIKIPEDIA_INTRO then -- search query @@ -90,8 +214,8 @@ function Wikipedia:loadPage(text, lang, page_type, plain) end parsed.query = query .. "gsrsearch=" .. url.escape(text) elseif page_type == WIKIPEDIA_FULL then -- full page content - self.wiki_params.explaintext = plain and "" or nil - for k,v in pairs(self.wiki_params) do + self.wiki_full_params.explaintext = plain and "" or nil + for k,v in pairs(self.wiki_full_params) do query = string.format("%s%s=%s&", query, k, v) end parsed.query = query .. "titles=" .. url.escape(text) @@ -100,66 +224,90 @@ function Wikipedia:loadPage(text, lang, page_type, plain) query = string.format("%s%s=%s&", query, k, v) end parsed.query = query .. "page=" .. url.escape(text) + elseif page_type == WIKIPEDIA_IMAGES then -- images found in page html + for k,v in pairs(self.wiki_images_params) do + query = string.format("%s%s=%s&", query, k, v) + end + parsed.query = query .. "page=" .. 
url.escape(text) else return end - -- HTTP request - request['url'] = url.build(parsed) - request['method'] = 'GET' - request['sink'] = ltn12.sink.table(sink) - http.TIMEOUT, https.TIMEOUT = 10, 10 - local httpRequest = parsed.scheme == 'http' and http.request or https.request - -- first argument returned by skip is code - local _, headers, status = socket.skip(1, httpRequest(request)) - - -- raise error message when network is unavailable - if headers == nil then - error("Network is unreachable") + local built_url = url.build(parsed) + local completed, success, content + if self.trap_widget then -- if previously set with Wikipedia:setTrapWidget() + local Trapper = require("ui/trapper") + local timeout, maxtime = 30, 60 + -- We use dismissableRunInSubprocess with complex return values: + completed, success, content = Trapper:dismissableRunInSubprocess(function() + return getUrlContent(built_url, timeout, maxtime) + end, self.trap_widget) + if not completed then + error(self.dismissed_error_code) -- "Interrupted by user" + end + else + local timeout, maxtime = 10, 60 + success, content = getUrlContent(built_url, timeout, maxtime) end - - if status ~= "HTTP/1.1 200 OK" then - logger.warn("HTTP status not okay:", status) - return + if not success then + error(content) end - local content = table.concat(sink) if content ~= "" and string.sub(content, 1,1) == "{" then local ok, result = pcall(JSON.decode, content) if ok and result then - logger.dbg("wiki result", result) + logger.dbg("wiki result json:", result) return result else - logger.warn("wiki error:", result) + logger.warn("wiki result json decoding error:", result) + error("Failed decoding JSON") end else - logger.warn("not JSON from wiki response:", content) + logger.warn("wiki response is not json:", content) + error("Response is not JSON") end end -- search wikipedia and get intros for results -function Wikipedia:wikintro(text, lang) +function Wikipedia:searchAndGetIntros(text, lang) local result = 
self:loadPage(text, lang, WIKIPEDIA_INTRO, true) if result then local query = result.query if query then + local show_image = G_reader_settings:nilOrTrue("wikipedia_show_image") + -- Scale wikipedia normalized (we hope) thumbnail by 2 (adjusted + -- to screen size/dpi) for intros (and x8 more for highres image) + local image_size_factor = Screen:scaleBySize(200)/100.0 + if show_image then + for pageid, page in pairs(query.pages) do + self:addImages(page, lang, false, image_size_factor, 8) + end + end return query.pages end end end -- get full content of a wiki page -function Wikipedia:wikifull(text, lang) - local result = self:loadPage(text, lang, WIKIPEDIA_FULL, true) +function Wikipedia:getFullPage(wiki_title, lang) + local result = self:loadPage(wiki_title, lang, WIKIPEDIA_FULL, true) if result then local query = result.query if query then - if self.wiki_prettify then - -- Prettification of the plain text full page + local show_image = G_reader_settings:nilOrTrue("wikipedia_show_image") + local show_more_images = G_reader_settings:nilOrTrue("wikipedia_show_more_images") + -- Scale wikipedia normalized (we hope) thumbnails by 4 (adjusted + -- to screen size/dpi) for full page (and this *4 for highres image) + local image_size_factor = Screen:scaleBySize(400)/100.0 + if self.wiki_prettify or show_image then for pageid, page in pairs(query.pages) do - if page.extract then + if self.wiki_prettify and page.extract then + -- Prettification of the plain text full page page.extract = self:prettifyText(page.extract) end + if show_image then + self:addImages(page, lang, show_more_images, image_size_factor, 4) + end end end return query.pages @@ -168,8 +316,8 @@ function Wikipedia:wikifull(text, lang) end -- get parsed html content and other infos of a wiki page -function Wikipedia:wikiphtml(text, lang) - local result = self:loadPage(text, lang, WIKIPEDIA_PHTML, true) +function Wikipedia:getFullPageHtml(wiki_title, lang) + local result = self:loadPage(wiki_title, lang, 
WIKIPEDIA_PHTML, true) if result and result.parse then return result.parse end @@ -178,6 +326,247 @@ function Wikipedia:wikiphtml(text, lang) end end +-- get images extracted from parsed html +function Wikipedia:getFullPageImages(wiki_title, lang) + local images = {} -- will be returned, each in a format similar to page.thumbnail + local result = self:loadPage(wiki_title, lang, WIKIPEDIA_IMAGES, true) + if result and result.parse and result.parse.text and result.parse.text["*"] then + local html = result.parse.text["*"] -- html content + local url = require('socket.url') + local wiki_base_url = self:getWikiServer(lang) + + local thumbs = {} -- bits of HTML containing an image + -- We first try to catch images in