2
0
mirror of https://github.com/koreader/koreader synced 2024-10-31 21:20:20 +00:00
koreader/frontend/apps/reader/modules/readerdictionary.lua
poire-z 1708fd5e1c Dictionary and wikipedia enhancements (#2393)
Stardict:
- remove duplicate results
- better cleaning of selection
- append results from a 2nd query of a 2nd set of dictionaries
  in data/dict_ext/

Wikipedia:
- use the search API for better results
- allow viewing the full page content of a result in a bigger window
- allow queries for multiple languages
- available languages can be set in settings.reader.lua :
    ["wikipedia_languages"] = {"en", "fr", "it"}
- "Wikipedia lookup" added to Tools menu

For both:
- allow selection of multiple words for a new lookup (so one can
  actually browse wikipedia)
- allow continuous reading with Tap
- display "current result / total number of results"

Details in #2393
2016-12-11 14:32:02 -08:00

240 lines
8.7 KiB
Lua
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

local InputContainer = require("ui/widget/container/inputcontainer")
local DictQuickLookup = require("ui/widget/dictquicklookup")
local InfoMessage = require("ui/widget/infomessage")
local DataStorage = require("datastorage")
local UIManager = require("ui/uimanager")
local Screen = require("device").screen
local Device = require("device")
local JSON = require("json")
local DEBUG = require("dbg")
local util = require("util")
local _ = require("gettext")
local T = require("ffi/util").template
local ReaderDictionary = InputContainer:new{
data_dir = nil,
dict_window_list = {},
lookup_msg = _("Searching dictionary for:\n%1")
}
function ReaderDictionary:init()
self.ui.menu:registerToMainMenu(self)
self.data_dir = os.getenv("STARDICT_DATA_DIR") or
DataStorage:getDataDir() .. "/data/dict"
end
function ReaderDictionary:addToMainMenu(tab_item_table)
table.insert(tab_item_table.plugins, {
text = _("Dictionary lookup"),
tap_input = {
title = _("Enter a word to look up"),
type = "text",
callback = function(input)
self:onLookupWord(input)
end,
},
})
end
function ReaderDictionary:onLookupWord(word, box, highlight)
self.highlight = highlight
self:stardictLookup(word, box)
return true
end
local function tidyMarkup(results)
local cdata_tag = "<!%[CDATA%[(.-)%]%]>"
local format_escape = "&[29Ib%+]{(.-)}"
for _, result in ipairs(results) do
local def = result.definition
-- preserve the <br> tag for line break
def = def:gsub("<[bB][rR] ?/?>", "\n")
-- parse CDATA text in XML
if def:find(cdata_tag) then
def = def:gsub(cdata_tag, "%1")
-- ignore format strings
while def:find(format_escape) do
def = def:gsub(format_escape, "%1")
end
end
-- ignore all markup tags
def = def:gsub("%b<>", "")
-- strip all leading empty lines/spaces
def = def:gsub("^%s+", "")
result.definition = def
end
return results
end
function ReaderDictionary:cleanSelection(text)
-- Will be used by ReaderWikipedia too
if not text then
return ""
end
-- We do multiple times the same replacements, which is most of the time overkill,
-- but sometimes provices better cleaning
-- Some extremes cases to explain the multiples gsub :
--
-- Sample epub html: mais, qu« absolument, on ne peut décidément pas » revenir en arrière
-- Holding on "qu" or "absolument" will make crengine returns: qu« absolument,
-- We want to only get: absolument
--
-- Sample epub html: car « létat, actuel, de notre connaissance » sy oppose
-- Holding on "état" will make crengine returns: « létat,
-- We want to get: état
--
-- Some of these gsub could be removed when crengine does a better job
-- at finding word boundaries
--
-- Strip some quotations marks
text = string.gsub(text, "\xC2\xAB", '') -- U+00AB << (left double angle quotation mark)
text = string.gsub(text, "\xC2\xBB", '') -- U+00BB >> (right double angle quotation mark)
text = string.gsub(text, "\xE2\x80\x9D", '') -- U+201D '' (right double quotation mark)
text = string.gsub(text, "\xE2\x80\x9C", '') -- U+201C `` (left double quotation mark)
text = string.gsub(text, "\xE2\x80\x94", '') -- U+2014 - (em dash)
text = string.gsub(text, "\xE2\x80\x95", '') -- U+2015 - (horizontal bar)
text = string.gsub(text, "\xC2\xA0", '') -- U+00A0 no-break space
-- Replace some meaningful quotes with ascii quote
text = string.gsub(text, "\xE2\x80\x99", "'") -- U+2019 (right single quotation mark)
-- Strip punctuation characters around selection
-- (this had to be done after the utf8 gsubs above, or it would strip part of these utf8 chars)
text = util.stripePunctuations(text)
-- Strip leading and trailing spaces
text = string.gsub(text, "^%s+", '')
text = string.gsub(text, "%s+$", '')
-- Strip some french grammatical constructs
text = string.gsub(text, "^[LSDMNTlsdmnt]'", '') -- french l' s' t'
text = string.gsub(text, "^[Qq][Uu]'", '') -- french qu'
-- Strip again leading and trailing spaces
text = string.gsub(text, "^%s+", '')
text = string.gsub(text, "%s+$", '')
return text
end
function ReaderDictionary:onLookupStarted(word)
local text = T(self.lookup_msg, word)
self.lookup_progress_msg = InfoMessage:new{text=text}
UIManager:show(self.lookup_progress_msg)
UIManager:forceRePaint()
end
function ReaderDictionary:onLookupDone()
if self.lookup_progress_msg then
UIManager:close(self.lookup_progress_msg)
UIManager:forceRePaint()
end
self.lookup_progress_msg = nil
end
function ReaderDictionary:stardictLookup(word, box)
DEBUG("lookup word:", word, box)
-- escape quotes and other funny characters in word
word = self:cleanSelection(word)
DEBUG("stripped word:", word)
if word == "" then
return
end
self:onLookupStarted(word)
local final_results = {}
local seen_results = {}
-- Allow for two sdcv calls : one in the classic data/dict, and
-- another one in data/dict_ext if it exists
-- We could put in data/dict_ext dictionaries with a great number of words
-- but poor definitions as a fall back. If these were in data/dict,
-- they would prevent fuzzy searches in other dictories with better
-- definitions, and masks such results. This way, we can get both.
local dict_dirs = {self.data_dir}
local dict_ext = self.data_dir.."_ext"
if lfs.attributes(dict_ext, "mode") == "directory" then
table.insert(dict_dirs, dict_ext)
end
for _, dict_dir in ipairs(dict_dirs) do
local results_str = nil
if Device:isAndroid() then
local A = require("android")
results_str = A.stdout("./sdcv", "--utf8-input", "--utf8-output",
"-nj", word, "--data-dir", dict_dir)
else
local std_out = io.popen("./sdcv --utf8-input --utf8-output -nj "
.. ("%q"):format(word) .. " --data-dir " .. dict_dir, "r")
if std_out then
results_str = std_out:read("*all")
std_out:close()
end
end
--DEBUG("result str:", word, results_str)
local ok, results = pcall(JSON.decode, results_str)
if ok and results then
-- we may get duplicates (sdcv may do multiple queries,
-- in fixed mode then in fuzzy mode), we have to remove them
local h
for _,r in ipairs(results) do
h = r.dict .. r.word .. r.definition
if seen_results[h] == nil then
table.insert(final_results, r)
seen_results[h] = true
end
end
else
DEBUG("JSON data cannot be decoded", results)
end
end
if #final_results == 0 then
-- dummy results
final_results = {
{
dict = "",
word = word,
definition = _("No definition found."),
}
}
end
self:onLookupDone()
--DEBUG("lookup result table:", word, final_results)
self:showDict(word, tidyMarkup(final_results), box)
end
function ReaderDictionary:showDict(word, results, box)
if results and results[1] then
DEBUG("showing quick lookup window", word, results)
self.dict_window = DictQuickLookup:new{
window_list = self.dict_window_list,
ui = self.ui,
highlight = self.highlight,
dialog = self.dialog,
-- original lookup word
word = word,
results = results,
dictionary = self.default_dictionary,
width = Screen:getWidth() - Screen:scaleBySize(80),
word_box = box,
-- differentiate between dict and wiki
is_wiki = self.is_wiki,
wiki_languages = self.wiki_languages,
}
table.insert(self.dict_window_list, self.dict_window)
UIManager:show(self.dict_window)
end
end
function ReaderDictionary:onUpdateDefaultDict(dict)
DEBUG("make default dictionary:", dict)
self.default_dictionary = dict
UIManager:show(InfoMessage:new{
text = T(_("%1 is now the default dictionary for this document."), dict),
timeout = 2,
})
return true
end
function ReaderDictionary:onReadSettings(config)
self.default_dictionary = config:readSetting("default_dictionary")
end
function ReaderDictionary:onSaveSettings()
DEBUG("save default dictionary", self.default_dictionary)
self.ui.doc_settings:saveSetting("default_dictionary", self.default_dictionary)
end
return ReaderDictionary