-- koreader/plugins/japanese.koplugin/main.lua

--- Japanese language support for KOReader, modelled after Yomichan.
-- This plugin extends KOReader's built-in dictionary and selection system to
-- support Yomichan-style deinflection and text scanning, allowing for one-tap
-- searches of inflected verbs and multi-character words and phrases. As such,
-- this plugin removes the need for synonym-based deinflection rules for
-- StarDict-converted Japanese dictionaries.
--
-- @module koplugin.japanese
-- @alias Japanese

-- Copyright (C) 2021 Aleksa Sarai <cyphar@cyphar.com>
-- Licensed under the GPLv3 or later.
--
-- The deinflection logic is heavily modelled after Yomichan
-- <https://github.com/FooSoft/yomichan>, up to and including the deinflection
-- table. The way we try to find candidate words is also fairly similar (the
-- naive approach), though because dictionary lookups are quite expensive (we
-- have to call sdcv each time) we batch as many candidates as possible
-- together in order to reduce the impact we have on text selection.

local Deinflector = require("deinflector")
local LanguageSupport = require("languagesupport")
local ReaderDictionary = require("apps/reader/modules/readerdictionary")
local UIManager = require("ui/uimanager")
local WidgetContainer = require("ui/widget/container/widgetcontainer")
local logger = require("logger")
local util = require("util")
local _ = require("gettext")
local N_ = _.ngettext
local T = require("ffi/util").template

-- Deinflector instance created once, when this module is loaded.
-- Japanese:init() assigns this same object to every plugin instance rather
-- than constructing a new deinflector per instance.
local SingleInstanceDeinflector = Deinflector:new{}

-- The plugin itself, created by extending WidgetContainer. Methods are
-- attached to this table below and it is returned as the module value.
local Japanese = WidgetContainer:extend{
    name = "japanese",
    pretty_name = "Japanese",
}

-- Yomichan uses 10 characters as the default look-ahead, but crengine's
-- getNextVisibleChar counts furigana if any are present, so use a higher
-- threshold to be able to look-ahead an equivalent number of characters.
-- This is the value used when the "language_japanese_text_scan_length"
-- setting has not been saved, and the default offered by the settings dialog.
local DEFAULT_TEXT_SCAN_LENGTH = 20

--- Initialises a plugin instance and registers it with LanguageSupport.
-- Sets self.deinflector, self.dictionary and self.max_scan_length.
function Japanese:init()
    -- Every instance shares the module-level deinflector.
    self.deinflector = SingleInstanceDeinflector

    -- Prefer the dictionary module attached to the UI (when there is one);
    -- only construct a ReaderDictionary of our own if none is available.
    local dictionary = self.ui and self.ui.dictionary
    if not dictionary then
        dictionary = ReaderDictionary:new()
    end
    self.dictionary = dictionary

    -- Saved look-ahead length, falling back to the built-in default.
    local scan_length = G_reader_settings:readSetting("language_japanese_text_scan_length")
    if not scan_length then
        scan_length = DEFAULT_TEXT_SCAN_LENGTH
    end
    self.max_scan_length = scan_length

    LanguageSupport:registerPlugin(self)
end

--- Reports whether this plugin handles the given language.
-- @param language_code language code to test
-- @treturn bool true only if language_code is "ja" or "jpn"
function Japanese:supportsLanguage(language_code)
    if language_code == "ja" then
        return true
    end
    return language_code == "jpn"
end

--- Called from @{languagesupport.extraDictionaryFormCandidates} for Japanese
-- text. Deinflects the looked-up text and returns the term of every
-- deinflection result.
-- @param args arguments from language support (only args.text is read here)
-- @treturn {string,...} extra dictionary form candidates found (or nil if the
--   text contains no CJK characters)
-- @see languagesupport.extraDictionaryFormCandidates
-- @see languagesupport.registerPlugin
function Japanese:onWordLookup(args)
    local lookup_text = args.text

    -- Text without a single CJK character cannot be deinflected by us, so
    -- bail out without producing any candidates.
    if not util.hasCJKChar(lookup_text) then
        return
    end

    --- @todo Try to repeatedly reduce the text and deinflect the shortened text
    --       to provide more candidates. This is particularly needed because
    --       JMDict has a habit of creating entries for compounds or phrases
    --       that do not exist in monolingual dictionaries (even in 大辞林 or
    --       広辞苑) and our onWordSelection expansion accepts any dictionary's
    --       largest entry. Unfortunately doing this nicely requires a bit of
    --       extra work to be efficient (since we need to remove the last
    --       character in the string).

    local deinflections = self.deinflector:deinflect(lookup_text)
    logger.dbg("japanese.koplugin: deinflection of", lookup_text, "results:", deinflections)

    --- @todo Pass up the reasons list (formatted Yomichan style) to the
    --       dictionary pop-up so you can get some more information about the
    --       inflection. But this would require adding some kind of tag
    --       metadata that we have to pass through from the lookup to the
    --       dictionary pop-up.

    -- Only the dictionary-form term of each result is handed back.
    local terms = {}
    for idx, deinflection in ipairs(deinflections) do
        terms[idx] = deinflection.term
    end
    return terms
end

-- Characters that can never be part of a word: Japanese brackets and quotes,
-- full-width and half-width punctuation with their ASCII counterparts, and
-- whitespace (ideographic space, ASCII space and newline). The string is used
-- as a lookup set by isPossibleJapaneseWord().
-- @todo Switch this to utf8proc_category or something similar.
local JAPANESE_PUNCTUATION = "「」『』【】〘〙〖〗・･、､,。｡.！!？?　 \n"

--- Checks whether a string could be (part of) a single Japanese word.
-- Every UTF-8 character of the string must be a CJK character (according to
-- util.isCJKChar) and must not be listed in JAPANESE_PUNCTUATION.
-- @param str UTF-8 text to check
-- @treturn bool false if str is not a string or contains a non-CJK or
--   punctuation character; true otherwise (including for the empty string)
local function isPossibleJapaneseWord(str)
    -- A missing text can never be a word; bail out instead of raising an
    -- error on str:gmatch().
    if type(str) ~= "string" then
        return false
    end
    for c in str:gmatch(util.UTF8_CHAR_PATTERN) do
        -- The character comes straight from the document, so search for it as
        -- plain text (4th argument) rather than letting string.find interpret
        -- it as a Lua pattern.
        if not util.isCJKChar(c) or JAPANESE_PUNCTUATION:find(c, 1, true) ~= nil then
            return false
        end
    end
    return true
end

--- Called from @{languagesupport.improveWordSelection} for Japanese text.
-- Tries to expand the word selection defined by args: starting from
-- args.pos0 the end of the selection is advanced one character at a time,
-- each intermediate text is deinflected, and all resulting terms are looked
-- up in a single dictionary query. The range of the last (i.e. longest) text
-- that produced a dictionary hit is returned.
-- @param args arguments from language support (args.text, args.pos0 and
--   args.callbacks are used)
-- @treturn {pos0,pos1} the new selection range (or nil if the selection
--   should be left as it is)
-- @see languagesupport.improveWordSelection
-- @see languagesupport.registerPlugin
function Japanese:onWordSelection(args)
    local callbacks = args.callbacks
    local current_text = args.text

    -- If the initial selection contains only non-CJK characters, then there's
    -- no point trying to expand it because no Japanese words mix CJK and
    -- non-CJK characters (there are non-CJK words in Japanese -- CM, NG, TKG
    -- and their full-width equivalents for instance -- but they are selected
    -- by crengine correctly already and are full words by themselves).
    if current_text ~= "" and not util.hasCJKChar(current_text) then
        return
    end

    -- We reset the end of the range to pos0+1 because crengine will select
    -- half-width katakana (ｶﾀｶﾅ) in strange ways that often overshoots the
    -- end of words.
    local pos0, pos1 = args.pos0, callbacks.get_next_char_pos(args.pos0)
    if pos1 == nil then
        -- There is no character after pos0, so there is nothing to expand
        -- into; keep the selection we were given.
        return
    end

    -- We try to advance the end position until we hit a word.
    --
    -- Unfortunately it's possible for the complete word to be longer than the
    -- first match (obvious examples include 読み込む or similar compound verbs
    -- where it would be less than ideal to match 読み as the full word, but
    -- there are more subtle kana-only cases as well) so we need to keep
    -- looking forward, but unfortunately there isn't a great endpoint defined
    -- either (aside from punctuation). So we just copy Yomichan and set a hard
    -- limit (self.max_scan_length expansions, user-configurable) and stop
    -- early if we ever hit punctuation. We then select the longest word
    -- present in one of the user's installed dictionaries (after
    -- deinflection).

    -- all_candidates[i] describes the selection range that produced
    -- all_words[i]; the two lists must stay index-aligned because the
    -- dictionary results are matched back to candidates by index.
    local all_candidates = {}
    local all_words = {}

    local current_end = pos1
    local num_expansions = 0
    repeat
        -- Move to the next character.
        current_end = callbacks.get_next_char_pos(current_end)
        if current_end == nil then
            -- Ran out of characters: stop before a nil position is used for
            -- a text range or stored in a candidate.
            break
        end
        current_text = callbacks.get_text_in_range(pos0, current_end)
        num_expansions = num_expansions + 1
        if current_text == nil then
            -- No text could be obtained for this range, so it cannot be
            -- expanded any further.
            break
        end

        -- If the text could not be a complete Japanese word (i.e. it contains
        -- a punctuation or some other special character), quit early. We test
        -- the whole string rather than the last character because finding the
        -- last character requires a linear walk through the string anyway, and
        -- get_next_char_pos() skips over newlines.
        if not isPossibleJapaneseWord(current_text) then
            logger.dbg("japanese.koplugin: stopping expansion at", current_text, "because in contains non-word characters")
            break
        end

        -- Deinflect the selection and queue every resulting term for lookup.
        -- (`__` rather than `_` so the gettext function is not shadowed.)
        for __, candidate in ipairs(self.deinflector:deinflect(current_text)) do
            local term = candidate.term
            if term ~= nil then
                all_candidates[#all_candidates + 1] = {
                    pos0 = pos0,
                    pos1 = current_end,
                    text = term,
                }
                all_words[#all_words + 1] = term
            end
        end
    until num_expansions >= self.max_scan_length
    logger.dbg("japanese.koplugin: attempted", num_expansions, "expansions up to", current_text)

    -- Without any candidate there is nothing to look up, so skip the
    -- (expensive) dictionary query altogether.
    if #all_words == 0 then
        return
    end

    -- Calling sdcv is fairly expensive, so reduce the cost by trying every
    -- candidate in one shot and then picking the longest one which gave us a
    -- result.
    --- @todo Given there is a limit to how many command-line arguments you can
    --       pass, we should split up the candidate list if it's too long.
    local best_word
    local cancelled, all_results = self.dictionary:rawSdcv(all_words)
    if not cancelled and all_results ~= nil then
        -- Candidates were queued in order of increasing selection length, so
        -- the last one with a non-empty result is the longest match.
        for i, term_results in ipairs(all_results) do
            if #term_results ~= 0 then
                best_word = all_candidates[i]
            end
        end
    end
    if best_word ~= nil then
        return {best_word.pos0, best_word.pos1}
    end
end

--- Builds the menu entry for this plugin.
-- The sub-menu starts with an item for configuring self.max_scan_length and
-- is followed by the items returned by self.deinflector:genMenuItems().
-- @treturn table menu item with `text` and `sub_item_table` fields
function Japanese:genMenuItem()
    -- Label of the scan length item, showing the current value.
    local function scanLengthLabel()
        local length = self.max_scan_length
        return T(N_("Text scan length: %1 character", "Text scan length: %1 characters", length), length)
    end

    -- Opens a spin widget for changing self.max_scan_length. The chosen value
    -- is saved to the reader settings and the menu is refreshed afterwards.
    local function showScanLengthDialog(touchmenu_instance)
        local SpinWidget = require("ui/widget/spinwidget")
        local Screen = require("device").screen

        local function applyScanLength(spin)
            self.max_scan_length = spin.value
            G_reader_settings:saveSetting("language_japanese_text_scan_length", self.max_scan_length)
            if touchmenu_instance then
                touchmenu_instance:updateItems()
            end
        end

        local spin_widget = SpinWidget:new{
            title_text = _("Text scan length"),
            info_text = T(_([[
The maximum number of characters to look ahead when trying to expand tap-and-hold word selection in documents.
Larger values allow longer phrases to be selected automatically, but with the trade-off that selections may become slower.

Default value: %1]]), DEFAULT_TEXT_SCAN_LENGTH),
            width = math.floor(Screen:getWidth() * 0.75),
            value = self.max_scan_length,
            value_min = 0,
            value_max = 1000,
            value_step = 1,
            value_hold_step = 10,
            ok_text = _("Set scan length"),
            default_value = DEFAULT_TEXT_SCAN_LENGTH,
            callback = applyScanLength,
        }
        UIManager:show(spin_widget)
    end

    -- self.max_scan_length configuration
    local scan_length_item = {
        text_func = scanLengthLabel,
        help_text = _("Number of characters to look ahead when trying to expand tap-and-hold word selection in documents."),
        keep_menu_open = true,
        callback = showScanLengthDialog,
    }

    local entries = { scan_length_item }
    -- self.deinflector configuration
    util.arrayAppend(entries, self.deinflector:genMenuItems())

    return {
        text = _("Japanese"),
        sub_item_table = entries,
    }
end

return Japanese