mirror of
https://github.com/koreader/koreader
synced 2024-11-13 19:11:25 +00:00
da65ac8b02
* Iterate over varargs directly via select if possible * Use table.pack otherwise (https://github.com/koreader/koreader-base/pull/1535). * This allows us to simplify a few Logger calls, as logger now handles nil values.
473 lines
19 KiB
Lua
473 lines
19 KiB
Lua
--- Yomichan deinflector implementation in pure Lua.
|
||
-- This is very heavily modelled after Yomichan's deinflection code, with some
|
||
-- minor changes to make it slightly more performant in the more restricted
|
||
-- environment KOReader tends to run in.
|
||
--
|
||
-- @module koplugin.japanese.deinflector
|
||
-- @alias Deinflector
|
||
|
||
-- Copyright (C) 2021 Aleksa Sarai <cyphar@cyphar.com>
|
||
-- Licensed under the GPLv3 or later.
|
||
--
|
||
-- This deinflection logic is heavily modelled after Yomichan
|
||
-- <https://github.com/FooSoft/yomichan>, up to and including the deinflection
|
||
-- table.
|
||
|
||
local InfoMessage = require("ui/widget/infomessage")
|
||
local JSON = require("rapidjson")
|
||
local UIManager = require("ui/uimanager")
|
||
local Utf8Proc = require("ffi/utf8proc")
|
||
local bit = require("bit")
|
||
local logger = require("logger")
|
||
local util = require("util")
|
||
local _ = require("gettext")
|
||
local N_ = _.ngettext
|
||
local T = require("ffi/util").template
|
||
|
||
-- Module table: the Deinflector methods are defined on it and it is returned at the end of this file.
local Deinflector = {}
|
||
|
||
-- Bitflags identifying the word classes a deinflection rule applies to or
-- produces. The flags are OR-ed together by toRuleTypes() and tested with
-- bit.band() when matching rules, so every class must own a distinct bit.
-- ("vz" used to be 0x0A, which is v5 | vk and therefore made zuru-verb rules
-- indistinguishable from godan and kuru ones.)
local RULE_TYPES = {
    ["v1"]    = 0x01, -- Verb ichidan (so-called ru-verb)
    ["v5"]    = 0x02, -- Verb godan (so-called u-verb)
    ["vs"]    = 0x04, -- Verb suru
    ["vk"]    = 0x08, -- Verb kuru
    ["vz"]    = 0x10, -- Verb zuru
    ["adj-i"] = 0x20, -- Adjectival verb (i-adjective)
    ["iru"]   = 0x40, -- Intermediate -iru endings for progressive or perfect tense
}
|
||
|
||
--- Fold a list of rule type names into one bitmask.
-- Names without an entry in RULE_TYPES contribute nothing.
-- @tparam string ... Rule type names (the keys of RULE_TYPES).
-- @treturn int Bitwise OR of the flags of all recognised names (0 if there are none).
local function toRuleTypes(...)
    local mask = 0
    local nargs = select("#", ...)
    for idx = 1, nargs do
        -- The extra parentheses keep only the idx-th argument.
        local flag = RULE_TYPES[(select(idx, ...))]
        if flag then
            mask = bit.bor(mask, flag)
        end
    end
    return mask
end
|
||
|
||
--- Return the directory holding the source file of the calling function.
-- The directory is derived from the chunk name debug.getinfo reports for the
-- caller, so it is only available when that chunk name starts with "@".
-- @treturn ?string The caller's source directory ("." when the chunk name has
-- no directory component), or nil when the caller's chunk name does not start
-- with "@".
local function getSourceDir()
    local info = debug.getinfo(2, "S")
    local callerSource = info and info.source
    if not callerSource or callerSource:sub(1, 1) ~= "@" then
        return nil
    end
    -- string.match yields exactly one value here, whereas string.gsub would
    -- also hand its substitution count to the caller, and would return the
    -- chunk name unchanged (leading "@" included) when it contains no "/".
    local dir = callerSource:match("^@(.*)/[^/]*$")
    return dir or "."
end
|
||
|
||
--- Load and decode a JSON file stored next to this source file.
-- Every failure is logged and yields an empty table, so callers can always
-- iterate over the result.
-- @tparam string filename File name relative to this file's directory.
-- @treturn table The decoded JSON document, or {} if it could not be loaded.
local function parsePluginJson(filename)
    local sourceDir = getSourceDir()
    if not sourceDir then
        logger.err("japanese.koplugin: failed to locate plugin directory for", filename)
        return {}
    end
    local jsonPath = sourceDir.."/"..filename
    local file, err = io.open(jsonPath, "r")
    if not file then
        logger.err("japanese.koplugin: failed to open plugin json", filename, err)
        return {}
    end
    local contents = file:read("*all")
    file:close()
    -- A decoder can fail either by raising (caught by pcall) or by returning
    -- nil plus a message, so both outcomes have to be treated as failures.
    local ok, parsed, decode_err = pcall(JSON.decode, contents)
    if ok and type(parsed) == "table" then
        return parsed
    end
    -- When pcall itself failed, the error message is in `parsed`.
    logger.err("japanese.koplugin: failed to parse plugin json", filename, ok and decode_err or parsed)
    return {}
end
|
||
|
||
--- A single deinflection result.
-- @field term Deinflected form of the term (string).
-- @field rules Rules bitmask the term has applied (int).
-- @field reasons Array of reasons applied to reach the term ({string,...}).
-- @table DeinflectResult

-- Assemble a DeinflectResult record from its three fields.
local function makeDeinflectionResult(term, rules, reasons)
    local result = {}
    result.term = term
    result.rules = rules
    result.reasons = reasons
    return result
end
|
||
|
||
--- Deinflect text exactly as given, without attempting any kana conversions
-- or other clean-ups first. Deinflector:deinflect() is more thorough and is
-- usually what you want.
--
-- @see deinflect
-- @tparam string text Japanese text to deinflect verbatim.
-- @treturn {DeinflectResult,...} An array of possible deinflections (including the text given).
function Deinflector:deinflectVerbatim(text)
    self:init() -- in case this is being called directly
    local found = { makeDeinflectionResult(text, 0, {}) }
    local visited = { [text] = true }
    -- `found` doubles as the work queue: entries appended below are picked
    -- up by later iterations, so deinflections get chained.
    local idx = 1
    while found[idx] do
        local entry = found[idx]
        for reason, variants in pairs(self.rules) do
            for _, variant in ipairs(variants) do
                local kana_in, kana_out = variant.kanaIn, variant.kanaOut
                local in_len, out_len = #kana_in, #kana_out
                -- The variant applies when the entry is still unconstrained
                -- (rules == 0) or shares a rule type with it, the term ends
                -- in the inflected suffix, and the rewrite is non-empty.
                local applicable = (entry.rules == 0 or bit.band(entry.rules, variant.rulesIn) ~= 0)
                    and entry.term:sub(-in_len) == kana_in
                    and #entry.term - in_len + out_len > 0
                if applicable then
                    local candidate = entry.term:sub(1, -in_len - 1) .. kana_out
                    -- A term that was already recorded was reached through
                    -- an earlier reason path, so it is not recorded again.
                    if not visited[candidate] then
                        local path = { reason }
                        for pos, prior in ipairs(entry.reasons) do
                            path[pos + 1] = prior
                        end
                        found[#found + 1] = makeDeinflectionResult(candidate, variant.rulesOut, path)
                        visited[candidate] = true
                    end
                end
            end
        end
        idx = idx + 1
    end
    return found
end
|
||
|
||
-- These are all in 五十音 order, but we list variants in their 五十音 order
|
||
-- before the base kana.
|
||
-- @todo Maybe add historic (ゐ, ゑ) or linguistic (う゚, か゚, さ゚, ら゚) kana too?
|
||
|
||
-- Full-width hiragana. Entry i is paired with entry i of the katakana tables
-- when the conversion maps are built, so the order must match theirs.
local FULLWIDTH_HIRAGANA = {
    -- Small kana
    "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", -- small vowels
    "ゕ", "ゖ",                   -- small ka / ke
    "っ",                         -- small tsu
    "ゃ", "ゅ", "ょ",             -- small ya / yu / yo
    "ゎ",                         -- small wa
    -- Kana carrying a dakuten or handakuten
    "が", "ぎ", "ぐ", "げ", "ご", -- g
    "ざ", "じ", "ず", "ぜ", "ぞ", -- z
    "だ", "ぢ", "づ", "で", "ど", -- d
    "ば", "び", "ぶ", "べ", "ぼ", -- b
    "ぱ", "ぴ", "ぷ", "ぺ", "ぽ", -- p
    "わ゙", "ゔ", "を゙",           -- v
    -- Plain gojuon
    "あ", "い", "う", "え", "お", -- vowels
    "か", "き", "く", "け", "こ", -- k
    "さ", "し", "す", "せ", "そ", -- s
    "た", "ち", "つ", "て", "と", -- t
    "な", "に", "ぬ", "ね", "の", -- n
    "は", "ひ", "ふ", "へ", "ほ", -- h
    "ま", "み", "む", "め", "も", -- m
    "や", "ゆ", "よ",             -- y
    "ら", "り", "る", "れ", "ろ", -- r
    "わ", "を",                   -- w
    -- Moraic nasal and long vowel mark
    "ん", "ー",
}
|
||
|
||
-- Full-width katakana. Entry i is paired with entry i of the hiragana and
-- half-width tables when the conversion maps are built, so the order must
-- match theirs.
local FULLWIDTH_KATAKANA = {
    -- Small kana
    "ァ", "ィ", "ゥ", "ェ", "ォ", -- small vowels
    "ヵ", "ヶ",                   -- small ka / ke
    "ッ",                         -- small tsu
    "ャ", "ュ", "ョ",             -- small ya / yu / yo
    "ヮ",                         -- small wa
    -- Kana carrying a dakuten or handakuten
    "ガ", "ギ", "グ", "ゲ", "ゴ", -- g
    "ザ", "ジ", "ズ", "ゼ", "ゾ", -- z
    "ダ", "ヂ", "ヅ", "デ", "ド", -- d
    "バ", "ビ", "ブ", "ベ", "ボ", -- b
    "パ", "ピ", "プ", "ペ", "ポ", -- p
    "ヷ", "ヴ", "ヺ",             -- v
    -- Plain gojuon
    "ア", "イ", "ウ", "エ", "オ", -- vowels
    "カ", "キ", "ク", "ケ", "コ", -- k
    "サ", "シ", "ス", "セ", "ソ", -- s
    "タ", "チ", "ツ", "テ", "ト", -- t
    "ナ", "ニ", "ヌ", "ネ", "ノ", -- n
    "ハ", "ヒ", "フ", "ヘ", "ホ", -- h
    "マ", "ミ", "ム", "メ", "モ", -- m
    "ヤ", "ユ", "ヨ",             -- y
    "ラ", "リ", "ル", "レ", "ロ", -- r
    "ワ", "ヲ",                   -- w
    -- Moraic nasal and long vowel mark
    "ン", "ー",
}
|
||
|
||
-- Half-width katakana (U+FF66..U+FF9F). Entry i is paired with entry i of
-- FULLWIDTH_KATAKANA when the conversion maps are built, so the order must
-- match. The entries have to be the actual half-width code points: with
-- full-width characters here the half-width to full-width map would be an
-- identity map. Voiced kana are two characters wide, the base kana followed
-- by the half-width mark (U+FF9E dakuten or U+FF9F handakuten).
local HALFWIDTH_KATAKANA = {
    -- Small kana
    "ｧ", "ｨ", "ｩ", "ｪ", "ｫ",
    "", "", -- no half-width small ka / ke
    "ｯ",
    "ｬ", "ｭ", "ｮ",
    "", -- no half-width small wa
    -- Kana carrying a dakuten or handakuten
    "ｶﾞ", "ｷﾞ", "ｸﾞ", "ｹﾞ", "ｺﾞ",
    "ｻﾞ", "ｼﾞ", "ｽﾞ", "ｾﾞ", "ｿﾞ",
    "ﾀﾞ", "ﾁﾞ", "ﾂﾞ", "ﾃﾞ", "ﾄﾞ",
    "ﾊﾞ", "ﾋﾞ", "ﾌﾞ", "ﾍﾞ", "ﾎﾞ",
    "ﾊﾟ", "ﾋﾟ", "ﾌﾟ", "ﾍﾟ", "ﾎﾟ",
    "ﾜﾞ", "ｳﾞ", "ｦﾞ",
    -- Plain gojuon
    "ｱ", "ｲ", "ｳ", "ｴ", "ｵ",
    "ｶ", "ｷ", "ｸ", "ｹ", "ｺ",
    "ｻ", "ｼ", "ｽ", "ｾ", "ｿ",
    "ﾀ", "ﾁ", "ﾂ", "ﾃ", "ﾄ",
    "ﾅ", "ﾆ", "ﾇ", "ﾈ", "ﾉ",
    "ﾊ", "ﾋ", "ﾌ", "ﾍ", "ﾎ",
    "ﾏ", "ﾐ", "ﾑ", "ﾒ", "ﾓ",
    "ﾔ", "ﾕ", "ﾖ",
    "ﾗ", "ﾘ", "ﾙ", "ﾚ", "ﾛ",
    "ﾜ", "ｦ",
    -- Moraic nasal and long vowel mark
    "ﾝ", "ｰ",
}
|
||
|
||
-- Put every kana table entry into NFC form.
for _, kana_table in ipairs({ HALFWIDTH_KATAKANA, FULLWIDTH_KATAKANA, FULLWIDTH_HIRAGANA }) do
    for idx = 1, #kana_table do
        kana_table[idx] = Utf8Proc.normalize_NFC(kana_table[idx])
    end
end
-- The tables are paired up by index below, so they must have equal lengths.
assert(#HALFWIDTH_KATAKANA == #FULLWIDTH_KATAKANA)
assert(#FULLWIDTH_KATAKANA == #FULLWIDTH_HIRAGANA)
-- Lookup tables keyed by the source kana, giving the converted kana.
local HALFWIDTH_TO_FULLWIDTH, KATAKANA_TO_HIRAGANA, HIRAGANA_TO_KATAKANA = {}, {}, {}
for idx, katakana in ipairs(FULLWIDTH_KATAKANA) do
    local hiragana = FULLWIDTH_HIRAGANA[idx]
    KATAKANA_TO_HIRAGANA[katakana] = hiragana
    HIRAGANA_TO_KATAKANA[hiragana] = katakana
    -- Kana without a half-width form are "" in HALFWIDTH_KATAKANA; the ""
    -- key this creates is never looked up during conversion.
    HALFWIDTH_TO_FULLWIDTH[HALFWIDTH_KATAKANA[idx]] = katakana
end
|
||
|
||
--- Create a text converter which replaces characters according to a map.
-- Characters missing from the map are copied through unchanged.
-- @tparam table map Lookup table from source kana to converted kana. Keys may
-- be a single character or a character followed by a voicing mark.
-- @treturn function A function taking a string and returning a one-element
-- array holding the converted string.
local function kana_mapper(map)
    -- Voicing marks that follow the kana they modify as a character of their
    -- own. They are written as UTF-8 byte escapes so that the exact code
    -- points cannot be altered by normalisation of this source file.
    local voicing_marks = {
        ["\239\190\158"] = true, -- U+FF9E half-width dakuten
        ["\239\190\159"] = true, -- U+FF9F half-width handakuten
        ["\227\130\153"] = true, -- U+3099 combining dakuten
        ["\227\130\154"] = true, -- U+309A combining handakuten
    }
    return function(text)
        local new_text = {}
        local last_char
        for c in text:gmatch(util.UTF8_CHAR_PATTERN) do
            -- If the previous character plus this mark has a mapping of its
            -- own, that mapping replaces what was emitted for the previous
            -- character. This is needed specifically for half-width kana,
            -- where voiced kana are always written as two characters.
            local combined = last_char and voicing_marks[c] and map[last_char .. c]
            if combined then
                new_text[#new_text] = combined
            else
                -- This also covers a mark with no combined mapping: it is
                -- kept in the output instead of being silently dropped.
                table.insert(new_text, map[c] or c)
            end
            last_char = c
        end
        return {table.concat(new_text, "")}
    end
end
|
||
|
||
-- Set of characters which collapse_emphatic treats as emphasis markers.
local EMPHATIC_SYMBOLS = {}
for _, symbol in ipairs({ "っ", "ッ", "ー", "〜" }) do
    EMPHATIC_SYMBOLS[symbol] = true
end
|
||
|
||
--- Produce two de-emphasised variants of a text.
-- @tparam string text Text which may contain runs of EMPHATIC_SYMBOLS.
-- @treturn {string,string} First the text with each run of one repeated
-- emphatic symbol reduced to a single symbol, then the text with all emphatic
-- symbols removed.
local function collapse_emphatic(text)
    local partial, complete = {}, {}
    local previous
    for char in text:gmatch(util.UTF8_CHAR_PATTERN) do
        if EMPHATIC_SYMBOLS[char] then
            -- Only the first symbol of a run survives the partial collapse.
            if previous ~= char then
                partial[#partial + 1] = char
            end
        else
            partial[#partial + 1] = char
            complete[#complete + 1] = char
        end
        previous = char
    end
    return { table.concat(partial), table.concat(complete) }
end
|
||
|
||
--- The set of defined map functions available to the deinflector.
-- Each entry has a `name` (the key used in the enabled-conversions settings),
-- a translated `pretty_name` and `help_text` for the menu, and a `func` which
-- takes a string and returns an array of converted strings. Conversions are
-- applied in the order they are listed here.
local ALL_TEXT_CONVERSIONS = {
    {
        name = "halfwidth_to_fullwidth",
        pretty_name = _("Halfwidth to fullwidth kana"),
        -- The example has to show genuinely half-width input, otherwise both
        -- sides of it are identical and it explains nothing.
        -- @translators If possible, keep the example Japanese text.
        help_text = _("Convert half-width katakana to full-width katakana (for instance, ｶﾀｶﾅ will be converted to カタカナ)."),
        func = kana_mapper(HALFWIDTH_TO_FULLWIDTH),
    },
    {
        name = "hiragana_to_katakana",
        pretty_name = _("Hiragana to katakana"),
        -- @translators If possible, keep the example Japanese text.
        help_text = _("Convert hiragana to katakana (for instance, ひらがな will be converted to ヒラガナ)."),
        func = kana_mapper(HIRAGANA_TO_KATAKANA),
    },
    {
        name = "katakana_to_hiragana",
        pretty_name = _("Katakana to hiragana"),
        -- @translators If possible, keep the example Japanese text.
        help_text = _("Convert katakana to hiragana (for instance, カタカナ will be converted to かたかな)."),
        func = kana_mapper(KATAKANA_TO_HIRAGANA),
    },
    {
        name = "collapse_emphatic",
        pretty_name = _("Collapse emphatic sequences"),
        -- @translators If possible, keep the example Japanese text.
        help_text = _("Collapse any character sequences which are sometimes used as emphasis in speech (for instance, すっっごーーい will be converted to both すっごーい and すごい)."),
        func = collapse_emphatic,
    },
}
|
||
|
||
--- Which ALL_TEXT_CONVERSIONS entries (keyed by their name) are switched on
-- when no other setting is available.
local DEFAULT_TEXT_CONVERSIONS = {
    halfwidth_to_fullwidth = true,
    hiragana_to_katakana = false,
    katakana_to_hiragana = true,
    collapse_emphatic = false,
}
|
||
|
||
--- Return the set of deinflections (and the reason path taken) for the
-- provided text. In addition to the verbatim text provided, several cleanups
-- will be attempted on the text (conversion from half-width kana, conversion
-- between katakana and hiragana, and collapsing of any emphatic sequences) and
-- any valid deinflections found will also be returned.
--
-- Only the conversions enabled in self.enabled_text_conversions are tried,
-- and every combination of them is attempted.
--
-- @tparam string text Japanese text to deinflect.
-- @treturn {DeinflectResult,...} An array of possible deinflections (including the text given).
function Deinflector:deinflect(text)
    self:init() -- in case this is being called directly
    -- Normalise the text to ensure that we handle full-width text that
    -- inexplicably uses combining dakuten/handakuten marks. Invalid UTF-8 is
    -- replaced with U+FFFD REPLACEMENT CHARACTER, written as a byte escape so
    -- that the exact character survives any re-encoding of this file.
    text = Utf8Proc.normalize_NFC(util.fixUtf8(text, "\239\191\189"))
    local seen = {}
    local all_results = {}
    -- Collect the enabled converters in ALL_TEXT_CONVERSIONS order. Walking
    -- that array rather than the settings table gives a stable order and
    -- ignores setting keys which do not name a known conversion (each of
    -- which would otherwise double the number of combinations tried).
    local enabled_converters = {}
    for _, converter in ipairs(ALL_TEXT_CONVERSIONS) do
        if self.enabled_text_conversions[converter.name] then
            table.insert(enabled_converters, converter)
        end
    end
    -- Iterate over the powerset of the enabled converters by looping over
    -- every possible bitmask, then applying the converters which have their
    -- corresponding bit set in the mask.
    local max_mapfn_bitmask = bit.lshift(1, #enabled_converters) - 1 -- (2^n - 1)
    for mapfn_bitmask = 0, max_mapfn_bitmask do
        -- A converter may return several variants, so a list of texts is
        -- threaded through the selected converters.
        local mapped_texts = {text}
        for i, converter in ipairs(enabled_converters) do
            local mapfn_bit = bit.lshift(1, i-1) -- the bit for this converter
            if bit.band(mapfn_bit, mapfn_bitmask) ~= 0 then
                local old_texts = mapped_texts
                mapped_texts = {}
                for _, old_text in ipairs(old_texts) do
                    util.arrayAppend(mapped_texts, converter.func(old_text))
                end
            end
        end
        for _, mapped_text in ipairs(mapped_texts) do
            -- Different combinations often produce the same text, which only
            -- needs to be deinflected once.
            if not seen[mapped_text] then
                if text ~= mapped_text then
                    logger.dbg("japanese.koplugin deinflector: trying converted variant", text, "->", mapped_text)
                end
                local results = self:deinflectVerbatim(mapped_text)
                if results then
                    util.arrayAppend(all_results, results)
                end
                seen[mapped_text] = true
            end
        end
    end
    return all_results
end
|
||
|
||
--- Build the menu entries used to toggle the individual text conversions.
-- Toggling an entry flips the conversion in self.enabled_text_conversions and
-- saves the whole table under "language_japanese_text_conversions".
-- @treturn table Array of menu item tables, one per ALL_TEXT_CONVERSIONS entry.
function Deinflector:genTextConversionMenuItems()
    local item_table = {}
    -- ipairs (unlike pairs) guarantees that the menu follows the order in
    -- which the conversions are listed in ALL_TEXT_CONVERSIONS.
    for _, conversion in ipairs(ALL_TEXT_CONVERSIONS) do
        local name = conversion.name
        table.insert(item_table, {
            text = conversion.pretty_name,
            help_text = conversion.help_text,
            checked_func = function()
                -- A conversion missing from the settings counts as disabled.
                return self.enabled_text_conversions[name] or false
            end,
            callback = function(touchmenu_instance)
                self.enabled_text_conversions[name] = not self.enabled_text_conversions[name]
                G_reader_settings:saveSetting("language_japanese_text_conversions", self.enabled_text_conversions)
                if touchmenu_instance then touchmenu_instance:updateItems() end
            end,
        })
    end
    return item_table
end
|
||
|
||
--- Build the deinflector's menu entries: a sub-menu for toggling the text
-- conversions and an entry reporting how many rules are loaded.
-- @treturn table Array of two menu item tables.
function Deinflector:genMenuItems()
    -- Title of the conversions sub-menu, reflecting how many are enabled.
    local function conversionsTitle()
        local enabled_count = 0
        for _, is_enabled in pairs(self.enabled_text_conversions) do
            if is_enabled then enabled_count = enabled_count + 1 end
        end
        if enabled_count ~= 0 then
            return T(N_("Text conversions: %1 enabled", "Text conversions: %1 enabled", enabled_count), enabled_count)
        end
        return _("Text conversions: none enabled")
    end

    -- Show a message with the number of loaded rules and rule variants.
    local function showDeinflectorInfo()
        local rule_count, variant_count = 0, 0
        for _, variants in pairs(self.rules) do
            rule_count = rule_count + 1
            variant_count = variant_count + #variants
        end
        local nrules_str = T(N_("%1 rule", "%1 rules", rule_count), rule_count)
        local nvariants_str = T(N_("%1 variant", "%1 variants", variant_count), variant_count)
        UIManager:show(InfoMessage:new{
            -- @translators %1 is the "%1 rule(s)" string, %2 is the "%1 variant(s)" string.
            text = T(_("Deinflector has %1 and %2 loaded."), nrules_str, nvariants_str),
        })
    end

    local conversions_item = {
        text_func = conversionsTitle,
        help_text = _([[
Configure which text conversions to apply when trying to deinflect Japanese text. These primarily include conversions between different kinds of kana, in order to make sure that a word written using different kana to your installed dictionaries can still be looked up.

Not every conversion will be applied at once. Instead, all possible combinations of enabled conversions will be attempted in order to maximise the chance of at least one conversion matching the form used in the dictionary.]]),
        sub_item_table = self:genTextConversionMenuItems(),
    }
    local info_item = {
        -- @translators A deinflector is a program which converts a word into its dictionary form, similar to deconjugation in European languages. See <https://en.wikipedia.org/wiki/Japanese_verb_conjugation> for more detail.
        text = _("Deinflector information"),
        keep_menu_open = true,
        callback = showDeinflectorInfo,
    }
    return { conversions_item, info_item }
end
|
||
|
||
--- Initialise a Deinflector instance with the set of rules defined in
-- yomichan-deinflect.json.
-- Calling this again is harmless: settings which are already present are
-- kept and the rules are only loaded once.
function Deinflector:init()
    if not self.enabled_text_conversions then
        local saved = G_reader_settings:readSetting("language_japanese_text_conversions")
        if saved then
            self.enabled_text_conversions = saved
        else
            -- Work on a copy: the menu callbacks modify this table in place,
            -- which must not alter the module-level defaults.
            local defaults = {}
            for name, enabled in pairs(DEFAULT_TEXT_CONVERSIONS) do
                defaults[name] = enabled
            end
            self.enabled_text_conversions = defaults
        end
    end
    if self.rules ~= nil then return end -- already loaded

    --- @todo Maybe make this location configurable or look in the user-controlled data directory too?
    local inflections = parsePluginJson("yomichan-deinflect.json")
    if type(inflections) ~= "table" then
        -- Nothing usable was loaded; carry on with an empty rule set rather
        -- than failing in the loop below.
        inflections = {}
    end

    -- Normalise the reasons and convert the rules to the rule_types bitflags.
    self.rules = {}
    local nrules, nvariants = 0, 0
    for reason, rules in pairs(inflections) do
        local variants = {}
        for i, variant in ipairs(rules) do
            variants[i] = {
                kanaIn = variant.kanaIn,
                kanaOut = variant.kanaOut,
                rulesIn = toRuleTypes(unpack(variant.rulesIn)),
                rulesOut = toRuleTypes(unpack(variant.rulesOut)),
            }
        end
        self.rules[reason] = variants
        nrules = nrules + 1
        nvariants = nvariants + #variants
    end
    logger.dbg("japanese.koplugin deinflector: loaded inflection table with", nrules, "rules and", nvariants, "variants")
end
|
||
|
||
--- Create a new Deinflector instance.
-- @tparam[opt] table o Table to turn into the instance; a fresh table is used if omitted.
-- @treturn Deinflector The initialised instance.
function Deinflector:new(o)
    local instance = setmetatable(o or {}, self)
    -- Fields missing from the instance are looked up on the receiver.
    self.__index = self
    instance:init()
    return instance
end
|
||
|
||
-- The Deinflector table is the value of this module.
return Deinflector
|