diff --git a/frontend/util.lua b/frontend/util.lua
index 80daa2929..c298a4f70 100644
--- a/frontend/util.lua
+++ b/frontend/util.lua
@@ -521,11 +521,16 @@ function util.lastIndexOf(string, ch)
     if i == nil then return -1 else return i - 1 end
 end
 
+--- Pattern which matches a single well-formed UTF-8 character, including
+--- theoretical >4-byte extensions.
+-- Taken from <https://www.lua.org/manual/5.4/manual.html#pdf-utf8.charpattern>
+util.UTF8_CHAR_PATTERN = '[%z\1-\127\194-\253][\128-\191]*'
+
 --- Reverse the individual greater-than-single-byte characters
 -- @string string to reverse
 -- Taken from <https://github.com/blitmap/lua-utf8-simple#utf8reverses>
 function util.utf8Reverse(text)
-    text = text:gsub('[%z\1-\127\194-\244][\128-\191]*', function (c) return #c > 1 and c:reverse() end)
+    text = text:gsub(util.UTF8_CHAR_PATTERN, function (c) return #c > 1 and c:reverse() end)
     return text:reverse()
 end
 
@@ -554,7 +559,7 @@ function util.splitToChars(text)
         --   characters directly, but only as a pair.
         local hi_surrogate
         local hi_surrogate_uchar
-        for uchar in string.gmatch(text, "([%z\1-\127\194-\244][\128-\191]*)") do
+        for uchar in text:gmatch(util.UTF8_CHAR_PATTERN) do
             charcode = BaseUtil.utf8charcode(uchar)
             -- (not sure why we need this prevcharcode check; we could get
             -- charcode=nil with invalid UTF-8, but should we then really
@@ -589,14 +594,47 @@ end
 ---- @string c
 ---- @treturn boolean true if CJK
 function util.isCJKChar(c)
-    return string.match(c, "[\228-\234][\128-\191].") == c
+    -- Smallest CJK codepoint is 0x1100 which requires at least 3 utf8 bytes to
+    -- encode (U+07FF is the largest codepoint that can be represented in 2
+    -- bytes with utf8). So if the character is shorter than 3 bytes it's
+    -- definitely not CJK and no need to decode it.
+    if #c < 3 then
+        return false
+    end
+    code = BaseUtil.utf8charcode(c)
+    -- The weird bracketing is intentional -- we use the lowest possible
+    -- codepoint as a shortcut so if the codepoint is below U+1100 we
+    -- immediately return false.
+    return -- BMP (Plane 0)
+            code >=  0x1100 and (code <=  0x11FF  or -- Hangul Jamo
+           (code >=  0x2E80 and  code <=  0x9FFF) or -- Numerous CJK Blocks (NB: has some gaps)
+           (code >=  0xA960 and  code <=  0xA97F) or -- Hangul Jamo Extended-A
+           (code >=  0xAC00 and  code <=  0xD7AF) or -- Hangul Syllables
+           (code >=  0xD7B0 and  code <=  0xD7FF) or -- Hangul Jame Extended-B
+           (code >=  0xF900 and  code <=  0xFAFF) or -- CJK Compatibility Ideographs
+           (code >=  0xFE30 and  code <=  0xFE4F) or -- CJK Compatibility Forms
+           (code >=  0xFF00 and  code <=  0xFFEF) or -- Halfwidth and Fullwidth Forms
+           -- SIP (Plane 2)
+           (code >= 0x20000 and  code <= 0x2A6DF) or -- CJK Unified Ideographs Extension B
+           (code >= 0x2A700 and  code <= 0x2B73F) or -- CJK Unified Ideographs Extension C
+           (code >= 0x2B740 and  code <= 0x2B81F) or -- CJK Unified Ideographs Extension D
+           (code >= 0x2B820 and  code <= 0x2CEAF) or -- CJK Unified Ideographs Extension E
+           (code >= 0x2CEB0 and  code <= 0x2EBEF) or -- CJK Unified Ideographs Extension F
+           (code >= 0x2F800 and  code <= 0x2FA1F) or -- CJK Compatibility Ideographs Supplement
+           -- TIP (Plane 3)
+           (code >= 0x30000 and  code <= 0x3134F))   -- CJK Unified Ideographs Extension G
 end
 
 --- Tests whether str contains CJK characters
 ---- @string str
 ---- @treturn boolean true if CJK
 function util.hasCJKChar(str)
-    return string.match(str, "[\228-\234][\128-\191].") ~= nil
+    for c in str:gmatch(util.UTF8_CHAR_PATTERN) do
+        if util.isCJKChar(c) then
+            return true
+        end
+    end
+    return false
 end
 
 --- Split texts into a list of words, spaces and punctuation marks.
@@ -607,8 +645,10 @@ function util.splitToWords(text)
     for word in util.gsplit(text, "[%s%p]+", true) do
         -- if space split word contains CJK characters
         if util.hasCJKChar(word) then
-            -- split with CJK characters
-            for char in util.gsplit(word, "[\228-\234\192-\255][\128-\191]+", true) do
+            -- split all non-ASCII characters separately (FIXME ideally we
+            -- would split only the CJK characters, but you cannot define CJK
+            -- characters trivially with a byte-only Lua pattern).
+            for char in util.gsplit(word, "[\192-\255][\128-\191]+", true) do
                 table.insert(wlist, char)
             end
         else
diff --git a/spec/unit/util_spec.lua b/spec/unit/util_spec.lua
index e2477ae95..34e38d96e 100644
--- a/spec/unit/util_spec.lua
+++ b/spec/unit/util_spec.lua
@@ -88,6 +88,23 @@ describe("util module", function()
                 "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。",
             }, words)
         end)
+        it("should split Japanese words", function()
+            local words = util.splitToWords("色は匂へど散りぬるを我が世誰ぞ常ならむ")
+            assert.are_same({
+                "色","は","匂","へ","ど","散","り","ぬ","る","を",
+                "我","が","世","誰","ぞ","常","な","ら","む",
+            }, words)
+        end)
+        it("should split Korean words", function()
+			-- Technically splitting on spaces is correct but we treat Korean
+			-- as if it were any other CJK text.
+            local words = util.splitToWords("대한민국의 국기는 대한민국 국기법에 따라 태극기")
+            assert.are_same({
+                "대","한","민","국","의"," ","국","기","는"," ",
+                "대","한","민","국"," ","국","기","법","에"," ",
+                "따","라"," ","태","극","기",
+            }, words)
+        end)
         it("should split words of multilingual text", function()
             local words = util.splitToWords("BBC纪录片")
             assert.are_same({"BBC", "纪", "录", "片"}, words)
@@ -108,7 +125,7 @@ describe("util module", function()
                     table.insert(table_of_words, word)
                     word = ""
                 end
-                if i == #table_chars then table.insert(table_of_words, word) end
+                if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end
             end
             assert.are_same({
                 "Pójdźże, ",
@@ -121,7 +138,7 @@ describe("util module", function()
                 "gavilán",
             }, table_of_words)
         end)
-        it("should split text to line - CJK", function()
+        it("should split text to line - CJK Chinese", function()
             local text = "彩虹是通过太阳光的折射引起的。"
             local word = ""
             local table_of_words = {}
@@ -134,12 +151,76 @@ describe("util module", function()
                     table.insert(table_of_words, word)
                     word = ""
                 end
-                if i == #table_chars then table.insert(table_of_words, word) end
+                if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end
             end
             assert.are_same({
                 "彩","虹","是","通","过","太","阳","光","的","折","射","引","起","的","。",
             }, table_of_words)
         end)
+        it("should split text to line - CJK Japanese", function()
+            local text = "色は匂へど散りぬるを我が世誰ぞ常ならむ"
+            local word = ""
+            local table_of_words = {}
+            local c
+            local table_chars = util.splitToChars(text)
+            for i = 1, #table_chars  do
+                c = table_chars[i]
+                word = word .. c
+                if util.isSplittable(c) then
+                    table.insert(table_of_words, word)
+                    word = ""
+                end
+                if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end
+            end
+            assert.are_same({
+                "色","は","匂","へ","ど","散","り","ぬ","る","を",
+                "我","が","世","誰","ぞ","常","な","ら","む",
+            }, table_of_words)
+        end)
+        it("should split text to line - CJK Korean", function()
+            local text = "대한민국의 국기는 대한민국 국기법에 따라 태극기"
+            local word = ""
+            local table_of_words = {}
+            local c
+            local table_chars = util.splitToChars(text)
+            for i = 1, #table_chars  do
+                c = table_chars[i]
+                word = word .. c
+                if util.isSplittable(c) then
+                    table.insert(table_of_words, word)
+                    word = ""
+                end
+                if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end
+            end
+            assert.are_same({
+                "대","한","민","국","의"," ","국","기","는"," ",
+                "대","한","민","국"," ","국","기","법","에"," ",
+                "따","라"," ","태","극","기",
+            }, table_of_words)
+        end)
+        it("should split text to line - mixed CJK and latin", function()
+            local text = "This is Russian: русский язык, Chinese: 汉语, Japanese: 日本語、 Korean: 한국어。"
+            local word = ""
+            local table_of_words = {}
+            local c
+            local table_chars = util.splitToChars(text)
+            for i = 1, #table_chars  do
+                c = table_chars[i]
+                word = word .. c
+                if util.isSplittable(c) then
+                    table.insert(table_of_words, word)
+                    word = ""
+                end
+                if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end
+            end
+            assert.are_same({
+                "This ", "is ",
+                "Russian: ", "русский ", "язык, ",
+                "Chinese: ", "汉","语",", ",
+                "Japanese: ", "日","本","語","、", " ",
+                "Korean: ", "한","국","어","。",
+            }, table_of_words)
+        end)
         it("should split text to line with next_c - unicode", function()
             local text = "Ce test : 1) est très simple ; 2 ) simple comme ( 2/2 ) > 50 % ? ok."
             local word = ""
@@ -154,7 +235,7 @@ describe("util module", function()
                     table.insert(table_of_words, word)
                     word = ""
                 end
-                if i == #table_chars then table.insert(table_of_words, word) end
+                if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end
             end
             assert.are_same({
                 "Ce ",
@@ -187,7 +268,7 @@ describe("util module", function()
                     table.insert(table_of_words, word)
                     word = ""
                 end
-                if i == #table_chars then table.insert(table_of_words, word) end
+                if i == #table_chars and word ~= "" then table.insert(table_of_words, word) end
             end
             assert.are_same({
                 "Ce ",