diff --git a/frontend/util.lua b/frontend/util.lua index c95e293fe..c4e71131a 100644 --- a/frontend/util.lua +++ b/frontend/util.lua @@ -188,10 +188,48 @@ function util.splitToChars(text) local tab = {} if text ~= nil then local prevcharcode, charcode = 0 + -- Supports WTF-8 : https://en.wikipedia.org/wiki/UTF-8#WTF-8 + -- a superset of UTF-8, that includes UTF-16 surrogates + -- in UTF-8 bytes (forbidden in well-formed UTF-8). + -- We may get that from bad producers or converters. + -- (luajson, used to decode Wikipedia API json, will not correctly decode + -- this sample: \ud800\udf45 : single Unicode + -- char https://www.compart.com/en/unicode/U+10345 and will give us + -- "\xed\xa0\x80\xed\xbd\x85" as UTF8, instead of the correct "\xf0\x90\x8d\x85") + -- From http://www.unicode.org/faq/utf_bom.html#utf16-1 + -- Surrogates are code points from two special ranges of + -- Unicode values, reserved for use as the leading, and + -- trailing values of paired code units in UTF-16. Leading, + -- also called high, surrogates are from D800 to DBFF, and + -- trailing, or low, surrogates are from DC00 to DFFF. They + -- are called surrogates, since they do not represent + -- characters directly, but only as a pair. + local hi_surrogate + local hi_surrogate_uchar for uchar in string.gmatch(text, "([%z\1-\127\194-\244][\128-\191]*)") do charcode = BaseUtil.utf8charcode(uchar) + -- (not sure why we need this prevcharcode check; we could get + -- charcode=nil with invalid UTF-8, but should we then really + -- ignore the following charcode ?) if prevcharcode then -- utf8 - table.insert(tab, uchar) + if charcode and charcode >= 0xD800 and charcode <= 0xDBFF then + if hi_surrogate then -- previous unconsumed one, add it even if invalid + table.insert(tab, hi_surrogate_uchar) + end + hi_surrogate = charcode + hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate + elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then + -- low surrogate following a high surrogate, good, let's make them a single char + charcode = (hi_surrogate - 0xD800) * 0x400 + (charcode - 0xDC00) + 0x10000 + table.insert(tab, util.unicodeCodepointToUtf8(charcode)) + hi_surrogate = nil + else + if hi_surrogate then -- previous unconsumed one, add it even if invalid + table.insert(tab, hi_surrogate_uchar) + end + hi_surrogate = nil + table.insert(tab, uchar) + end end prevcharcode = charcode end