mirror of
https://github.com/koreader/koreader
synced 2024-11-10 01:10:34 +00:00
util.splitToChars(): supports text encoded in WTF-8
https://en.wikipedia.org/wiki/UTF-8#WTF-8 WTF-8 is a superset of UTF-8 that allows UTF-16 surrogates encoded as UTF-8 bytes (which is forbidden in well-formed UTF-8). We may get text containing these from bad producers or converters. We can get such chars in the text we get from the Wikipedia API once its (fully valid) JSON has been decoded by our lpeg-based JSON decoder (this is a defect in that decoder, and hard to fix). (Our other pure-Lua JSON decoder has no such problem and decodes them correctly.) We might also find such WTF-8 sequences in some dictionaries, so let's support them.
This commit is contained in:
parent
19280078de
commit
d98ea4e9ee
@ -188,10 +188,48 @@ function util.splitToChars(text)
|
|||||||
local tab = {}
|
local tab = {}
|
||||||
if text ~= nil then
|
if text ~= nil then
|
||||||
local prevcharcode, charcode = 0
|
local prevcharcode, charcode = 0
|
||||||
|
-- Supports WTF-8 : https://en.wikipedia.org/wiki/UTF-8#WTF-8
|
||||||
|
-- a superset of UTF-8, that includes UTF-16 surrogates
|
||||||
|
-- in UTF-8 bytes (forbidden in well-formed UTF-8).
|
||||||
|
-- We may get that from bad producers or converters.
|
||||||
|
-- (luajson, used to decode Wikipedia API json, will not correctly decode
|
||||||
|
-- this sample: <span lang=\"got\">\ud800\udf45</span> : single Unicode
|
||||||
|
-- char https://www.compart.com/en/unicode/U+10345 and will give us
|
||||||
|
-- "\xed\xa0\x80\xed\xbd\x85" as UTF8, instead of the correct "\xf0\x90\x8d\x85")
|
||||||
|
-- From http://www.unicode.org/faq/utf_bom.html#utf16-1
|
||||||
|
-- Surrogates are code points from two special ranges of
|
||||||
|
-- Unicode values, reserved for use as the leading, and
|
||||||
|
-- trailing values of paired code units in UTF-16. Leading,
|
||||||
|
-- also called high, surrogates are from D800 to DBFF, and
|
||||||
|
-- trailing, or low, surrogates are from DC00 to DFFF. They
|
||||||
|
-- are called surrogates, since they do not represent
|
||||||
|
-- characters directly, but only as a pair.
|
||||||
|
local hi_surrogate
|
||||||
|
local hi_surrogate_uchar
|
||||||
for uchar in string.gmatch(text, "([%z\1-\127\194-\244][\128-\191]*)") do
|
for uchar in string.gmatch(text, "([%z\1-\127\194-\244][\128-\191]*)") do
|
||||||
charcode = BaseUtil.utf8charcode(uchar)
|
charcode = BaseUtil.utf8charcode(uchar)
|
||||||
|
-- (not sure why we need this prevcharcode check; we could get
|
||||||
|
-- charcode=nil with invalid UTF-8, but should we then really
|
||||||
|
-- ignore the following charcode ?)
|
||||||
if prevcharcode then -- utf8
|
if prevcharcode then -- utf8
|
||||||
table.insert(tab, uchar)
|
if charcode and charcode >= 0xD800 and charcode <= 0xDBFF then
|
||||||
|
if hi_surrogate then -- previous unconsumed one, add it even if invalid
|
||||||
|
table.insert(tab, hi_surrogate_uchar)
|
||||||
|
end
|
||||||
|
hi_surrogate = charcode
|
||||||
|
hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate
|
||||||
|
elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then
|
||||||
|
-- low surrogate following a high surrogate, good, let's make them a single char
|
||||||
|
charcode = (hi_surrogate - 0xD800) * 0x400 + (charcode - 0xDC00) + 0x10000
|
||||||
|
table.insert(tab, util.unicodeCodepointToUtf8(charcode))
|
||||||
|
hi_surrogate = nil
|
||||||
|
else
|
||||||
|
if hi_surrogate then -- previous unconsumed one, add it even if invalid
|
||||||
|
table.insert(tab, hi_surrogate_uchar)
|
||||||
|
end
|
||||||
|
hi_surrogate = nil
|
||||||
|
table.insert(tab, uchar)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
prevcharcode = charcode
|
prevcharcode = charcode
|
||||||
end
|
end
|
||||||
|
Loading…
Reference in New Issue
Block a user