diff --git a/frontend/util.lua b/frontend/util.lua
index c95e293fe..c4e71131a 100644
--- a/frontend/util.lua
+++ b/frontend/util.lua
@@ -188,10 +188,48 @@ function util.splitToChars(text)
     local tab = {}
     if text ~= nil then
         local prevcharcode, charcode = 0
+        -- Supports WTF-8 : https://en.wikipedia.org/wiki/UTF-8#WTF-8
+        -- a superset of UTF-8, that includes UTF-16 surrogates
+        -- in UTF-8 bytes (forbidden in well-formed UTF-8).
+        -- We may get that from bad producers or converters.
+        -- (luajson, used to decode Wikipedia API json, will not correctly decode
+        -- this sample: <span lang=\"got\">\ud800\udf45</span> : single Unicode
+        -- char https://www.compart.com/en/unicode/U+10345 and will give us
+        -- "\xed\xa0\x80\xed\xbd\x85" as UTF8, instead of the correct "\xf0\x90\x8d\x85")
+        -- From http://www.unicode.org/faq/utf_bom.html#utf16-1
+        --   Surrogates are code points from two special ranges of
+        --   Unicode values, reserved for use as the leading, and
+        --   trailing values of paired code units in UTF-16. Leading,
+        --   also called high, surrogates are from D800 to DBFF, and
+        --   trailing, or low, surrogates are from DC00 to DFFF. They
+        --   are called surrogates, since they do not represent
+        --   characters directly, but only as a pair.
+        local hi_surrogate
+        local hi_surrogate_uchar
         for uchar in string.gmatch(text, "([%z\1-\127\194-\244][\128-\191]*)") do
             charcode = BaseUtil.utf8charcode(uchar)
+            -- (not sure why we need this prevcharcode check; we could get
+            -- charcode=nil with invalid UTF-8, but should we then really
+            -- ignore the following charcode ?)
             if prevcharcode then -- utf8
-                table.insert(tab, uchar)
+                if charcode and charcode >= 0xD800 and charcode <= 0xDBFF then
+                    if hi_surrogate then -- previous unconsumed one, add it even if invalid
+                        table.insert(tab, hi_surrogate_uchar)
+                    end
+                    hi_surrogate = charcode
+                    hi_surrogate_uchar = uchar -- will be added if not followed by low surrogate
+                elseif hi_surrogate and charcode and charcode >= 0xDC00 and charcode <= 0xDFFF then
+                    -- low surrogate following a high surrogate, good, let's make them a single char
+                    charcode = (hi_surrogate - 0xD800) * 0x400 + (charcode - 0xDC00) + 0x10000
+                    table.insert(tab, util.unicodeCodepointToUtf8(charcode))
+                    hi_surrogate = nil
+                else
+                    if hi_surrogate then -- previous unconsumed one, add it even if invalid
+                        table.insert(tab, hi_surrogate_uchar)
+                    end
+                    hi_surrogate = nil
+                    table.insert(tab, uchar)
+                end
             end
             prevcharcode = charcode
         end