From d9c1df737b8b55172607b76d2c724a0ce457aa08 Mon Sep 17 00:00:00 2001 From: Ben Wiederhake Date: Thu, 7 Mar 2024 22:01:49 +0100 Subject: [PATCH] Correctly split three-or-more byte sequences of UTF-8 --- bridge/helper/helper.go | 14 ++++++++++++-- bridge/helper/helper_test.go | 9 +++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/bridge/helper/helper.go b/bridge/helper/helper.go index 0208dff1..d6488af6 100644 --- a/bridge/helper/helper.go +++ b/bridge/helper/helper.go @@ -211,8 +211,18 @@ func ClipMessage(text string, length int, clippingMessage string) string { if len(text) > length { text = text[:length-len(clippingMessage)] - if r, size := utf8.DecodeLastRuneInString(text); r == utf8.RuneError { - text = text[:len(text)-size] + for len(text) > 0 { + if r, _ := utf8.DecodeLastRuneInString(text); r == utf8.RuneError { + text = text[:len(text)-1] + // Note: for invalid UTF-8, DecodeLastRuneInString always reports a size of + // 1, however many bytes of the cut-off sequence remain. After dropping one + // byte we do not yet know whether the text now ends in a valid rune. + // Example: "€" is 0xE2 0x82 0xAC. If we happen to split the string just + // before 0xAC and go back only one byte, the string ends in the byte 0xE2, + // which is not a valid rune, so we need to try again. 
+ } else { + break + } } text += clippingMessage } diff --git a/bridge/helper/helper_test.go b/bridge/helper/helper_test.go index 76e548e4..f21a4bda 100644 --- a/bridge/helper/helper_test.go +++ b/bridge/helper/helper_test.go @@ -88,6 +88,15 @@ var lineSplittingTestCases = map[string]struct { }, nonSplitOutput: []string{"不布人個我此而及單石業喜資富下我河下日沒一我臺空達的常景便物沒為……子大我別名解成?生賣的全直黑,我自我結毛分洲了世當,是政福那是東;斯說"}, }, + "Long message, clip three-byte rune after two bytes": { + input: "x 人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。", + splitOutput: []string{ + "x 人人生而自由,在尊嚴和權利上 ", + "一律平等。 他們都具有理性和良知 ", + ",應該以兄弟情誼的精神對待彼此。", + }, + nonSplitOutput: []string{"x 人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。"}, + }, } func TestGetSubLines(t *testing.T) {