diff --git a/src/lib/egcpool.h b/src/lib/egcpool.h index 9b1ba613e..bfeaa40d1 100644 --- a/src/lib/egcpool.h +++ b/src/lib/egcpool.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "notcurses/notcurses.h" @@ -59,43 +60,42 @@ egcpool_grow(egcpool* pool, size_t len){ return 0; } -// Eat an EGC from the UTF-8 string input. This consists of extracting a -// multibyte via mbrtowc, then continuing to extract any which have zero -// width until hitting another spacing character or a NUL terminator. Writes -// the number of columns occupied to '*colcount'. Returns the number of bytes -// consumed, not including any NUL terminator. Note that neither the number -// of bytes nor columns is necessarily equivalent to the number of decoded code -// points. Such are the ways of Unicode. +// Eat an EGC from the UTF-8 string input, counting bytes and columns. We use +// libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of +// columns to '*colcount'. Returns the number of bytes consumed, not including +// any NUL terminator. Neither the number of bytes nor columns is necessarily +// equal to the number of decoded code points. Such are the ways of Unicode. +// uc_is_grapheme_break() wants UTF-32, which is fine, because we need wchar_t +// to use wcwidth() anyway FIXME except this doesn't work with 16-bit wchar_t! static inline int utf8_egc_len(const char* gcluster, int* colcount){ size_t ret = 0; *colcount = 0; - wchar_t wc; int r; mbstate_t mbt; memset(&mbt, 0, sizeof(mbt)); + wchar_t wc, prevw = 0; do{ r = mbrtowc(&wc, gcluster, MB_CUR_MAX, &mbt); - if(r < 0){ - return -1; - }else if(r){ + if(r > 0){ + if(prevw && uc_is_grapheme_break(prevw, wc)){ + break; // starts a new EGC, exit and do not claim + } int cols = wcwidth(wc); - if(cols){ - if(*colcount){ // this must be starting a new EGC, exit and do not claim - break; + if(cols < 0){ + if(iswspace(wc)){ // newline or tab + return ret + 1; } - if(cols < 0){ - if(iswspace(wc)){ // newline or tab - return ret + 1; - } - ret += r; - break; - } - *colcount += cols; + ret += r; + break; } + *colcount += cols; ret += r; gcluster += r; + }else if(r < 0){ + return -1; } + prevw = wc; }while(r); return ret; } diff --git a/tests/cell.cpp b/tests/cell.cpp index c17262d87..9f1af630c 100644 --- a/tests/cell.cpp +++ b/tests/cell.cpp @@ -14,7 +14,7 @@ TEST_CASE("Cell") { SUBCASE("Loadchar") { cell c = CELL_TRIVIAL_INITIALIZER; - REQUIRE(1 == cell_load(n_, &c, " ")); + CHECK(1 == cell_load(n_, &c, " ")); CHECK(cell_simple_p(&c)); cell_release(n_, &c); } @@ -35,7 +35,7 @@ TEST_CASE("Cell") { int dimy, dimx; notcurses_term_dim_yx(nc_, &dimy, &dimx); cell_styles_set(&c, NCSTYLE_ITALIC); - REQUIRE(1 == cell_load(n_, &c, "i")); + CHECK(1 == cell_load(n_, &c, "i")); cell_set_fg_rgb(&c, 255, 255, 255); ncplane_set_base_cell(n_, &c); cell_release(n_, &c); @@ -48,7 +48,7 @@ TEST_CASE("Cell") { int dimy, dimx; notcurses_term_dim_yx(nc_, &dimy, &dimx); cell_styles_set(&c, NCSTYLE_BOLD); - REQUIRE(1 == cell_load(n_, &c, "b")); + CHECK(1 == cell_load(n_, &c, "b")); cell_set_fg_rgb(&c, 255, 255, 255); ncplane_set_base_cell(n_, &c); cell_release(n_, &c); @@ -61,7 +61,7 @@ TEST_CASE("Cell") { int dimy, dimx; notcurses_term_dim_yx(nc_, &dimy, &dimx); cell_styles_set(&c, NCSTYLE_UNDERLINE); - REQUIRE(1 == cell_load(n_, &c, "u")); + CHECK(1 == cell_load(n_, &c, "u")); cell_set_fg_rgb(&c, 255, 255, 255); ncplane_set_base_cell(n_, &c); cell_release(n_, &c); @@ -69,18 +69,18 @@ TEST_CASE("Cell") { cell_styles_off(&c, NCSTYLE_UNDERLINE); } -/* SUBCASE("CellLoadTamil") { - const char zerodeg[] = "\u0bb8\u0bc0\u0bb0\u0bc7\u0bb3\u0b95\u0bbf\u0b95\u0bbf\u0bb0\u0bbf"; - cell c = CELL_TRIVIAL_INITIALIZER; - size_t ulen = cell_load(n_, &c, zerodeg); - // First have U+0BB8 TAMIL LETTER SA U+0BC0 TAMIL VOWEL SIGN II - // // e0 ae b8 e0 af 80 - REQUIRE(6 == ulen); - ulen = cell_load(n_, &c, zerodeg + ulen); - // U+0BB0 TAMIL LETTER RA U+0BCB TAMIL VOWEL SIGN OO - // e0 ae b0 e0 af 8b - REQUIRE(6 == ulen); - // FIXME + /*SUBCASE("CellLoadTamil") { + const char zerodeg[] = "\u0bb8\u0bc0\u0bb0\u0bc7\u0bb3\u0b95\u0bbf\u0b95\u0bbf\u0bb0\u0bbf"; + cell c = CELL_TRIVIAL_INITIALIZER; + size_t ulen = cell_load(n_, &c, zerodeg); + // First have U+0BB8 TAMIL LETTER SA U+0BC0 TAMIL VOWEL SIGN II + // // e0 ae b8 e0 af 80 + CHECK(6 == ulen); + ulen = cell_load(n_, &c, zerodeg + ulen); + // U+0BB0 TAMIL LETTER RA U+0BCB TAMIL VOWEL SIGN OO + // e0 ae b0 e0 af 8b + CHECK(6 == ulen); + // FIXME }*/ SUBCASE("CellSetFGAlpha"){