utf8_egc_len: rewrite using uc_is_grapheme_break() #882

github-action
Authored by nick black 4 years ago; committed by Nick Black
parent c54d24f7ca
commit 301354a9ba

@@ -9,6 +9,7 @@
#include <assert.h> #include <assert.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <unigbrk.h>
#include <stdbool.h> #include <stdbool.h>
#include "notcurses/notcurses.h" #include "notcurses/notcurses.h"
@@ -59,43 +60,42 @@ egcpool_grow(egcpool* pool, size_t len){
return 0; return 0;
} }
// Eat an EGC from the UTF-8 string input. This consists of extracting a // Eat an EGC from the UTF-8 string input, counting bytes and columns. We use
// multibyte via mbrtowc, then continuing to extract any which have zero // libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of
// width until hitting another spacing character or a NUL terminator. Writes // columns to '*colcount'. Returns the number of bytes consumed, not including
// the number of columns occupied to '*colcount'. Returns the number of bytes // any NUL terminator. Neither the number of bytes nor columns is necessarily
// consumed, not including any NUL terminator. Note that neither the number // equal to the number of decoded code points. Such are the ways of Unicode.
// of bytes nor columns is necessarily equivalent to the number of decoded code // uc_is_grapheme_break() wants UTF-32, which is fine, because we need wchar_t
// points. Such are the ways of Unicode. // to use wcwidth() anyway FIXME except this doesn't work with 16-bit wchar_t!
static inline int static inline int
utf8_egc_len(const char* gcluster, int* colcount){ utf8_egc_len(const char* gcluster, int* colcount){
size_t ret = 0; size_t ret = 0;
*colcount = 0; *colcount = 0;
wchar_t wc;
int r; int r;
mbstate_t mbt; mbstate_t mbt;
memset(&mbt, 0, sizeof(mbt)); memset(&mbt, 0, sizeof(mbt));
wchar_t wc, prevw = 0;
do{ do{
r = mbrtowc(&wc, gcluster, MB_CUR_MAX, &mbt); r = mbrtowc(&wc, gcluster, MB_CUR_MAX, &mbt);
if(r < 0){ if(r > 0){
return -1; if(prevw && uc_is_grapheme_break(prevw, wc)){
}else if(r){ break; // starts a new EGC, exit and do not claim
}
int cols = wcwidth(wc); int cols = wcwidth(wc);
if(cols){ if(cols < 0){
if(*colcount){ // this must be starting a new EGC, exit and do not claim if(iswspace(wc)){ // newline or tab
break; return ret + 1;
} }
if(cols < 0){ ret += r;
if(iswspace(wc)){ // newline or tab break;
return ret + 1;
}
ret += r;
break;
}
*colcount += cols;
} }
*colcount += cols;
ret += r; ret += r;
gcluster += r; gcluster += r;
}else if(r < 0){
return -1;
} }
prevw = wc;
}while(r); }while(r);
return ret; return ret;
} }

@@ -14,7 +14,7 @@ TEST_CASE("Cell") {
SUBCASE("Loadchar") { SUBCASE("Loadchar") {
cell c = CELL_TRIVIAL_INITIALIZER; cell c = CELL_TRIVIAL_INITIALIZER;
REQUIRE(1 == cell_load(n_, &c, " ")); CHECK(1 == cell_load(n_, &c, " "));
CHECK(cell_simple_p(&c)); CHECK(cell_simple_p(&c));
cell_release(n_, &c); cell_release(n_, &c);
} }
@@ -35,7 +35,7 @@ TEST_CASE("Cell") {
int dimy, dimx; int dimy, dimx;
notcurses_term_dim_yx(nc_, &dimy, &dimx); notcurses_term_dim_yx(nc_, &dimy, &dimx);
cell_styles_set(&c, NCSTYLE_ITALIC); cell_styles_set(&c, NCSTYLE_ITALIC);
REQUIRE(1 == cell_load(n_, &c, "i")); CHECK(1 == cell_load(n_, &c, "i"));
cell_set_fg_rgb(&c, 255, 255, 255); cell_set_fg_rgb(&c, 255, 255, 255);
ncplane_set_base_cell(n_, &c); ncplane_set_base_cell(n_, &c);
cell_release(n_, &c); cell_release(n_, &c);
@@ -48,7 +48,7 @@ TEST_CASE("Cell") {
int dimy, dimx; int dimy, dimx;
notcurses_term_dim_yx(nc_, &dimy, &dimx); notcurses_term_dim_yx(nc_, &dimy, &dimx);
cell_styles_set(&c, NCSTYLE_BOLD); cell_styles_set(&c, NCSTYLE_BOLD);
REQUIRE(1 == cell_load(n_, &c, "b")); CHECK(1 == cell_load(n_, &c, "b"));
cell_set_fg_rgb(&c, 255, 255, 255); cell_set_fg_rgb(&c, 255, 255, 255);
ncplane_set_base_cell(n_, &c); ncplane_set_base_cell(n_, &c);
cell_release(n_, &c); cell_release(n_, &c);
@@ -61,7 +61,7 @@ TEST_CASE("Cell") {
int dimy, dimx; int dimy, dimx;
notcurses_term_dim_yx(nc_, &dimy, &dimx); notcurses_term_dim_yx(nc_, &dimy, &dimx);
cell_styles_set(&c, NCSTYLE_UNDERLINE); cell_styles_set(&c, NCSTYLE_UNDERLINE);
REQUIRE(1 == cell_load(n_, &c, "u")); CHECK(1 == cell_load(n_, &c, "u"));
cell_set_fg_rgb(&c, 255, 255, 255); cell_set_fg_rgb(&c, 255, 255, 255);
ncplane_set_base_cell(n_, &c); ncplane_set_base_cell(n_, &c);
cell_release(n_, &c); cell_release(n_, &c);
@@ -69,18 +69,18 @@ TEST_CASE("Cell") {
cell_styles_off(&c, NCSTYLE_UNDERLINE); cell_styles_off(&c, NCSTYLE_UNDERLINE);
} }
/* SUBCASE("CellLoadTamil") { /*SUBCASE("CellLoadTamil") {
const char zerodeg[] = "\u0bb8\u0bc0\u0bb0\u0bc7\u0bb3\u0b95\u0bbf\u0b95\u0bbf\u0bb0\u0bbf"; const char zerodeg[] = "\u0bb8\u0bc0\u0bb0\u0bc7\u0bb3\u0b95\u0bbf\u0b95\u0bbf\u0bb0\u0bbf";
cell c = CELL_TRIVIAL_INITIALIZER; cell c = CELL_TRIVIAL_INITIALIZER;
size_t ulen = cell_load(n_, &c, zerodeg); size_t ulen = cell_load(n_, &c, zerodeg);
// First have U+0BB8 TAMIL LETTER SA U+0BC0 TAMIL VOWEL SIGN II // First have U+0BB8 TAMIL LETTER SA U+0BC0 TAMIL VOWEL SIGN II
// // e0 ae b8 e0 af 80 // // e0 ae b8 e0 af 80
REQUIRE(6 == ulen); CHECK(6 == ulen);
ulen = cell_load(n_, &c, zerodeg + ulen); ulen = cell_load(n_, &c, zerodeg + ulen);
// U+0BB0 TAMIL LETTER RA U+0BCB TAMIL VOWEL SIGN OO // U+0BB0 TAMIL LETTER RA U+0BCB TAMIL VOWEL SIGN OO
// e0 ae b0 e0 af 8b // e0 ae b0 e0 af 8b
REQUIRE(6 == ulen); CHECK(6 == ulen);
// FIXME // FIXME
}*/ }*/
SUBCASE("CellSetFGAlpha"){ SUBCASE("CellSetFGAlpha"){

Loading…
Cancel
Save