utf8_egc_len: rewrite using uc_is_grapheme_break() #882

github-action
nick black 4 years ago committed by Nick Black
parent c54d24f7ca
commit 301354a9ba

@ -9,6 +9,7 @@
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <unigbrk.h>
#include <stdbool.h>
#include "notcurses/notcurses.h"
@ -59,43 +60,42 @@ egcpool_grow(egcpool* pool, size_t len){
return 0;
}
// Eat one EGC (extended grapheme cluster) from the NUL-terminated multibyte
// string 'gcluster', counting bytes and columns. Code points are decoded with
// mbrtowc() and segmented with libunistring's uc_is_grapheme_break(): the
// first code point reported as starting a new cluster is left unclaimed.
// Writes the number of columns (the sum of wcwidth() over the claimed code
// points) to '*colcount'. Returns the number of bytes consumed, not including
// any NUL terminator, or -1 if mbrtowc() cannot decode the input. Neither the
// number of bytes nor columns is necessarily equal to the number of decoded
// code points. Such are the ways of Unicode.
//
// A code point for which wcwidth() is negative (newline, tab, and other
// non-printables) ends the cluster: it is consumed whole and adds no columns.
//
// uc_is_grapheme_break() wants UTF-32, which is fine, because we need wchar_t
// to use wcwidth() anyway FIXME except this doesn't work with 16-bit wchar_t!
static inline int
utf8_egc_len(const char* gcluster, int* colcount){
  size_t ret = 0;
  *colcount = 0;
  mbstate_t mbt;
  memset(&mbt, 0, sizeof(mbt));
  wchar_t prevw = 0; // previous code point of this EGC; 0 until one is claimed
  for(;;){
    wchar_t wc;
    // mbrtowc() returns a size_t; errors are (size_t)-1 (invalid sequence) and
    // (size_t)-2 (incomplete sequence), so compare against them explicitly
    // rather than narrowing the result to int and testing its sign.
    const size_t r = mbrtowc(&wc, gcluster, MB_CUR_MAX, &mbt);
    if(r == (size_t)-1 || r == (size_t)-2){
      return -1;
    }
    if(r == 0){
      break; // hit the NUL terminator, which is never counted
    }
    if(prevw && uc_is_grapheme_break(prevw, wc)){
      break; // starts a new EGC, exit and do not claim
    }
    const int cols = wcwidth(wc);
    if(cols < 0){
      // Non-printable (newline, tab, other controls/separators). Claim all 'r'
      // bytes of it: claiming a single byte would split a multibyte code
      // point such as U+2028 in the middle of its encoding.
      return (int)(ret + r);
    }
    *colcount += cols;
    ret += r;
    gcluster += r;
    prevw = wc;
  }
  return (int)ret;
}

@ -14,7 +14,7 @@ TEST_CASE("Cell") {
// Loadchar: load a single space into a trivially-initialized cell, expect
// cell_load() to return 1, check cell_simple_p() on the result, then release
// the cell.
SUBCASE("Loadchar") {
cell c = CELL_TRIVIAL_INITIALIZER;
// NOTE(review): the same load is asserted twice (REQUIRE, then CHECK). This
// listing appears to show both sides of a REQUIRE -> CHECK change without
// +/- markers -- confirm which of the two lines belongs in the file.
REQUIRE(1 == cell_load(n_, &c, " "));
CHECK(1 == cell_load(n_, &c, " "));
CHECK(cell_simple_p(&c));
cell_release(n_, &c);
}
@ -35,7 +35,7 @@ TEST_CASE("Cell") {
int dimy, dimx;
notcurses_term_dim_yx(nc_, &dimy, &dimx);
cell_styles_set(&c, NCSTYLE_ITALIC);
REQUIRE(1 == cell_load(n_, &c, "i"));
CHECK(1 == cell_load(n_, &c, "i"));
cell_set_fg_rgb(&c, 255, 255, 255);
ncplane_set_base_cell(n_, &c);
cell_release(n_, &c);
@ -48,7 +48,7 @@ TEST_CASE("Cell") {
int dimy, dimx;
notcurses_term_dim_yx(nc_, &dimy, &dimx);
cell_styles_set(&c, NCSTYLE_BOLD);
REQUIRE(1 == cell_load(n_, &c, "b"));
CHECK(1 == cell_load(n_, &c, "b"));
cell_set_fg_rgb(&c, 255, 255, 255);
ncplane_set_base_cell(n_, &c);
cell_release(n_, &c);
@ -61,7 +61,7 @@ TEST_CASE("Cell") {
int dimy, dimx;
notcurses_term_dim_yx(nc_, &dimy, &dimx);
cell_styles_set(&c, NCSTYLE_UNDERLINE);
REQUIRE(1 == cell_load(n_, &c, "u"));
CHECK(1 == cell_load(n_, &c, "u"));
cell_set_fg_rgb(&c, 255, 255, 255);
ncplane_set_base_cell(n_, &c);
cell_release(n_, &c);
@ -69,18 +69,18 @@ TEST_CASE("Cell") {
cell_styles_off(&c, NCSTYLE_UNDERLINE);
}
/* SUBCASE("CellLoadTamil") {
const char zerodeg[] = "\u0bb8\u0bc0\u0bb0\u0bc7\u0bb3\u0b95\u0bbf\u0b95\u0bbf\u0bb0\u0bbf";
cell c = CELL_TRIVIAL_INITIALIZER;
size_t ulen = cell_load(n_, &c, zerodeg);
// First have U+0BB8 TAMIL LETTER SA U+0BC0 TAMIL VOWEL SIGN II
// // e0 ae b8 e0 af 80
REQUIRE(6 == ulen);
ulen = cell_load(n_, &c, zerodeg + ulen);
// U+0BB0 TAMIL LETTER RA U+0BC7 TAMIL VOWEL SIGN EE
// e0 ae b0 e0 af 87
REQUIRE(6 == ulen);
// FIXME
/*SUBCASE("CellLoadTamil") {
const char zerodeg[] = "\u0bb8\u0bc0\u0bb0\u0bc7\u0bb3\u0b95\u0bbf\u0b95\u0bbf\u0bb0\u0bbf";
cell c = CELL_TRIVIAL_INITIALIZER;
size_t ulen = cell_load(n_, &c, zerodeg);
// First have U+0BB8 TAMIL LETTER SA U+0BC0 TAMIL VOWEL SIGN II
// // e0 ae b8 e0 af 80
CHECK(6 == ulen);
ulen = cell_load(n_, &c, zerodeg + ulen);
// U+0BB0 TAMIL LETTER RA U+0BC7 TAMIL VOWEL SIGN EE
// e0 ae b0 e0 af 87
CHECK(6 == ulen);
// FIXME
}*/
SUBCASE("CellSetFGAlpha"){

Loading…
Cancel
Save