utf8_egc_len: rewrite using uc_is_grapheme_break() #882

4 years ago · 301354a9ba
parent c54d24f7ca
commit 301354a9ba
2 changed files with 38 additions and 38 deletions
--- a/src/lib/egcpool.h
+++ b/src/lib/egcpool.h
@ -9,6 +9,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unigbrk.h>
 #include <stdbool.h>
 #include "notcurses/notcurses.h"

@ -59,43 +60,42 @@ egcpool_grow(egcpool* pool, size_t len){
  return 0;
 }

-// Eat an EGC from the UTF-8 string input. This consists of extracting a
-// multibyte via mbrtowc, then continuing to extract any which have zero
-// width until hitting another spacing character or a NUL terminator. Writes
-// the number of columns occupied to '*colcount'. Returns the number of bytes
-// consumed, not including any NUL terminator. Note that neither the number
-// of bytes nor columns is necessarily equivalent to the number of decoded code
-// points. Such are the ways of Unicode.
+// Eat an EGC from the UTF-8 string input, counting bytes and columns. We use
+// libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of
+// columns to '*colcount'. Returns the number of bytes consumed, not including
+// any NUL terminator. Neither the number of bytes nor columns is necessarily
+// equal to the number of decoded code points. Such are the ways of Unicode.
+// uc_is_grapheme_break() wants UTF-32, which is fine, because we need wchar_t
+// to use wcwidth() anyway. FIXME: this doesn't work with 16-bit wchar_t!
 static inline int
 utf8_egc_len(const char* gcluster, int* colcount){
  size_t ret = 0;
  *colcount = 0;
-  wchar_t wc;
  int r;
  mbstate_t mbt;
  memset(&mbt, 0, sizeof(mbt));
+  wchar_t wc, prevw = 0; // prevw: code point decoded on the previous iteration; 0 means none yet
  do{
    r = mbrtowc(&wc, gcluster, MB_CUR_MAX, &mbt);
-    if(r < 0){
-      return -1;
-    }else if(r){
+    if(r > 0){ // mbrtowc() consumed r bytes for a non-NUL character
+      if(prevw && uc_is_grapheme_break(prevw, wc)){ // is there a boundary between the previous and current code point?
+        break; // starts a new EGC, exit and do not claim
+      }
      int cols = wcwidth(wc);
-      if(cols){
-        if(*colcount){ // this must be starting a new EGC, exit and do not claim
-          break;
+      if(cols < 0){ // wcwidth() gave no column width for wc
+        if(iswspace(wc)){ // newline or tab
+          return ret + 1; // report one byte beyond what has been claimed so far; adds no columns
        }
-        if(cols < 0){
-          if(iswspace(wc)){ // newline or tab
-            return ret + 1;
-          }
-          ret += r;
-          break;
-        }
-        *colcount += cols;
+        ret += r; // claim this character's bytes without adding columns, then stop
+        break;
      }
+      *colcount += cols; // every code point kept in this EGC contributes its width
      ret += r;
      gcluster += r;
+    }else if(r < 0){ // NOTE(review): relies on mbrtowc()'s size_t error returns becoming negative once stored in int r
+      return -1;
    }
+    prevw = wc; // remember this code point for the next break test
  }while(r);
  return ret;
 }
--- a/tests/cell.cpp
+++ b/tests/cell.cpp
@ -14,7 +14,7 @@ TEST_CASE("Cell") {

  SUBCASE("Loadchar") {
    cell c = CELL_TRIVIAL_INITIALIZER;
-    REQUIRE(1 == cell_load(n_, &c, " "));
+    CHECK(1 == cell_load(n_, &c, " ")); // expect cell_load() to return 1 for a lone space
    CHECK(cell_simple_p(&c));
    cell_release(n_, &c);
  }
@ -35,7 +35,7 @@ TEST_CASE("Cell") {
    int dimy, dimx;
    notcurses_term_dim_yx(nc_, &dimy, &dimx);
    cell_styles_set(&c, NCSTYLE_ITALIC);
-    REQUIRE(1 == cell_load(n_, &c, "i"));
+    CHECK(1 == cell_load(n_, &c, "i"));
    cell_set_fg_rgb(&c, 255, 255, 255);
    ncplane_set_base_cell(n_, &c);
    cell_release(n_, &c);
@ -48,7 +48,7 @@ TEST_CASE("Cell") {
    int dimy, dimx;
    notcurses_term_dim_yx(nc_, &dimy, &dimx);
    cell_styles_set(&c, NCSTYLE_BOLD);
-    REQUIRE(1 == cell_load(n_, &c, "b"));
+    CHECK(1 == cell_load(n_, &c, "b"));
    cell_set_fg_rgb(&c, 255, 255, 255);
    ncplane_set_base_cell(n_, &c);
    cell_release(n_, &c);
@ -61,7 +61,7 @@ TEST_CASE("Cell") {
    int dimy, dimx;
    notcurses_term_dim_yx(nc_, &dimy, &dimx);
    cell_styles_set(&c, NCSTYLE_UNDERLINE);
-    REQUIRE(1 == cell_load(n_, &c, "u"));
+    CHECK(1 == cell_load(n_, &c, "u"));
    cell_set_fg_rgb(&c, 255, 255, 255);
    ncplane_set_base_cell(n_, &c);
    cell_release(n_, &c);
@ -69,18 +69,18 @@ TEST_CASE("Cell") {
    cell_styles_off(&c, NCSTYLE_UNDERLINE);
  }

-/*  SUBCASE("CellLoadTamil") {
-  const char zerodeg[] = "\u0bb8\u0bc0\u0bb0\u0bc7\u0bb3\u0b95\u0bbf\u0b95\u0bbf\u0bb0\u0bbf";
-  cell c = CELL_TRIVIAL_INITIALIZER;
-  size_t ulen = cell_load(n_, &c, zerodeg);
-  // First have U+0BB8 TAMIL LETTER SA U+0BC0 TAMIL VOWEL SIGN II
-  // // e0 ae b8 e0 af 80
-  REQUIRE(6 == ulen);
-  ulen = cell_load(n_, &c, zerodeg + ulen);
-  // U+0BB0 TAMIL LETTER RA U+0BCB TAMIL VOWEL SIGN OO
-  // e0 ae b0 e0 af 8b
-  REQUIRE(6 == ulen);
-  // FIXME
+  /*SUBCASE("CellLoadTamil") {
+    const char zerodeg[] = "\u0bb8\u0bc0\u0bb0\u0bc7\u0bb3\u0b95\u0bbf\u0b95\u0bbf\u0bb0\u0bbf";
+    cell c = CELL_TRIVIAL_INITIALIZER;
+    size_t ulen = cell_load(n_, &c, zerodeg);
+    // First have U+0BB8 TAMIL LETTER SA U+0BC0 TAMIL VOWEL SIGN II
+    // e0 ae b8 e0 af 80
+    CHECK(6 == ulen);
+    ulen = cell_load(n_, &c, zerodeg + ulen);
+    // U+0BB0 TAMIL LETTER RA U+0BC7 TAMIL VOWEL SIGN EE
+    // e0 ae b0 e0 af 87
+    CHECK(6 == ulen);
+    // FIXME
  }*/

  SUBCASE("CellSetFGAlpha"){