utf8_codepoint_length: implement Corrigendum #1

Dmitry Sapozhnikov pointed out that I was admitting illegal first bytes; I was also just plain wrong. Bring this into sync with Corrigendum #1.
3 years ago · e915724b9f
parent 29f1acc479
commit e915724b9f
2 changed files with 10 additions and 5 deletions
--- a/src/info/main.c
+++ b/src/info/main.c
@@ -187,7 +187,7 @@ display_logo(const tinfo* ti, struct ncplane* n, const char* path){
  ncplane_yx(n, &y, NULL);
  struct ncvisual_options vopts = {
    .n = n,
-    .y = y + 8, // FIXME broken until #1649 is resolved
+    .y = y + 8,
    .x = 46,
    .blitter = NCBLIT_PIXEL,
    .flags = NCVISUAL_OPTION_CHILDPLANE,
--- a/src/lib/egcpool.h
+++ b/src/lib/egcpool.h
@@ -62,17 +62,22 @@ egcpool_grow(egcpool* pool, size_t len){
 }

 // get the expected length of the encoded codepoint from the first byte of a
-// utf-8 character.
+// utf-8 character. if the byte is illegal as a first byte, 1 is returned.
+// Table 3.1B, Legal UTF8 Byte Sequences, Corrigendum #1: UTF-8 Shortest Form
 static inline size_t
 utf8_codepoint_length(unsigned char c){
  if(c <= 0x7f){        // 0x000000...0x00007f
    return 1;
-  }else if(c <= 0xc0){  // 0x000080...0x0007ff
+  }else if(c <= 0xc1){  // 0x80...0xbf continuation, 0xc0/0xc1 overlong lead: illegal first byte
+    return 1;
+  }else if(c <= 0xdf){  // 0x000080...0x0007ff
    return 2;
-  }else if(c <= 0xe0){  // 0x000800...0x00ffff
+  }else if(c <= 0xef){  // 0x000800...0x00ffff
    return 3;
-  }else{ // c <= 0xf0, 0x100000...0x10ffff
+  }else if(c <= 0xf4){  // 0x010000...0x10ffff
    return 4;
+  }else{                // 0xf5...0xff: illegal first byte
+    return 1;
  }
 }