add utf8_codepoint_length() #1871

This commit is contained in:
nick black 2021-07-04 09:11:25 -04:00
parent 70b8bba715
commit acc66372b5
No known key found for this signature in database
GPG Key ID: 5F43400C21CBFACC

View File

@ -60,6 +60,21 @@ egcpool_grow(egcpool* pool, size_t len){
return 0; return 0;
} }
// Get the expected length, in bytes, of a UTF-8 encoded codepoint from its
// first (lead) byte 'c'. The lead byte's high bits select the length:
//   0xxxxxxx (0x00..0x7f) -> 1 byte,  U+0000...U+007F
//   110xxxxx (0xc0..0xdf) -> 2 bytes, U+0080...U+07FF
//   1110xxxx (0xe0..0xef) -> 3 bytes, U+0800...U+FFFF
//   11110xxx (0xf0..0xf7) -> 4 bytes, U+10000...U+10FFFF
// No validation is performed: a continuation byte (0x80..0xbf) falls into the
// two-byte case, and any byte of 0xf0 or above into the four-byte case.
static inline size_t
utf8_codepoint_length(unsigned char c){
  if(c <= 0x7f){        // ASCII, single byte
    return 1;
  }else if(c <= 0xdf){  // upper bound of the 110xxxxx lead bytes
    return 2;
  }else if(c <= 0xef){  // upper bound of the 1110xxxx lead bytes
    return 3;
  }else{                // 11110xxx lead bytes (and anything above)
    return 4;
  }
}
// Eat an EGC from the UTF-8 string input, counting bytes and columns. We use // Eat an EGC from the UTF-8 string input, counting bytes and columns. We use
// libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of // libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of
// columns to '*colcount'. Returns the number of bytes consumed, not including // columns to '*colcount'. Returns the number of bytes consumed, not including