From acc66372b5ffdeda3878f36d28c79fe8d81103bf Mon Sep 17 00:00:00 2001
From: nick black <dankamongmen@gmail.com>
Date: Sun, 4 Jul 2021 09:11:25 -0400
Subject: [PATCH] add utf8_codepoint_length() #1871

---
 src/lib/egcpool.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/lib/egcpool.h b/src/lib/egcpool.h
index 0f86f53c8..2dccdf248 100644
--- a/src/lib/egcpool.h
+++ b/src/lib/egcpool.h
@@ -60,6 +60,21 @@ egcpool_grow(egcpool* pool, size_t len){
   return 0;
 }
 
+// Get the expected byte length of a UTF-8 encoded codepoint from its first
+// byte 'c'. No validation is done: bytes 0x80..0xbf (continuations, never a
+// valid first byte) yield 2, and bytes above 0xf4 (unused by UTF-8) yield 4.
+static inline size_t
+utf8_codepoint_length(unsigned char c){
+  if(c <= 0x7f){        // 0xxxxxxx: 0x000000...0x00007f
+    return 1;
+  }else if(c <= 0xdf){  // 110xxxxx: 0x000080...0x0007ff
+    return 2;
+  }else if(c <= 0xef){  // 1110xxxx: 0x000800...0x00ffff
+    return 3;
+  }
+  return 4;             // 11110xxx: 0x010000...0x10ffff
+}
+
 // Eat an EGC from the UTF-8 string input, counting bytes and columns. We use
 // libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of
 // columns to '*colcount'. Returns the number of bytes consumed, not including