2019-11-24 17:36:46 +00:00
|
|
|
#ifndef NOTCURSES_EGCPOOL
|
|
|
|
#define NOTCURSES_EGCPOOL
|
|
|
|
|
2019-11-25 18:36:52 +00:00
|
|
|
#include <wchar.h>
|
2019-11-27 19:43:25 +00:00
|
|
|
#include <errno.h>
|
2020-01-30 13:55:01 +00:00
|
|
|
#include <stdio.h>
|
2020-07-15 22:37:54 +00:00
|
|
|
#include <wctype.h>
|
2019-11-24 17:36:46 +00:00
|
|
|
#include <stddef.h>
|
2019-11-27 19:43:25 +00:00
|
|
|
#include <assert.h>
|
2019-11-24 17:36:46 +00:00
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
2020-09-10 07:31:23 +00:00
|
|
|
#include <unigbrk.h>
|
2019-11-24 17:36:46 +00:00
|
|
|
#include <stdbool.h>
|
2020-02-18 17:36:16 +00:00
|
|
|
#include "notcurses/notcurses.h"
|
2021-07-22 19:36:59 +00:00
|
|
|
#include "compat/compat.h"
|
2021-07-04 14:25:20 +00:00
|
|
|
#include "logging.h"
|
2019-11-24 17:36:46 +00:00
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// An egcpool holds the EGCs which cannot be stored directly within a cell
// (see cell_simple_p() / cell_extended_p()). Such a cell instead records an
// offset into the pool, at which its NUL-terminated UTF-8 can be found. When
// the cell is released, the bytes it owned are zeroed out, which is how they
// are recognized as usable by another cell.
typedef struct egcpool {
  char* pool;     // backing storage for spilled EGCs, searched as a ring
  int poolsize;   // total size of 'pool', in bytes
  int poolused;   // bytes currently occupied, NUL terminators included
  int poolwrite;  // offset at which the next search for free space starts
} egcpool;
|
|
|
|
|
2019-11-27 06:02:05 +00:00
|
|
|
#define POOL_MINIMUM_ALLOC BUFSIZ
|
2021-03-12 05:59:29 +00:00
|
|
|
#define POOL_MAXIMUM_BYTES (1u << 24u) // max 16MiB
|
2019-11-27 06:02:05 +00:00
|
|
|
|
2019-11-24 17:36:46 +00:00
|
|
|
static inline void
|
|
|
|
egcpool_init(egcpool* p){
|
|
|
|
memset(p, 0, sizeof(*p));
|
|
|
|
}
|
|
|
|
|
2019-11-27 06:02:05 +00:00
|
|
|
static inline int
|
|
|
|
egcpool_grow(egcpool* pool, size_t len){
|
|
|
|
size_t newsize = pool->poolsize * 2;
|
|
|
|
if(newsize < POOL_MINIMUM_ALLOC){
|
|
|
|
newsize = POOL_MINIMUM_ALLOC;
|
|
|
|
}
|
|
|
|
while(len > newsize - pool->poolsize){ // ensure we make enough space
|
|
|
|
newsize *= 2;
|
|
|
|
}
|
2020-04-16 03:49:10 +00:00
|
|
|
if(newsize > POOL_MAXIMUM_BYTES){
|
2019-11-27 06:02:05 +00:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
// nasty cast here because c++ source might include this header :/
|
2019-12-03 18:31:04 +00:00
|
|
|
char* tmp = (char*)realloc(pool->pool, newsize);
|
2019-11-27 06:02:05 +00:00
|
|
|
if(tmp == NULL){
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
pool->pool = tmp;
|
|
|
|
memset(pool->pool + pool->poolsize, 0, newsize - pool->poolsize);
|
|
|
|
pool->poolsize = newsize;
|
|
|
|
return 0;
|
|
|
|
}
|
2019-11-24 17:36:46 +00:00
|
|
|
|
2021-07-04 13:11:25 +00:00
|
|
|
// Given the first byte of a UTF-8 encoded character, return the number of
// bytes the complete encoding is expected to occupy (1 through 4). A byte
// which cannot legally begin a sequence yields 1, so that callers can step
// over it. See Table 3.1B, Legal UTF8 Byte Sequences, Corrigendum #1: UTF-8
// Shortest Form.
static inline size_t
utf8_codepoint_length(unsigned char c){
  if(c < 0x80){ // 0x000000...0x00007f
    return 1;
  }
  if(c < 0xc2){ // continuation byte, or an always-overlong lead (0xc0, 0xc1)
    return 1;
  }
  if(c < 0xe0){ // 0x000080...0x0007ff
    return 2;
  }
  if(c < 0xf0){ // 0x000800...0x00ffff
    return 3;
  }
  if(c < 0xf5){ // 0x010000...0x10ffff
    return 4;
  }
  return 1;     // 0xf5 and above are never legal as a first byte
}
|
|
|
|
|
2020-09-10 07:31:23 +00:00
|
|
|
// Eat an EGC from the UTF-8 string input, counting bytes and columns. We use
// libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of
// columns to '*colcount'. Returns the number of bytes consumed, not including
// any NUL terminator, or -1 if the input cannot be decoded or contains a
// character for which wcwidth() reports a negative width (other than
// whitespace, e.g. newline or tab, for which one more byte is claimed).
// Neither the number of bytes nor columns is necessarily equal to the number
// of decoded code points. Such are the ways of Unicode.
// uc_is_grapheme_break() wants UTF-32, which is fine, because we need wchar_t
// to use wcwidth() anyway FIXME except this doesn't work with 16-bit wchar_t!
static inline int
utf8_egc_len(const char* gcluster, int* colcount){
  size_t ret = 0;
  *colcount = 0;
  size_t r; // mbrtowc() reports errors as (size_t)-1 / (size_t)-2
  mbstate_t mbt;
  memset(&mbt, 0, sizeof(mbt));
  wchar_t wc, prevw = 0;
  bool injoin = false; // was the previous character a zero-width joiner?
  do{
    r = mbrtowc(&wc, gcluster, MB_CUR_MAX, &mbt);
    if(r == (size_t)-1 || r == (size_t)-2){ // invalid or incomplete sequence
      // FIXME probably ought escape this somehow
      logerror("Invalid UTF8: %s\n", gcluster);
      return -1;
    }
    // whatever follows a ZWJ is kept in the cluster without consulting the
    // break rules; otherwise, a break between prevw and wc ends the EGC.
    if(prevw && !injoin && uc_is_grapheme_break(prevw, wc)){
      break; // starts a new EGC, exit and do not claim
    }
    int cols = wcwidth(wc);
    if(cols < 0){
      if(iswspace(wc)){ // newline or tab
        return ret + 1;
      }
      logerror("Prohibited or invalid Unicode: 0x%x\n", (unsigned)wc);
      return -1;
    }
    injoin = (wc == L'\u200d');
    *colcount += cols;
    ret += r;
    gcluster += r;
    prevw = wc;
  }while(r); // r == 0 means we decoded the NUL terminator
  return ret;
}
|
|
|
|
|
2019-11-24 20:26:01 +00:00
|
|
|
// if we're inserting a EGC of |len| bytes, ought we proactively realloc?
|
|
|
|
static inline bool
|
2019-11-26 02:11:27 +00:00
|
|
|
egcpool_alloc_justified(const egcpool* pool, int len){
|
|
|
|
const int poolfree = pool->poolsize - pool->poolused;
|
2019-11-24 20:26:01 +00:00
|
|
|
// proactively get more space if we have less than 10% free. this doesn't
|
|
|
|
// guarantee that we'll have enough space to insert the string -- we could
|
|
|
|
// theoretically have every 10th byte free, and be unable to write even a
|
|
|
|
// two-byte egc -- so we might have to allocate after an expensive search :/.
|
|
|
|
if(poolfree >= len && poolfree * 10 > pool->poolsize){
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-11-24 17:36:46 +00:00
|
|
|
// stash away the provided UTF8, NUL-terminated grapheme cluster. the cluster
|
|
|
|
// should not be less than 2 bytes (such a cluster should be directly stored in
|
2020-04-16 03:49:10 +00:00
|
|
|
// the cell). returns -1 on error, and otherwise a non-negative offset. 'ulen'
|
|
|
|
// must be the number of bytes to lift from egc (utf8_egc_len()).
|
2020-05-22 01:10:04 +00:00
|
|
|
__attribute__ ((nonnull (1, 2))) static inline int
|
2019-11-27 15:43:03 +00:00
|
|
|
egcpool_stash(egcpool* pool, const char* egc, size_t ulen){
|
|
|
|
int len = ulen + 1; // count the NUL terminator
|
2019-11-24 17:36:46 +00:00
|
|
|
if(len <= 2){ // should never be empty, nor a single byte + NUL
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
// the first time through, we don't force a grow unless we expect ourselves
|
|
|
|
// to have too little space. once we've done a search, we do force the grow.
|
|
|
|
// we should thus never have more than two iterations of this loop.
|
|
|
|
bool searched = false;
|
2019-11-24 20:26:01 +00:00
|
|
|
// we might have to realloc our underlying pool. it is possible that this EGC
|
|
|
|
// is actually *in* that pool, in which case our pointer will be invalidated.
|
|
|
|
// to be safe, duplicate prior to a realloc, and free along all paths.
|
|
|
|
char* duplicated = NULL;
|
2019-11-24 17:36:46 +00:00
|
|
|
do{
|
2019-11-24 20:26:01 +00:00
|
|
|
if(egcpool_alloc_justified(pool, len) || searched){
|
|
|
|
if(!duplicated){
|
2020-07-20 07:14:02 +00:00
|
|
|
if((duplicated = strndup(egc, ulen)) == NULL){
|
2020-06-03 08:44:13 +00:00
|
|
|
return -1;
|
|
|
|
}
|
2019-11-24 20:26:01 +00:00
|
|
|
}
|
2020-04-15 20:33:22 +00:00
|
|
|
if(egcpool_grow(pool, len) && searched){
|
2019-11-24 20:26:01 +00:00
|
|
|
free(duplicated);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
egc = duplicated;
|
2019-11-24 17:36:46 +00:00
|
|
|
}
|
|
|
|
// we now look for a place to lay out this egc. we need |len| zeroes in a
|
|
|
|
// row. starting at pool->poolwrite, look for such a range of unused
|
|
|
|
// memory. if we find it, write it out, and update used count. if we come
|
|
|
|
// back to where we started, force a growth and try again.
|
2019-11-26 02:11:27 +00:00
|
|
|
int curpos = pool->poolwrite;
|
2020-04-15 20:33:22 +00:00
|
|
|
//fprintf(stderr, "Stashing [%s] %d starting at %d\n", egc, len, curpos);
|
2019-11-24 17:36:46 +00:00
|
|
|
do{
|
|
|
|
if(curpos == pool->poolsize){
|
|
|
|
curpos = 0;
|
|
|
|
}
|
|
|
|
if(pool->pool[curpos]){ // can't write if there's stuff here
|
|
|
|
++curpos;
|
2019-12-01 07:21:13 +00:00
|
|
|
}else if(curpos && pool->pool[curpos - 1]){ // don't kill someone's NUL
|
|
|
|
++curpos;
|
2019-11-24 20:56:22 +00:00
|
|
|
}else if(pool->poolsize - curpos < len){ // can't wrap around
|
|
|
|
if(pool->poolwrite > curpos){
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
curpos = 0; // can this skip pool->poolwrite?
|
2019-11-24 17:36:46 +00:00
|
|
|
}else{ // promising! let's see if there's enough space
|
2019-11-26 02:11:27 +00:00
|
|
|
int need = len;
|
2019-11-24 17:36:46 +00:00
|
|
|
size_t trial = curpos;
|
|
|
|
while(--need){
|
2019-11-24 20:56:22 +00:00
|
|
|
if(pool->pool[++trial]){ // alas, not enough space here
|
2019-11-24 17:36:46 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if(need == 0){ // found a suitable space, copy it!
|
2019-11-24 20:56:22 +00:00
|
|
|
memcpy(pool->pool + curpos, egc, len - 1);
|
|
|
|
pool->pool[curpos + len - 1] = '\0';
|
2019-12-01 07:21:13 +00:00
|
|
|
pool->poolwrite = curpos + len;
|
2019-11-24 17:36:46 +00:00
|
|
|
pool->poolused += len;
|
2019-11-24 20:26:01 +00:00
|
|
|
free(duplicated);
|
2020-04-15 20:33:22 +00:00
|
|
|
//fprintf(stderr, "Stashing AT %d\n", curpos);
|
2019-11-24 17:36:46 +00:00
|
|
|
return curpos;
|
|
|
|
}
|
2019-11-24 20:56:22 +00:00
|
|
|
if(pool->poolwrite > curpos && pool->poolwrite - (len - need) < curpos){
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
curpos += len - need;
|
2019-11-24 17:36:46 +00:00
|
|
|
}
|
2019-11-26 02:11:27 +00:00
|
|
|
}while(curpos != pool->poolwrite);
|
2019-11-24 17:36:46 +00:00
|
|
|
}while( (searched = !searched) );
|
2019-11-24 20:26:01 +00:00
|
|
|
free(duplicated);
|
2019-12-01 07:21:13 +00:00
|
|
|
assert(false);
|
2019-11-24 17:36:46 +00:00
|
|
|
return -1; // should never get here
|
|
|
|
}
|
|
|
|
|
2019-11-27 19:43:25 +00:00
|
|
|
// Run a consistency check on the offset; ensure it's a valid, non-empty EGC.
|
|
|
|
static inline bool
|
|
|
|
egcpool_check_validity(const egcpool* pool, int offset){
|
2019-12-01 03:53:24 +00:00
|
|
|
if(offset >= pool->poolsize){
|
2019-12-01 07:21:13 +00:00
|
|
|
fprintf(stderr, "Offset 0x%06x greater than size (%d)\n", offset, pool->poolsize);
|
2019-12-01 03:53:24 +00:00
|
|
|
return false;
|
|
|
|
}
|
2019-11-27 19:43:25 +00:00
|
|
|
const char* egc = pool->pool + offset;
|
|
|
|
if(*egc == '\0'){
|
2019-12-01 07:21:13 +00:00
|
|
|
fprintf(stderr, "Bad offset 0x%06x: empty\n", offset);
|
2019-11-27 19:43:25 +00:00
|
|
|
return false;
|
|
|
|
}
|
2020-02-01 22:21:13 +00:00
|
|
|
mbstate_t mbstate;
|
|
|
|
memset(&mbstate, 0, sizeof(mbstate));
|
2019-11-27 19:43:25 +00:00
|
|
|
do{
|
|
|
|
wchar_t wcs;
|
2020-02-01 22:21:13 +00:00
|
|
|
int r = mbrtowc(&wcs, egc, strlen(egc), &mbstate);
|
2019-11-27 19:43:25 +00:00
|
|
|
if(r < 0){
|
2021-07-22 19:36:59 +00:00
|
|
|
fprintf(stderr, "Invalid UTF8 at offset 0x%06x [%s]\n", offset, strerror(errno));
|
2019-11-27 19:43:25 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
egc += r;
|
|
|
|
}while(*egc);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-11-24 17:36:46 +00:00
|
|
|
// remove the egc from the pool. start at offset, and zero out everything until
|
|
|
|
// we find a zero (our own NUL terminator). remove that number of bytes from
|
|
|
|
// the usedcount.
|
|
|
|
static inline void
|
2019-11-26 02:11:27 +00:00
|
|
|
egcpool_release(egcpool* pool, int offset){
|
2019-11-24 17:36:46 +00:00
|
|
|
size_t freed = 1; // account for free(d) NUL terminator
|
|
|
|
while(pool->pool[offset]){
|
|
|
|
pool->pool[offset] = '\0';
|
|
|
|
++freed;
|
2019-12-01 07:21:13 +00:00
|
|
|
++offset;
|
|
|
|
assert(offset < pool->poolsize);
|
2019-11-24 17:36:46 +00:00
|
|
|
}
|
|
|
|
pool->poolused -= freed;
|
|
|
|
// FIXME ought we update pool->poolwrite?
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
egcpool_dump(egcpool* pool){
|
|
|
|
free(pool->pool);
|
2019-11-26 00:23:54 +00:00
|
|
|
pool->pool = NULL;
|
2019-11-24 17:36:46 +00:00
|
|
|
pool->poolsize = 0;
|
|
|
|
pool->poolwrite = 0;
|
2019-11-26 00:23:54 +00:00
|
|
|
pool->poolused = 0;
|
2019-11-24 17:36:46 +00:00
|
|
|
}
|
|
|
|
|
2020-08-03 03:45:58 +00:00
|
|
|
// get the offset into the egcpool for this cell's EGC. returns meaningless and
|
|
|
|
// unsafe results if called on a simple cell.
|
|
|
|
static inline uint32_t
|
2020-12-13 05:19:24 +00:00
|
|
|
cell_egc_idx(const nccell* c){
|
2020-12-09 01:42:09 +00:00
|
|
|
return (htole(c->gcluster) & 0x00fffffflu);
|
2020-08-03 03:45:58 +00:00
|
|
|
}
|
|
|
|
|
2021-03-15 03:43:21 +00:00
|
|
|
// Is the cell a spilled (more than 4 byte) UTF8 EGC?
|
|
|
|
static inline bool
|
|
|
|
cell_extended_p(const nccell* c){
|
2021-06-29 12:56:26 +00:00
|
|
|
return (htole(c->gcluster) & 0xff000000ul) == 0x01000000ul;
|
2021-03-15 03:43:21 +00:00
|
|
|
}
|
|
|
|
|
2020-08-21 09:47:53 +00:00
|
|
|
// Is the cell simple (a UTF8-encoded EGC of four bytes or fewer)?
|
|
|
|
static inline bool
|
2020-12-13 05:19:24 +00:00
|
|
|
cell_simple_p(const nccell* c){
|
2021-05-14 05:39:04 +00:00
|
|
|
return !cell_extended_p(c);
|
2020-08-21 09:47:53 +00:00
|
|
|
}
|
|
|
|
|
2020-08-14 08:56:11 +00:00
|
|
|
// only applies to complex cells, do not use on simple cells
|
2020-05-22 01:10:04 +00:00
|
|
|
__attribute__ ((__returns_nonnull__)) static inline const char*
|
2020-12-13 05:19:24 +00:00
|
|
|
egcpool_extended_gcluster(const egcpool* pool, const nccell* c) {
|
2021-04-24 10:35:31 +00:00
|
|
|
assert(cell_extended_p(c));
|
2019-12-22 13:08:53 +00:00
|
|
|
uint32_t idx = cell_egc_idx(c);
|
|
|
|
return pool->pool + idx;
|
|
|
|
}
|
|
|
|
|
2020-01-15 17:22:10 +00:00
|
|
|
// Duplicate the contents of EGCpool 'src' onto another, wiping out any prior
|
|
|
|
// contents in 'dst'.
|
|
|
|
static inline int
|
|
|
|
egcpool_dup(egcpool* dst, const egcpool* src){
|
|
|
|
char* tmp;
|
|
|
|
if((tmp = (char*)realloc(dst->pool, src->poolsize)) == NULL){
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
dst->pool = tmp;
|
|
|
|
dst->poolsize = src->poolsize;
|
|
|
|
dst->poolused = src->poolused;
|
|
|
|
dst->poolwrite = src->poolwrite;
|
|
|
|
memcpy(dst->pool, src->pool, src->poolsize);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-11-24 17:36:46 +00:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif
|