[ansi_scrubber] handle unicode in overstrike code

This commit is contained in:
Tim Stack 2022-08-18 16:09:45 -07:00
parent 9c8cc04a99
commit 5abd483029
7 changed files with 102 additions and 24 deletions

View File

@ -42,8 +42,8 @@
// Returns a reference to the function-local static regex used to find
// terminal control text.  Both pattern strings below share the same first
// alternative: ESC '[' followed by a captured run of digits, '=', ';' or '?'
// and then one captured ASCII letter.  The second alternative matches one or
// more <x> BACKSPACE <y> overstrike triples.
static const pcrepp&
ansi_regex()
{
// Byte-oriented overstrike: <x> and <y> are each any single character other
// than backspace (\x08).
static const pcrepp retval(
"\x1b\\[([\\d=;\\?]*)([a-zA-Z])|(?:[^\x08]\x08[^\x08])+");
// Unicode-aware overstrike: compiled with PCRE_UTF8, with \X on either side
// of the backspace so that multi-byte characters can be matched.
static const pcrepp retval("\x1b\\[([\\d=;\\?]*)([a-zA-Z])|(?:\\X\x08\\X)+",
PCRE_UTF8);
return retval;
}
@ -61,28 +61,35 @@ scrub_ansi_string(std::string& str, string_attrs_t* sa)
while (regex.match(context, pi, PCRE_NO_UTF8_CHECK)) {
auto* caps = context.all();
const auto sf = pi.get_string_fragment(caps);
auto bs_index_res = sf.codepoint_to_byte_index(1);
if (sf.length() >= 3 && sf[1] == '\b') {
if (sf.length() >= 3 && bs_index_res.isOk()
&& sf[bs_index_res.unwrap()] == '\b')
{
ssize_t fill_index = sf.sf_begin;
ssize_t erased_size = (sf.length() / 3) * 2;
ssize_t output_size = sf.length() / 3;
line_range bold_range;
line_range ul_range;
auto sub_sf = sf;
if (sa != nullptr) {
shift_string_attrs(
*sa, caps->c_begin + sf.length() / 3, -erased_size);
sa->emplace_back(line_range{last_origin_offset_end,
caps->c_begin + (int) output_size},
SA_ORIGIN_OFFSET.value(origin_offset));
}
for (ssize_t triple_index = 0; triple_index < output_size;
triple_index++)
{
char lhs = sf[triple_index * 3];
char rhs = sf[triple_index * 3 + 2];
while (!sub_sf.empty()) {
auto lhs_opt = sub_sf.consume_codepoint();
if (!lhs_opt) {
break;
}
auto lhs_pair = lhs_opt.value();
auto mid_opt = lhs_pair.second.consume_codepoint();
if (!mid_opt) {
break;
}
auto mid_pair = mid_opt.value();
auto rhs_opt = mid_pair.second.consume_codepoint();
if (!rhs_opt) {
break;
}
auto rhs_pair = rhs_opt.value();
sub_sf = rhs_pair.second;
if (lhs == '_' || rhs == '_') {
if (lhs_pair.first == '_' || rhs_pair.first == '_') {
if (sa != nullptr && bold_range.is_valid()) {
sa->emplace_back(bold_range,
VC_STYLE.value(text_attrs{A_BOLD}));
@ -94,7 +101,11 @@ scrub_ansi_string(std::string& str, string_attrs_t* sa)
ul_range.lr_start = fill_index;
ul_range.lr_end = fill_index + 1;
}
str[fill_index++] = lhs == '_' ? rhs : lhs;
auto cp = lhs_pair.first == '_' ? rhs_pair.first
: lhs_pair.first;
ww898::utf::utf8::write(cp, [&str, &fill_index](auto ch) {
str[fill_index++] = ch;
});
} else {
if (sa != nullptr && ul_range.is_valid()) {
sa->emplace_back(
@ -107,10 +118,26 @@ scrub_ansi_string(std::string& str, string_attrs_t* sa)
bold_range.lr_start = fill_index;
bold_range.lr_end = fill_index + 1;
}
str[fill_index++] = rhs;
ww898::utf::utf8::write(lhs_pair.first,
[&str, &fill_index](auto ch) {
str[fill_index++] = ch;
});
}
}
auto output_size = fill_index - sf.sf_begin;
auto erased_size = sf.length() - output_size;
if (sa != nullptr) {
#if 0
shift_string_attrs(
*sa, caps->c_begin + sf.length() / 3, -erased_size);
#endif
sa->emplace_back(line_range{last_origin_offset_end,
caps->c_begin + (int) output_size},
SA_ORIGIN_OFFSET.value(origin_offset));
}
if (sa != nullptr && ul_range.is_valid()) {
sa->emplace_back(ul_range,
VC_STYLE.value(text_attrs{A_UNDERLINE}));

View File

@ -132,6 +132,17 @@ struct string_fragment {
char front() const { return this->sf_string[this->sf_begin]; }
/**
 * Decode the UTF-8 code point at the front of this fragment.
 *
 * @return The decoded code point.  If the decoder throws
 * std::runtime_error, the first byte of the fragment is returned instead
 * as a value in the range 0-255.
 */
uint32_t front_codepoint() const
{
    const ssize_t avail = this->length();
    ssize_t index = 0;

    try {
        // Hand the decoder a NUL once the fragment is exhausted so that a
        // multi-byte sequence cut off at the end of the fragment never
        // causes a read past the last byte.
        // NOTE(review): this presumes the decoder rejects NUL as a
        // continuation byte and accepts it as a lone code point -- confirm
        // against the bundled ww898 implementation.
        return ww898::utf::utf8::read([this, &index, avail]() {
            return index < avail ? this->data()[index++] : '\0';
        });
    } catch (const std::runtime_error& e) {
        // Convert through unsigned char: where plain char is signed, a
        // byte >= 0x80 would otherwise be sign-extended into a huge
        // 0xFFFFFFxx value instead of the raw byte.
        return static_cast<unsigned char>(this->data()[0]);
    }
}
char back() const { return this->sf_string[this->sf_end - 1]; }
iterator begin() const { return &this->sf_string[this->sf_begin]; }
@ -140,6 +151,26 @@ struct string_fragment {
bool empty() const { return !this->is_valid() || length() == 0; }
/**
 * Convert an offset counted in code points into an offset counted in bytes
 * from the start of this fragment.
 *
 * @param cp_index The number of leading code points to step over.  Values
 * less than or equal to zero produce a byte offset of zero.
 * @return Ok with the byte offset, or Err if the fragment runs out of
 * bytes before cp_index code points have been stepped over.  A failure
 * reported by the character-size helper is handed to TRY; presumably that
 * macro returns the error to the caller.
 */
Result<ssize_t, const char*> codepoint_to_byte_index(ssize_t cp_index) const
{
    ssize_t byte_off = 0;

    for (ssize_t remaining = cp_index; remaining > 0; --remaining) {
        if (!(byte_off < this->length())) {
            return Err("index is beyond the end of the string");
        }

        // The size helper is given the lead byte together with the count
        // of bytes that follow it in this fragment.
        const char lead = this->data()[byte_off];
        const auto tail_len = this->length() - byte_off - 1;
        auto step = TRY(ww898::utf::utf8::char_size(
            [lead, tail_len]() { return std::make_pair(lead, tail_len); }));

        byte_off += step;
    }

    return Ok(byte_off);
}
char operator[](int index) const
{
return this->sf_string[sf_begin + index];
@ -276,6 +307,19 @@ struct string_fragment {
.find_right_boundary(0, predicate);
}
/**
 * Split the first code point off the front of this fragment.
 *
 * @return A pair holding the leading code point and the fragment that
 * follows it, or nullopt when the byte length of the leading code point
 * cannot be determined (for example, when this fragment is empty).
 */
nonstd::optional<std::pair<uint32_t, string_fragment>> consume_codepoint()
    const
{
    // Measure the leading code point before decoding it so that an empty
    // fragment is rejected without any of its bytes being read.
    auto index_res = this->codepoint_to_byte_index(1);
    if (index_res.isErr()) {
        return nonstd::nullopt;
    }

    auto cp = this->front_codepoint();

    return std::make_pair(cp, this->substr(index_res.unwrap()));
}
template<typename P>
nonstd::optional<string_fragment> consume(P predicate) const
{

View File

@ -184,6 +184,8 @@ EXPECTED_FILES = \
$(srcdir)/%reldir%/test_cmds.sh_d76d77ad95b9f120825417a6a8220c13df9541fc.out \
$(srcdir)/%reldir%/test_cmds.sh_d7eebacdcf2cb194f25fa4ef97b7b5376b442467.err \
$(srcdir)/%reldir%/test_cmds.sh_d7eebacdcf2cb194f25fa4ef97b7b5376b442467.out \
$(srcdir)/%reldir%/test_cmds.sh_d836c84398c831c976df46f46fe3bf5983c44c37.err \
$(srcdir)/%reldir%/test_cmds.sh_d836c84398c831c976df46f46fe3bf5983c44c37.out \
$(srcdir)/%reldir%/test_cmds.sh_d8eeef53a58bdeddbc1028d7c525413e3ca1c8df.err \
$(srcdir)/%reldir%/test_cmds.sh_d8eeef53a58bdeddbc1028d7c525413e3ca1c8df.out \
$(srcdir)/%reldir%/test_cmds.sh_dbdd62995fdefc8318053af05a32416eccfa79fc.err \

View File

@ -0,0 +1,2 @@
log_top_line() 
51

View File

@ -46,12 +46,15 @@ int
main(int argc, char* argv[])
{
{
std::string boldish = "h\bhe\bel\blo\bo _\ba_\bb_\bc a\b_ b";
std::string boldish
= "\u2022\b\u2022\u2023\b\u2023 h\bhe\bel\blo\bo _\ba_\bb_\bc a\b_ "
"b";
string_attrs_t sa;
sa.clear();
scrub_ansi_string(boldish, &sa);
assert(boldish == "helo abc a b");
printf("boldish %s\n", boldish.c_str());
assert(boldish == "\u2022\u2023 helo abc a b");
for (const auto& attr : sa) {
printf("attr %d:%d %s\n",
attr.sa_range.lr_start,

View File

@ -6,8 +6,8 @@ run_cap_test ${lnav_test} -n \
-c ":switch-to-view help" \
${test_dir}/logfile_access_log.0
run_cap_test ${lnav_test} -n \
-c ":goto 2011-11-02 17:19:39" \
run_cap_test env TZ=UTC ${lnav_test} -n \
-c ":goto 2011-11-03 00:19:39" \
-c ";SELECT log_top_line()" \
${test_dir}/logfile_bro_http.log.0