@ -192,19 +192,35 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida
while ( str < = last & & * str ! = ' \0 ' ) {
size_t len = Utf8EncodedCharLen ( * str ) ;
/* If the character is unknown, i.e. encoded length is 0
* we assume worst case for the length check .
* The length check is needed to prevent Utf8Decode to read
* over the terminating ' \0 ' if that happens to be placed
* within the encoding of an UTF8 character . */
if ( ( len = = 0 & & str + 4 > last ) | | str + len > last ) break ;
WChar c ;
len = Utf8Decode ( & c , str ) ;
/* It's possible to encode the string termination character
* into a multiple bytes . This prevents those termination
* characters to be skipped */
if ( c = = ' \0 ' ) break ;
/* If the first byte does not look like the first byte of an encoded
* character , i . e . encoded length is 0 , then this byte is definitely bad
* and it should be skipped .
* When the first byte looks like the first byte of an encoded character ,
* then the remaining bytes in the string are checked whether the whole
* encoded character can be there . If that is not the case , this byte is
* skipped .
* Finally we attempt to decode the encoded character , which does certain
* extra validations to see whether the correct number of bytes were used
* to encode the character . If that is not the case , the byte is probably
* invalid and it is skipped . We could emit a question mark , but then the
* logic below cannot just copy bytes , it would need to re - encode the
* decoded characters as the length in bytes may have changed .
*
* The goals here is to get as much valid Utf8 encoded characters from the
* source string to the destination string .
*
* Note : a multi - byte encoded termination ( ' \0 ' ) will trigger the encoded
* char length and the decoded length to differ , so it will be ignored as
* invalid character data . If it were to reach the termination , then we
* would also reach the " last " byte of the string and a normal ' \0 '
* termination will be placed after it .
*/
if ( len = = 0 | | str + len > last | | len ! = Utf8Decode ( & c , str ) ) {
/* Maybe the next byte is still a valid character? */
str + + ;
continue ;
}
if ( ( IsPrintable ( c ) & & ( c < SCC_SPRITE_START | | c > SCC_SPRITE_END ) ) | | ( ( settings & SVS_ALLOW_CONTROL_CODE ) ! = 0 & & c = = SCC_ENCODED ) ) {
/* Copy the character back. Even if dst is current the same as str
@ -225,6 +241,8 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida
if ( ( settings & SVS_REPLACE_WITH_QUESTION_MARK ) ! = 0 ) * dst + + = ' ? ' ;
}
}
/* String termination, if needed, is left to the caller of this function. */
}
/**