improvement for UTF-8 safe string truncation of segment name
did I mention already that this is complicated stuff? oh yes, it is.
This commit is contained in:
@@ -154,8 +154,12 @@ bool deserializeSegment(JsonObject elem, byte it, byte presetId)
|
||||
if (len > WLED_MAX_SEGNAME_LEN) {
|
||||
len = WLED_MAX_SEGNAME_LEN; // cut to max segment name length
|
||||
#if defined(WLED_ENABLE_FULL_FONTS)
|
||||
if (name[len] > 127) // UTF-8 => don't cut in the middle of a multi-byte char
|
||||
len = cutUnicodeAt((unsigned char*)name, len-1) +1; // +1 to convert between index and length
|
||||
// UTF-8: don't cut in the middle of a multi-byte char
|
||||
// the "or" condition is needed because we have to look at both:
|
||||
// * name[len-1] - the character that would be included (at the cut boundary)
|
||||
// * name[len] - the character that would be excluded (after the cut)
|
||||
if ((name[len] > 127) || (name[len-1] > 127))
|
||||
len = cutUnicodeAt((unsigned char*)name, len-1) +1; // find a safe cut // +1 to convert between index and length
|
||||
#endif
|
||||
USER_PRINTF("Segment name too long (%d chars), truncated to \"%.*s\"\n", strlen(name), (int)len, name);
|
||||
}
|
||||
|
||||
@@ -86,12 +86,17 @@ size_t strlenUC(const unsigned char* utf8) {
|
||||
// returns the next (lesser) string index that is safe for cutting an UTF-8 string
|
||||
// Important: calling code is responsible to provide a string with at least _where_ chars
|
||||
size_t cutUnicodeAt(const unsigned char* utf8, size_t where) {
|
||||
if (utf8[where] <= 127) return where; // ASCII
|
||||
size_t loopMin = max(0, int(where)-4); // max 4 characters backwards
|
||||
size_t whereStart = where;
|
||||
while ((isValidContinuation(utf8[where])) && (where > loopMin)) where--; // UTF-8: back until we find a non-continuation char
|
||||
if (where == 0 || utf8[where] <= 127) return where; // ASCII or start -> OK to cut off
|
||||
|
||||
if ((utf8[where] > 127) && isValidContinuation(utf8[whereStart])) where = max(0, int(where)-1); // most likely a UTF-8 lead byte -> go back one step
|
||||
size_t loopMin = max(0, int(where)-5); // max 5 characters backwards (UTF-8 max is 4 bytes)
|
||||
// Back up while we see continuation bytes (10xxxxxx)
|
||||
while ((isValidContinuation(utf8[where])) && (where > loopMin))
|
||||
where--;
|
||||
|
||||
// After the loop, utf8[where] is either ASCII or a UTF-8 lead byte
|
||||
// If it's a lead byte (> 127), we're at the start of a multi-byte sequence.
|
||||
// Go back one more position to exclude the entire sequence.
|
||||
if (utf8[where] > 127) where = max(0, int(where)-1);
|
||||
return where;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user