From 213cd185b51c97f9550436d11eabd5abe5e55881 Mon Sep 17 00:00:00 2001 From: Frank <91616163+softhack007@users.noreply.github.com> Date: Fri, 21 Nov 2025 13:48:38 +0100 Subject: [PATCH] explanations for CP437 translations, some cleanups, bugfix for drawString * explanation for CP437 glyph groups * translation for MoonModules symbol * "smiley" replacement for 4-bytes and overlong unicode codes * always compile unicodetool.cpp (codepage translation still depends on WLED_ENABLE_UNICODE) * bugfix: DrawString now skips over glyphs that would be rejected by DrawCharacter * minor cleanup --- wled00/FX_2Dfcn.cpp | 10 +++--- wled00/src/font/codepage437.cpp | 56 +++++++++++++++++++++++---------- wled00/src/font/codepages.h | 8 +++-- wled00/src/font/unicodetool.cpp | 19 ++++++----- 4 files changed, 63 insertions(+), 30 deletions(-) diff --git a/wled00/FX_2Dfcn.cpp b/wled00/FX_2Dfcn.cpp index 3bb6dd51..b1f48980 100644 --- a/wled00/FX_2Dfcn.cpp +++ b/wled00/FX_2Dfcn.cpp @@ -897,19 +897,21 @@ void Segment::drawText(const unsigned char* text, size_t maxLen, int maxLetters, if (utf16_index < WLED_MAX_SEGNAME_LEN) { decoded_text[utf16_index] = unicodeToWchar16(now, maxLen); // UTF-8 decode into decoded_text decoded_text[utf16_index] = wchar16ToCodepage437(decoded_text[utf16_index]); // decoded_text to CP437 (in-place conversion) - // toDo: ensure that decoded_text[i] is between console_font_YxZ_first and console_font_YxZ_last - // if (chr < 32 || chr > 126) --> clamp chr - // chr -= 32; // align with font table entries - utf16_index++; + if ((decoded_text[utf16_index] >= 1) && ((decoded_text[utf16_index] <= 254))) utf16_index++; // don't advance on NUL or codes not supported in DrawCharacter } } decoded_text[utf16_index] = 0; // NUL terminate string size_t textLength = min(utf16_index, size_t(maxLetters)); + #else const unsigned char* decoded_text = text; // fallback size_t textLength = min(strnlen((char*)text, maxLen), size_t(maxLetters)); #endif + // toDo: ensure that 
decoded_text[i] is between console_font_YxZ_first and console_font_YxZ_last + // if (chr < 32 || chr > 126) --> clamp chr + // chr -= 32; // align with font table entries + // pass characters to drawCharacter() for (int i = 0; i < textLength; i++) { SEGMENT.drawCharacter((unsigned char) decoded_text[i], x + w*i, y, w, h, color, col2, drawShadow); diff --git a/wled00/src/font/codepage437.cpp b/wled00/src/font/codepage437.cpp index 99ef6b0e..767c992a 100644 --- a/wled00/src/font/codepage437.cpp +++ b/wled00/src/font/codepage437.cpp @@ -1,5 +1,12 @@ #if defined(WLED_ENABLE_FULL_FONTS) +/* + @title WLED(-MM) - unicode to CP437 conversion + @repo https://github.com/MoonModules/WLED-MM, https://github.com/wled/WLED + @Copyright © 2025 Github WLED and WLED-MM Commit Authors (see "git blame" for details) + @license Licensed under the EUPL-1.2 or later +*/ + #include "codepages.h" #include @@ -9,11 +16,18 @@ constexpr uint8_t CP437_UNKNOWN = 250; // small middle dot · // not sure if we // based on a table from https://en.wikipedia.org/wiki/Code_page_437#Character_set uint16_t wchar16ToCodepage437(uint16_t wideChar) { - // codes up to 126 are same as ASCII - if (wideChar < 127) return wideChar; + // unicode codes 0 up to 127 are same as ASCII -> pass through + if ((wideChar < 0x7F) && (wideChar != 0x08)) return wideChar; // excludes 127 = DEL and 8 = BS, so we can map them switch (wideChar) { - // characters 1 - 31 + // original IBM PC would interpret codes 0x07, 0x08, 0x0A, and 0x0D as BEL, BS, LF, and CR, respectively. + // we don't implement any special handling at the moment + case 0x0008: return 0x08; break; // Backspace: pass through (could be handled differently in future) + + // unicode codes mapped to characters 1 - 31 + // unicode 0 = C string terminator -> already passed through, never map it !! + // 1 to 31 (0x01 to 0x1F) are "assorted dingbats" (complementary and decorative characters). + // The isolated character 127 (7Fhex) also belongs to this group. 
case 0x263A: return 0x01; break; // ☺︎ case 0x263B: return 0x02; break; // ☻ case 0x2665: return 0x03; break; // ♥︎ @@ -46,8 +60,13 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) { case 0x25B2: return 0x1E; break; // ▲ case 0x25BC: return 0x1F; break; // ▼ - // characters 127 - 254 - case 0x2302: return 0x7F; break; // ⌂ (aka DEL) + // unicode codes 32 to 126 (0x20 to 0x7E) are the standard ASCII printable characters -> already passed through + + // unicode codes mapped to characters 127 - 254 + // code 127 DEL -> small arrow back. We don't implement legacy "rubout" or "backspace" for composing letters or for bold printing + case 0x007F: return 0x1B; break; // ← + case 0x2302: return 0x7F; break; // ⌂ + // 128 to 175 (0x80 to 0xAF) are a selection of international text characters case 0x00C7: return 0x80; break; // Ç case 0x00FC: return 0x81; break; // ü case 0x00E9: return 0x82; break; // é @@ -96,6 +115,7 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) { case 0x00A1: return 0xAD; break; // ¡ case 0x00AB: return 0xAE; break; // « case 0x00BB: return 0xAF; break; // » + // 176 to 223 (0xB0 to 0xDF) are box drawing and block characters case 0x2591: return 0xB0; break; // ░ case 0x2592: return 0xB1; break; // ▒ case 0x2593: return 0xB2; break; // ▓ @@ -144,6 +164,7 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) { case 0x258C: return 0xDD; break; // ▌ case 0x2590: return 0xDE; break; // ▐ case 0x2580: return 0xDF; break; // ▀ + // 224 to 235 (0xE0 to 0xEB) are math symbols part 1 - Greek letters commonly used in physics case 0x03B1: return 0xE0; break; // α case 0x00DF: return 0xE1; break; // ß case 0x0393: return 0xE2; break; // Γ @@ -156,6 +177,7 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) { case 0x0398: return 0xE9; break; // Θ case 0x03A9: return 0xEA; break; // Ω case 0x03B4: return 0xEB; break; // δ + // 236 to 254 (0xEC to 0xFE) are other common physics and math symbols case 0x221E: return 0xEC; break; // ∞ case 0x03C6: return 0xED; break; 
// φ case 0x03B5: return 0xEE; break; // ε @@ -174,14 +196,16 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) { case 0x221A: return 0xFB; break; // √ case 0x207F: return 0xFC; break; // ⁿ case 0x00B2: return 0xFD; break; // ² - case 0x25A0: return 0xFE; break; // ■ + case 0x25A0: return 0xFE; break; // ■ geometric shapes + // 255 (0xFF) is "non breakable space" (NBSP) + case 0x00A0: return 32; break; // NBSP -> normal "space" - // special mappings for very similar characters - case 0x00A6: return 0x7C; break; // broken bar -> bar - case 0x266C: return 14; break; // musical notes - case 0x0394: return 127; break; // greek capital delta Δ + // special mappings for very similar unicode characters + case 0x00A6: return 0x7C; break; // ¦ broken bar -> | bar + case 0x266C: return 14; break; // musical note ♬ -> ♫ + case 0x0394: return 127; break; // greek capital delta Δ -> ⌂ case 0x23AE: return 179; break; // integral extension ⎮ - case 0x03B2: return 225; break; // greek small beta β => sz umlaut ß + case 0x03B2: return 225; break; // greek small beta β -> sz umlaut ß case 0x03A0: return 227; break; // greek capital PI Π case 0x220F: return 227; break; // math product ∏ case 0x2211: return 228; break; // math sum ∑ @@ -194,13 +218,13 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) { case 0x2300: return 237; break; // diameter ⌀ case 0x00D8: return 237; break; // 0 strikethrough Ø case 0x00F8: return 237; break; // 0 strikethrough small ø - case 0x02DA: return 0xF8; break; // small circle (up) ˚ - case 0x2208: return 238; break; // element-of ∈ case 0x017F: return 244; break; // long S ſ + case 0x02DA: return 0xF8; break; // small circle (up) ˚ - case 0x00A0: return 32; break; // NBSP => blank - case 0x20AC: return 238; break; // Euro € - case 0x2713: return 251; break; // check mark ✓ + case 0x2208: return 238; break; // element-of ∈ -> ε + case 0x20AC: return 238; break; // Euro € -> ε + case 0x2713: return 251; break; // check mark ✓ -> √ + case 0x263E: 
return 0x01; break; // ☾ last quarter moon (Moonmodules) -> ☺︎ face // everything else: unknown //default: return 32; // blank diff --git a/wled00/src/font/codepages.h b/wled00/src/font/codepages.h index bc0c7538..dbb9e4eb 100644 --- a/wled00/src/font/codepages.h +++ b/wled00/src/font/codepages.h @@ -8,9 +8,11 @@ #undef WLED_ENABLE_FULL_FONTS #endif -//constexpr uint16_t UNKNOWN_CODE = 0x2219; // ∙ multiplication dot -constexpr uint16_t UNKNOWN_CODE = 0x00B7; // · middle dot -constexpr uint16_t BAD_CODE = 0x2022; // • bigger dot +// visual replacements when decoding fails +//constexpr uint16_t UNKNOWN_CODE = 0x2219; // ∙ multiplication dot (try this if you don't like the middle dot) +constexpr uint16_t UNKNOWN_CODE = 0x00B7; // · middle dot = unknown code (generic error) +constexpr uint16_t BAD_CODE = 0x2022; // • bigger dot = cannot decode (unicode malformed) +constexpr uint16_t EXT_CODE = 0x263B; // ☻ smiling face = extended code (unicode not supported) // UTF‑8 → reduced UTF‑16 decoding // translates the next unicode UTF-8 item into a 2-byte "code point" diff --git a/wled00/src/font/unicodetool.cpp b/wled00/src/font/unicodetool.cpp index 80312bfa..ff96ab99 100644 --- a/wled00/src/font/unicodetool.cpp +++ b/wled00/src/font/unicodetool.cpp @@ -1,4 +1,9 @@ -#if defined(WLED_ENABLE_FULL_FONTS) +/* + @title WLED(-MM) - unicode helper functions + @repo https://github.com/MoonModules/WLED-MM, https://github.com/wled/WLED + @Copyright © 2025 Github WLED and WLED-MM Commit Authors (see "git blame" for details) + @license Licensed under the EUPL-1.2 or later +*/ #include "codepages.h" #include @@ -35,13 +40,15 @@ uint16_t unicodeToWchar16(const unsigned char* utf8, size_t maxLen) { if (length < 3 || !isValidContinuation(utf8[1]) || !isValidContinuation(utf8[2])) return BAD_CODE; // malformed codepoint = ((ch0 & 0b00001111) << 12) | ((utf8[1] & 0b00111111) << 6) | (utf8[2] & 0b00111111); if (codepoint < 0x800) return UNKNOWN_CODE; // Reject overlong encodings (must be >= 
0x800) - if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return UNKNOWN_CODE; // Reject UTF-16 surrogate pairs (U+D800..U+DFFF) - if (codepoint >= 0x010000) codepoint = UNKNOWN_CODE; // result exceeds uint16_t => "unknown" + if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return EXT_CODE; // Reject UTF-16 surrogate pairs (U+D800..U+DFFF) + if (codepoint >= 0x010000) codepoint = EXT_CODE; // result exceeds uint16_t (should not happen with well-formed UTF-8) return uint16_t(codepoint); } } - // 4-byte sequence or invalid lead byte - since we only support up to 0xFFFF, return error marker - return BAD_CODE; // unsupported/invalid + + // since we only support up to 0xFFFF, return error marker + if ((ch0 & 0b11111000) == 0b11110000) return EXT_CODE; // unsupported 4-byte sequence + else return BAD_CODE; // other unsupported/invalid } // returns a pointer to the next unicode item - can be used to "advance" conversion after unicodeToWchar16() @@ -99,5 +106,3 @@ size_t cutUnicodeAt(const unsigned char* utf8, size_t where) { if (utf8[where] > 127) where = max(0, int(where)-1); return where; } - -#endif