From 213cd185b51c97f9550436d11eabd5abe5e55881 Mon Sep 17 00:00:00 2001
From: Frank <91616163+softhack007@users.noreply.github.com>
Date: Fri, 21 Nov 2025 13:48:38 +0100
Subject: [PATCH] explanations for CP437 translations, some cleanups, bugfix
 for drawText

* explanation for CP437 glyph groups
* translation for MoonModules symbol
* "smiley" replacement for 4-bytes and overlong unicode codes
* always compile unicodetool.cpp (codepage translation still depends on WLED_ENABLE_UNICODE)
* bugfix: drawText now skips over glyphs that would be rejected by drawCharacter
* minor cleanup
---
 wled00/FX_2Dfcn.cpp             | 10 +++---
 wled00/src/font/codepage437.cpp | 56 +++++++++++++++++++++++----------
 wled00/src/font/codepages.h     |  8 +++--
 wled00/src/font/unicodetool.cpp | 19 ++++++-----
 4 files changed, 63 insertions(+), 30 deletions(-)

diff --git a/wled00/FX_2Dfcn.cpp b/wled00/FX_2Dfcn.cpp
index 3bb6dd51..b1f48980 100644
--- a/wled00/FX_2Dfcn.cpp
+++ b/wled00/FX_2Dfcn.cpp
@@ -897,19 +897,21 @@ void Segment::drawText(const unsigned char* text, size_t maxLen, int maxLetters,
     if (utf16_index < WLED_MAX_SEGNAME_LEN) {
       decoded_text[utf16_index] = unicodeToWchar16(now, maxLen);                   // UTF-8 decode into decoded_text
       decoded_text[utf16_index] = wchar16ToCodepage437(decoded_text[utf16_index]); // decoded_text to CP437 (in-place conversion)
-      // toDo: ensure that decoded_text[i] is between console_font_YxZ_first and console_font_YxZ_last
-        // if (chr < 32 || chr > 126) --> clamp chr
-        // chr -= 32; // align with font table entries
-      utf16_index++;
+      if ((decoded_text[utf16_index] >= 1) && ((decoded_text[utf16_index] <= 254))) utf16_index++; // don't advance on NUL or codes not supported in drawCharacter
     }
   }
   decoded_text[utf16_index] = 0; // NUL terminate string
   size_t textLength = min(utf16_index, size_t(maxLetters));
+
 #else
   const unsigned char* decoded_text = text;  // fallback
   size_t textLength = min(strnlen((char*)text, maxLen), size_t(maxLetters));  
 #endif
 
+  // toDo: ensure that decoded_text[i] is between console_font_YxZ_first and console_font_YxZ_last
+    // if (chr < 32 || chr > 126) --> clamp chr
+    // chr -= 32; // align with font table entries
+
   // pass characters to drawCharacter()
   for (int i = 0; i < textLength; i++) {
     SEGMENT.drawCharacter((unsigned char) decoded_text[i], x + w*i, y, w, h, color, col2, drawShadow);
diff --git a/wled00/src/font/codepage437.cpp b/wled00/src/font/codepage437.cpp
index 99ef6b0e..767c992a 100644
--- a/wled00/src/font/codepage437.cpp
+++ b/wled00/src/font/codepage437.cpp
@@ -1,5 +1,12 @@
 #if defined(WLED_ENABLE_FULL_FONTS)
 
+/* 
+   @title     WLED(-MM) - unicode to CP437 conversion
+   @repo      https://github.com/MoonModules/WLED-MM, https://github.com/wled/WLED
+   @Copyright © 2025 Github WLED and WLED-MM Commit Authors (see "git blame" for details)
+   @license   Licensed under the EUPL-1.2 or later
+*/
+
 #include "codepages.h"
 #include <string.h>
 
@@ -9,11 +16,18 @@ constexpr uint8_t CP437_UNKNOWN = 250; // small middle dot · // not sure if we
 // based on a table from https://en.wikipedia.org/wiki/Code_page_437#Character_set
 uint16_t wchar16ToCodepage437(uint16_t wideChar) {
 
-  // codes up to 126 are same as ASCII
-  if (wideChar < 127) return wideChar; 
+  // unicode codes 0 up to 126 (except 8 = BS) are same as ASCII -> pass through
+  if ((wideChar < 0x7F) && (wideChar != 0x08)) return wideChar; // excludes 127 = DEL and 8 = BS, so we can map them
 
   switch (wideChar) {
-    // characters 1 - 31
+    // original IBM PC would interpret codes 0x07, 0x08, 0x0A, and 0x0D as BEL, BS, LF, and CR, respectively. 
+    // we don't implement any special handling at the moment
+    case 0x0008: return 0x08; break; // Backspace: pass through (could be handled differently in future)
+
+    //  unicode codes mapped to characters 1 - 31
+    // unicode 0 = C string terminator -> already passed through, never map it !!
+    // 1 to 31 (0x01 to 0x1F) are "assorted dingbats" (complementary and decorative characters). 
+    // The isolated character 127 (7Fhex) also belongs to this group.
     case 0x263A: return 0x01; break; // ☺︎
     case 0x263B: return 0x02; break; // ☻
     case 0x2665: return 0x03; break; // ♥︎
@@ -46,8 +60,13 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
     case 0x25B2: return 0x1E; break; // ▲
     case 0x25BC: return 0x1F; break; // ▼
 
-    // characters 127 - 254
-    case 0x2302: return 0x7F; break; // ⌂ (aka DEL)
+    //  unicode codes 32 to 126 (0x20 to 0x7E) are the standard ASCII printable characters -> already passed through
+
+    //  unicode codes mapped to characters 127 - 254
+    // code 127 DEL -> small arrow back. We don't implement legacy "rubout" or "backspace" for composing letters or for bold printing
+    case 0x007F: return 0x1B; break; // ←
+    case 0x2302: return 0x7F; break; // ⌂
+    // 128 to 175 (0x80 to 0xAF) are a selection of international text characters
     case 0x00C7: return 0x80; break; // Ç
     case 0x00FC: return 0x81; break; // ü
     case 0x00E9: return 0x82; break; // é
@@ -96,6 +115,7 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
     case 0x00A1: return 0xAD; break; // ¡
     case 0x00AB: return 0xAE; break; // «
     case 0x00BB: return 0xAF; break; // »
+    // 176 to 223 (0xB0 to 0xDF) are box drawing and block characters
     case 0x2591: return 0xB0; break; // ░
     case 0x2592: return 0xB1; break; // ▒
     case 0x2593: return 0xB2; break; // ▓
@@ -144,6 +164,7 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
     case 0x258C: return 0xDD; break; // ▌
     case 0x2590: return 0xDE; break; // ▐
     case 0x2580: return 0xDF; break; // ▀
+    // 224 to 235 (0xE0 to 0xEB) are math symbols part 1 - Greek letters commonly used in physics
     case 0x03B1: return 0xE0; break; // α
     case 0x00DF: return 0xE1; break; // ß
     case 0x0393: return 0xE2; break; // Γ
@@ -156,6 +177,7 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
     case 0x0398: return 0xE9; break; // Θ
     case 0x03A9: return 0xEA; break; // Ω
     case 0x03B4: return 0xEB; break; // δ
+    // 236 to 254 (0xEC to 0xFE) are other common physics and math symbols
     case 0x221E: return 0xEC; break; // ∞
     case 0x03C6: return 0xED; break; // φ
     case 0x03B5: return 0xEE; break; // ε
@@ -174,14 +196,16 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
     case 0x221A: return 0xFB; break; // √
     case 0x207F: return 0xFC; break; // ⁿ
     case 0x00B2: return 0xFD; break; // ²
-    case 0x25A0: return 0xFE; break; // ■
+    case 0x25A0: return 0xFE; break; // ■ geometric shapes
+    // 255 (0xFF) is "non breakable space" (NBSP)
+    case 0x00A0: return 32;   break; // NBSP -> normal "space"
 
-    // special mappings for very similar characters
-    case 0x00A6: return 0x7C; break; // broken bar -> bar
-    case 0x266C: return 14;   break; // musical notes 
-    case 0x0394: return 127;  break; // greek capital delta Δ
+    // special mappings for very similar unicode characters
+    case 0x00A6: return 0x7C; break; // ¦ broken bar -> | bar
+    case 0x266C: return 14;   break; // musical note ♬ -> ♫
+    case 0x0394: return 127;  break; // greek capital delta Δ -> ⌂
     case 0x23AE: return 179;  break; // integral extension ⎮
-    case 0x03B2: return 225;  break; // greek small beta β => sz umlaut ß
+    case 0x03B2: return 225;  break; // greek small beta β -> sz umlaut ß
     case 0x03A0: return 227;  break; // greek capital PI Π
     case 0x220F: return 227;  break; // math product ∏
     case 0x2211: return 228;  break; // math sum ∑
@@ -194,13 +218,13 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
     case 0x2300: return 237;  break; // diameter ⌀
     case 0x00D8: return 237;  break; // 0 strikethrough Ø
     case 0x00F8: return 237;  break; // 0 strikethrough small ø
-    case 0x02DA: return 0xF8; break; // small circle (up) ˚
-    case 0x2208: return 238;  break; // element-of ∈
     case 0x017F: return 244;  break; // long S ſ
+    case 0x02DA: return 0xF8; break; // small circle (up) ˚
 
-    case 0x00A0: return 32;   break; // NBSP => blank
-    case 0x20AC: return 238;  break; // Euro €
-    case 0x2713: return 251;  break; // check mark ✓
+    case 0x2208: return 238;  break; // element-of ∈ -> ε
+    case 0x20AC: return 238;  break; // Euro € -> ε
+    case 0x2713: return 251;  break; // check mark ✓ -> √
+    case 0x263E: return 0x01; break; // ☾ last quarter moon (Moonmodules) -> ☺︎ face
 
     // everything else: unknown
     //default: return 32; // blank
diff --git a/wled00/src/font/codepages.h b/wled00/src/font/codepages.h
index bc0c7538..dbb9e4eb 100644
--- a/wled00/src/font/codepages.h
+++ b/wled00/src/font/codepages.h
@@ -8,9 +8,11 @@
 #undef WLED_ENABLE_FULL_FONTS
 #endif
 
-//constexpr uint16_t UNKNOWN_CODE = 0x2219;  // ∙ multiplication dot
-constexpr uint16_t UNKNOWN_CODE = 0x00B7;   //  · middle dot
-constexpr uint16_t BAD_CODE     = 0x2022;   //  • bigger dot
+// visual replacements when decoding fails
+//constexpr uint16_t UNKNOWN_CODE = 0x2219;  //  ∙ multiplication dot (try this if you don't like the middle dot)
+constexpr uint16_t UNKNOWN_CODE = 0x00B7;    //  · middle dot = unknown code    (generic error)
+constexpr uint16_t BAD_CODE     = 0x2022;    //  • bigger dot = cannot decode   (unicode malformed)
+constexpr uint16_t EXT_CODE     = 0x263B;    //  ☻ smiling face = extended code (unicode not supported)
 
 // UTF‑8 → reduced UTF‑16 decoding
 // translates the next unicode UTF-8 item into a 2-byte "code point"
diff --git a/wled00/src/font/unicodetool.cpp b/wled00/src/font/unicodetool.cpp
index 80312bfa..ff96ab99 100644
--- a/wled00/src/font/unicodetool.cpp
+++ b/wled00/src/font/unicodetool.cpp
@@ -1,4 +1,9 @@
-#if defined(WLED_ENABLE_FULL_FONTS)
+/* 
+   @title     WLED(-MM) - unicode helper functions
+   @repo      https://github.com/MoonModules/WLED-MM, https://github.com/wled/WLED
+   @Copyright © 2025 Github WLED and WLED-MM Commit Authors (see "git blame" for details)
+   @license   Licensed under the EUPL-1.2 or later
+*/
 
 #include "codepages.h"
 #include <string.h>
@@ -35,13 +40,15 @@ uint16_t unicodeToWchar16(const unsigned char* utf8, size_t maxLen) {
       if (length < 3 || !isValidContinuation(utf8[1]) || !isValidContinuation(utf8[2])) return BAD_CODE; // malformed
       codepoint = ((ch0 & 0b00001111) << 12) | ((utf8[1] & 0b00111111) << 6) | (utf8[2] & 0b00111111);
       if (codepoint < 0x800) return UNKNOWN_CODE;                          // Reject overlong encodings (must be >= 0x800)
-      if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return UNKNOWN_CODE; // Reject UTF-16 surrogate pairs (U+D800..U+DFFF)
-      if (codepoint >= 0x010000) codepoint = UNKNOWN_CODE;                 // result exceeds uint16_t => "unknown"
+      if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return  EXT_CODE;    // Reject UTF-16 surrogate pairs (U+D800..U+DFFF)
+      if (codepoint >= 0x010000) codepoint =  EXT_CODE;                    // result exceeds uint16_t (should not happen with well-formed UTF-8)
       return uint16_t(codepoint);
     }
   }
-  // 4-byte sequence or invalid lead byte - since we only support up to 0xFFFF, return error marker
-  return BAD_CODE; // unsupported/invalid
+
+  // since we only support up to 0xFFFF, return error marker
+  if ((ch0 & 0b11111000) == 0b11110000) return EXT_CODE; // unsupported 4-byte sequence
+  else return BAD_CODE;  // other unsupported/invalid
 }
 
 // returns a pointer to the next unicode item - can be used to "advance" conversion after unicodeToWchar16()
@@ -99,5 +106,3 @@ size_t cutUnicodeAt(const unsigned char* utf8, size_t where) {
   if (utf8[where] > 127) where = max(0, int(where)-1);
   return where;
 }
-
-#endif