explanations for CP437 translations, some cleanups, bugfix for drawText

* explanation for CP437 glyph groups
* translation for MoonModules symbol
* "smiley" replacement for 4-bytes and overlong unicode codes
* always compile unicodetool.cpp (codepage translation still depends on WLED_ENABLE_UNICODE)
* bugfix: drawText now skips over glyphs that would be rejected by drawCharacter
* minor cleanup
This commit is contained in:
Frank
2025-11-21 13:48:38 +01:00
parent 527acefdea
commit 213cd185b5
4 changed files with 63 additions and 30 deletions

View File

@@ -897,19 +897,21 @@ void Segment::drawText(const unsigned char* text, size_t maxLen, int maxLetters,
if (utf16_index < WLED_MAX_SEGNAME_LEN) {
decoded_text[utf16_index] = unicodeToWchar16(now, maxLen); // UTF-8 decode into decoded_text
decoded_text[utf16_index] = wchar16ToCodepage437(decoded_text[utf16_index]); // decoded_text to CP437 (in-place conversion)
// toDo: ensure that decoded_text[i] is between console_font_YxZ_first and console_font_YxZ_last
// if (chr < 32 || chr > 126) --> clamp chr
// chr -= 32; // align with font table entries
utf16_index++;
if ((decoded_text[utf16_index] >= 1) && ((decoded_text[utf16_index] <= 254))) utf16_index++; // don't advance on NUL or codes not supported by drawCharacter
}
}
decoded_text[utf16_index] = 0; // NUL terminate string
size_t textLength = min(utf16_index, size_t(maxLetters));
#else
const unsigned char* decoded_text = text; // fallback
size_t textLength = min(strnlen((char*)text, maxLen), size_t(maxLetters));
#endif
// toDo: ensure that decoded_text[i] is between console_font_YxZ_first and console_font_YxZ_last
// if (chr < 32 || chr > 126) --> clamp chr
// chr -= 32; // align with font table entries
// pass characters to drawCharacter()
for (int i = 0; i < textLength; i++) {
SEGMENT.drawCharacter((unsigned char) decoded_text[i], x + w*i, y, w, h, color, col2, drawShadow);

View File

@@ -1,5 +1,12 @@
#if defined(WLED_ENABLE_FULL_FONTS)
/*
@title WLED(-MM) - unicode to CP437 conversion
@repo https://github.com/MoonModules/WLED-MM, https://github.com/wled/WLED
@Copyright © 2025 Github WLED and WLED-MM Commit Authors (see "git blame" for details)
@license Licensed under the EUPL-1.2 or later
*/
#include "codepages.h"
#include <string.h>
@@ -9,11 +16,18 @@ constexpr uint8_t CP437_UNKNOWN = 250; // small middle dot · // not sure if we
// based on a table from https://en.wikipedia.org/wiki/Code_page_437#Character_set
uint16_t wchar16ToCodepage437(uint16_t wideChar) {
// codes up to 126 are same as ASCII
if (wideChar < 127) return wideChar;
// unicode codes 0 up to 127 are same as ASCII -> pass through
if ((wideChar < 0x7F) && (wideChar != 0x08)) return wideChar; // excludes 127 = DEL and 8 = BS, so we can map them
switch (wideChar) {
// characters 1 - 31
// original IBM PC would interpret codes 0x07, 0x08, 0x0A, and 0x0D as BEL, BS, LF, and CR, respectively.
// we don't implement any special handling at the moment
case 0x0008: return 0x08; break; // Backspace: pass through (could be handled differently in future)
// unicode codes mapped to characters 1 - 31
// unicode 0 = C string terminator -> already passed through, never map it !!
// 1 to 31 (0x01 to 0x1F) are "assorted dingbats" (complementary and decorative characters).
// The isolated character 127 (7Fhex) also belongs to this group.
case 0x263A: return 0x01; break; // ☺︎
case 0x263B: return 0x02; break; // ☻
case 0x2665: return 0x03; break; // ♥︎
@@ -46,8 +60,13 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
case 0x25B2: return 0x1E; break; // ▲
case 0x25BC: return 0x1F; break; // ▼
// characters 127 - 254
case 0x2302: return 0x7F; break; // ⌂ (aka DEL)
// unicode codes 32 to 126 (0x20 to 0x7E) are the standard ASCII printable characters -> already passed through
// unicode codes mapped to characters 127 - 254
// code 127 DEL -> small arrow back. We don't implement legacy "rubout" or "backspace" for composing letters or for bold printing
case 0x007F: return 0x1B; break; // ←
case 0x2302: return 0x7F; break; // ⌂
// 128 to 175 (0x80 to 0xAF) are a selection of international text characters
case 0x00C7: return 0x80; break; // Ç
case 0x00FC: return 0x81; break; // ü
case 0x00E9: return 0x82; break; // é
@@ -96,6 +115,7 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
case 0x00A1: return 0xAD; break; // ¡
case 0x00AB: return 0xAE; break; // «
case 0x00BB: return 0xAF; break; // »
// 176 to 223 (0xB0 to 0xDF) are box drawing and block characters
case 0x2591: return 0xB0; break; // ░
case 0x2592: return 0xB1; break; // ▒
case 0x2593: return 0xB2; break; // ▓
@@ -144,6 +164,7 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
case 0x258C: return 0xDD; break; // ▌
case 0x2590: return 0xDE; break; // ▐
case 0x2580: return 0xDF; break; // ▀
// 224 to 235 (0xE0 to 0xEB) are math symbols part 1 - Greek letters commonly used in physics
case 0x03B1: return 0xE0; break; // α
case 0x00DF: return 0xE1; break; // ß
case 0x0393: return 0xE2; break; // Γ
@@ -156,6 +177,7 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
case 0x0398: return 0xE9; break; // Θ
case 0x03A9: return 0xEA; break; // Ω
case 0x03B4: return 0xEB; break; // δ
// 236 to 254 (0xEC to 0xFE) are other common physics and math symbols
case 0x221E: return 0xEC; break; // ∞
case 0x03C6: return 0xED; break; // φ
case 0x03B5: return 0xEE; break; // ε
@@ -174,14 +196,16 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
case 0x221A: return 0xFB; break; // √
case 0x207F: return 0xFC; break; // ⁿ
case 0x00B2: return 0xFD; break; // ²
case 0x25A0: return 0xFE; break; // ■
case 0x25A0: return 0xFE; break; // ■ geometric shapes
// 255 (0xFF) is "non breakable space" (NBSP)
case 0x00A0: return 32; break; // NBSP -> normal "space"
// special mappings for very similar characters
case 0x00A6: return 0x7C; break; // broken bar -> bar
case 0x266C: return 14; break; // musical notes
case 0x0394: return 127; break; // greek capital delta Δ
// special mappings for very similar unicode characters
case 0x00A6: return 0x7C; break; // ¦ broken bar -> | bar
case 0x266C: return 14; break; // musical note ♬ -> ♫
case 0x0394: return 127; break; // greek capital delta Δ -> ⌂
case 0x23AE: return 179; break; // integral extension ⎮
case 0x03B2: return 225; break; // greek small beta β => sz umlaut ß
case 0x03B2: return 225; break; // greek small beta β -> sz umlaut ß
case 0x03A0: return 227; break; // greek capital PI Π
case 0x220F: return 227; break; // math product ∏
case 0x2211: return 228; break; // math sum ∑
@@ -194,13 +218,13 @@ uint16_t wchar16ToCodepage437(uint16_t wideChar) {
case 0x2300: return 237; break; // diameter ⌀
case 0x00D8: return 237; break; // 0 strikethrough Ø
case 0x00F8: return 237; break; // 0 strikethrough small ø
case 0x02DA: return 0xF8; break; // small circle (up) ˚
case 0x2208: return 238; break; // element-of ∈
case 0x017F: return 244; break; // long S ſ
case 0x02DA: return 0xF8; break; // small circle (up) ˚
case 0x00A0: return 32; break; // NBSP => blank
case 0x20AC: return 238; break; // Euro €
case 0x2713: return 251; break; // check mark ✓
case 0x2208: return 238; break; // element-of ∈ -> ε
case 0x20AC: return 238; break; // Euro € -> ε
case 0x2713: return 251; break; // check mark ✓ -> √
case 0x263E: return 0x01; break; // ☾ last quarter moon (Moonmodules) -> ☺︎ face
// everything else: unknown
//default: return 32; // blank

View File

@@ -8,9 +8,11 @@
#undef WLED_ENABLE_FULL_FONTS
#endif
//constexpr uint16_t UNKNOWN_CODE = 0x2219; // ∙ multiplication dot
constexpr uint16_t UNKNOWN_CODE = 0x00B7; // · middle dot
constexpr uint16_t BAD_CODE = 0x2022; // • bigger dot
// visual replacements when decoding fails
//constexpr uint16_t UNKNOWN_CODE = 0x2219; // ∙ multiplication dot (try this if you don't like the middle dot)
constexpr uint16_t UNKNOWN_CODE = 0x00B7; // · middle dot = unknown code (generic error)
constexpr uint16_t BAD_CODE = 0x2022; // • bigger dot = cannot decode (unicode malformed)
constexpr uint16_t EXT_CODE = 0x263B; // ☻ smiling face = extended code (unicode not supported)
// UTF8 → reduced UTF16 decoding
// translates the next unicode UTF-8 item into a 2-byte "code point"

View File

@@ -1,4 +1,9 @@
#if defined(WLED_ENABLE_FULL_FONTS)
/*
@title WLED(-MM) - unicode helper functions
@repo https://github.com/MoonModules/WLED-MM, https://github.com/wled/WLED
@Copyright © 2025 Github WLED and WLED-MM Commit Authors (see "git blame" for details)
@license Licensed under the EUPL-1.2 or later
*/
#include "codepages.h"
#include <string.h>
@@ -35,13 +40,15 @@ uint16_t unicodeToWchar16(const unsigned char* utf8, size_t maxLen) {
if (length < 3 || !isValidContinuation(utf8[1]) || !isValidContinuation(utf8[2])) return BAD_CODE; // malformed
codepoint = ((ch0 & 0b00001111) << 12) | ((utf8[1] & 0b00111111) << 6) | (utf8[2] & 0b00111111);
if (codepoint < 0x800) return UNKNOWN_CODE; // Reject overlong encodings (must be >= 0x800)
if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return UNKNOWN_CODE; // Reject UTF-16 surrogate pairs (U+D800..U+DFFF)
if (codepoint >= 0x010000) codepoint = UNKNOWN_CODE; // result exceeds uint16_t => "unknown"
if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return EXT_CODE; // Reject UTF-16 surrogate pairs (U+D800..U+DFFF)
if (codepoint >= 0x010000) codepoint = EXT_CODE; // result exceeds uint16_t (should not happen with well-formed UTF-8)
return uint16_t(codepoint);
}
}
// 4-byte sequence or invalid lead byte - since we only support up to 0xFFFF, return error marker
return BAD_CODE; // unsupported/invalid
// since we only support up to 0xFFFF, return error marker
if ((ch0 & 0b11111000) == 0b11110000) return EXT_CODE; // unsupported 4-byte sequence
else return BAD_CODE; // other unsupported/invalid
}
// returns a pointer to the next unicode item - can be used to "advance" conversion after unicodeToWchar16()
@@ -99,5 +106,3 @@ size_t cutUnicodeAt(const unsigned char* utf8, size_t where) {
if (utf8[where] > 127) where = max(0, int(where)-1);
return where;
}
#endif