hot path optimization: make gamma correction inline

* make sure that gamma LUT is always initialized
* remove some unnecessary safety checks
* make gamma8() inline, for more speed
* use fast unGamma8 for preview
* add super-fast unGamma8 for HUB75 - old function lost 3-10 fps, this version does not reduce fps at all *grins*
2025-11-10 23:17:16 +01:00
parent 4a323ba057
commit 36648d1936
6 changed files with 50 additions and 17 deletions
--- a/wled00/bus_manager.cpp
+++ b/wled00/bus_manager.cpp
@@ -610,6 +610,20 @@ uint8_t BusHub75Matrix::activeType = 0;
 uint8_t BusHub75Matrix::instanceCount = 0;
 uint8_t BusHub75Matrix::last_bri = 0;

+#ifndef NO_CIE1931
+
+// WLEDMM speedup: create a version of "unGamma8" that can be inlined by the compiler
+extern uint8_t gammaTinv[256]; // inverse gamma lookup table, defined in colors.cpp
+static uint8_t const* myGammaTable = gammaTinv; // local alias for gammaTinv; NOTE(review): never reassigned in the visible code, so the pointer itself could be const - confirm
+
+static inline uint8_t unGamma8_bus(uint8_t value) {  // revert gamma correction for one 8-bit value: plain lookup in gammaTinv, no lazy table initialization
+  return myGammaTable[value];
+}
+static inline uint32_t unGamma24_bus(uint32_t c) {  // same lookup applied to R, G and B; the W channel is passed through unchanged
+  return RGBW32(myGammaTable[R(c)], myGammaTable[G(c)], myGammaTable[B(c)], W(c));
+}
+
+#endif

 // --------------------------
 // Bitdepth reduction based on panel size
@@ -1080,6 +1094,13 @@ BusHub75Matrix::BusHub75Matrix(BusConfig &bc) : Bus(bc.type, bc.start, bc.autoWh
    activeFourScanPanel = fourScanPanel;
    if (newDisplay) memcpy(&activeMXconfig, &mxconfig, sizeof(mxconfig));
  }
+
+#ifndef NO_CIE1931
+  // force initial calculation of gamma correction tables
+  if ((gammaCorrectVal < 0.999f) || (gammaCorrectVal > 3.0f)) calcGammaTable(1.0f);
+  else calcGammaTable(gammaCorrectVal);
+#endif
+
  instanceCount++;
  USER_PRINT(F("heap usage: ")); USER_PRINTLN(int(lastHeap - ESP.getFreeHeap()));
 }
@@ -1142,13 +1163,12 @@ void __attribute__((hot)) IRAM_ATTR BusHub75Matrix::show(void) {
    for (int y=0; y<height; y++) for (int x=0; x<width; x++) {
      if (getBitFromArray(ledsDirty, pix) == true) {        // only repaint the "dirty"  pixels
        #ifndef NO_CIE1931
-        uint32_t c = uint32_t(ledBuffer[pix]) & 0x00FFFFFF; // get RGB color, removing FastLED "alpha" component 
-        c = unGamma24(c); // to use the driver linear brightness feature, we first need to undo WLED gamma correction
-        uint8_t r = R(c);
-        uint8_t g = G(c);
-        uint8_t b = B(c);
+        const CRGB& c = ledBuffer[pix]; // c is an alias for ledBuffer[pix] - avoid creation of a temporary CRGB object instance
+        uint8_t r = unGamma8_bus(c.r);
+        uint8_t g = unGamma8_bus(c.g);
+        uint8_t b = unGamma8_bus(c.b);
        #else
-        const CRGB c = ledBuffer[pix];  // we stay on CRGB, instead of packing/unpacking the color value to uint32_t
+        const CRGB& c = ledBuffer[pix];  // we stay on CRGB, instead of packing/unpacking the color value to uint32_t
        uint8_t r = c.r;
        uint8_t g = c.g;
        uint8_t b = c.b;
--- a/wled00/colors.cpp
+++ b/wled00/colors.cpp
@@ -392,7 +392,7 @@ uint16_t approximateKelvinFromRGB(uint32_t rgb) {

 #if !defined(WLED_USE_CIE_BRIGHTNESS_TABLE)
 //gamma 2.8 lookup table used for color correction
-static byte gammaT[256] = {
+byte DRAM_ATTR_YN gammaT[256] = {  // WLEDMM: DRAM_ATTR to ensure that this table is in RAM (faster)
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,
@@ -415,7 +415,7 @@ static byte gammaT[256] = {
 // https://github.com/Aircoookie/WLED/issues/2767#issuecomment-1310961308
 // unfortunately NeoPixelBus has its own internal table, that kills low brightness values similar to the original WLED table.
 //   see https://github.com/Makuna/NeoPixelBus/blob/master/src/internal/NeoGamma.h
-static const byte gammaT[256] = {
+const DRAM_ATTR_YN byte gammaT[256] = {  // WLEDMM make sure this table is in RAM (faster)
  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 
  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4,
  4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 
@@ -436,7 +436,7 @@ static const byte gammaT[256] = {
 #endif

 // WLEDMM begin
-static uint8_t gammaTinv[256] = { 0 };
+uint8_t DRAM_ATTR_YN gammaTinv[256] = { 0 };
 static void calcInvGammaTable(float gamma)
 {
  float gammaInv = 1.0f / 2.4f;    // surprise surprise: WLED palettes use a fixed gamma of 2.4 !!!
@@ -448,9 +448,8 @@ static void calcInvGammaTable(float gamma)
  gammaTinv[255]=255;
 }
 IRAM_ATTR_YN uint8_t __attribute__((hot)) unGamma8(uint8_t value) {
-  //if (!gammaCorrectCol || (value == 0) || (value == 255)) return value;
-  if ((gammaCorrectVal < 0.999f) || (gammaCorrectVal > 3.0f)) return value;
  if (gammaTinv[255] == 0) calcInvGammaTable(gammaCorrectVal);
+  //if ((gammaCorrectVal < 0.999f) || (gammaCorrectVal > 3.0f)) return value; // WLEDMM range check intentionally disabled: always use the table (built above on first use)
  return gammaTinv[value];
 }

@@ -482,13 +481,13 @@ void calcGammaTable(float gamma)
 }

 // used for individual channel or brightness gamma correction
-IRAM_ATTR_YN __attribute__((hot)) uint8_t gamma8(uint8_t b)   // WLEDMM added IRAM_ATTR_YN
+IRAM_ATTR_YN __attribute__((hot)) uint8_t gamma8_slow(uint8_t b)   // WLEDMM added IRAM_ATTR_YN; out-of-line variant - gamma8() itself is now an inline function in fcn_declare.h
 {
  return gammaT[b];
 }

 // used for color gamma correction
-uint32_t __attribute__((hot)) gamma32(uint32_t color)
+IRAM_ATTR_YN uint32_t __attribute__((hot)) gamma32(uint32_t color)
 {
  if (!gammaCorrectCol) return color;
  uint8_t w = W(color);
--- a/wled00/const.h
+++ b/wled00/const.h
@@ -594,8 +594,10 @@
 //         error only in MM, not in upstream... tbd: find out why
 #ifdef ARDUINO_ARCH_ESP32
  #define IRAM_ATTR_YN IRAM_ATTR
+  #define DRAM_ATTR_YN DRAM_ATTR
 #else
  #define IRAM_ATTR_YN
+  #define DRAM_ATTR_YN 
 #endif

 #define WLED_O2_ATTR __attribute__((optimize("O2")))
--- a/wled00/fcn_declare.h
+++ b/wled00/fcn_declare.h
@@ -78,11 +78,17 @@ uint16_t __attribute__((const)) approximateKelvinFromRGB(uint32_t rgb);
 void setRandomColor(byte* rgb);
 uint8_t gamma8_cal(uint8_t b, float gamma);
 void calcGammaTable(float gamma);
-uint8_t __attribute__((pure)) gamma8(uint8_t b);                                              // WLEDMM: added attribute pure
+uint8_t __attribute__((pure)) gamma8_slow(uint8_t b);                                         // WLEDMM: added attribute pure
 uint32_t __attribute__((pure)) gamma32(uint32_t);                                             // WLEDMM: added attribute pure
 uint8_t unGamma8(uint8_t value);                                                              // WLEDMM revert gamma correction
 uint32_t unGamma24(uint32_t c);                                                               // WLEDMM for 24bit color (white left as-is)

+// WLEDMM: speedup - inline functions for gamma correction (direct table lookups, no checks)
+extern uint8_t gammaTinv[256]; // inverse gamma table, defined in colors.cpp
+extern uint8_t gammaT[256];    // gamma table, defined in colors.cpp; NOTE(review): the second gammaT definition there is declared const - confirm this non-const declaration still matches
+inline uint8_t gamma8(uint8_t value) { return gammaT[value];}           // WLEDMM inlined for speed
+inline uint8_t fast_unGamma8(uint8_t value) { return gammaTinv[value];} // WLEDMM no lazy init (unlike unGamma8) - gammaTinv must already be calculated
+
 //dmx_output.cpp
 void initDMXOutput();
 void handleDMXOutput();
--- a/wled00/wled.cpp
+++ b/wled00/wled.cpp
@@ -886,6 +886,11 @@ void WLED::setup()
 #endif

  USER_PRINT(F("Free heap ")); USER_PRINTLN(ESP.getFreeHeap());USER_PRINTLN();
+
+  // WLEDMM force initial calculation of gamma correction LUT
+  if ((gammaCorrectVal < 0.999f) || (gammaCorrectVal > 3.0f)) calcGammaTable(1.0f); // no gamma => create linear LUT
+  else calcGammaTable(gammaCorrectVal);
+
  USER_PRINTLN(F("WLED initialization done.\n"));
  delay(50);
  // repeat Ada prompt
--- a/wled00/ws.cpp
+++ b/wled00/ws.cpp
@@ -253,6 +253,7 @@ static bool sendLiveLedsWs(uint32_t wsClient)  // WLEDMM added "static"
    }
  #endif

+  (void) unGamma8(127); // WLEDMM dummy call, just to make sure that gammaTinv is initialized, so we can use fast_unGamma8
  uint8_t stripBrightness = strip.getBrightness();
  for (size_t i = 0; pos < bufSize -2; i += n)
  {
@@ -268,9 +269,9 @@ static bool sendLiveLedsWs(uint32_t wsClient)  // WLEDMM added "static"
    if (gammaCorrectPreview) {
      uint8_t w = W(c);  // not sure why, but it looks better if using "white" without corrections
      if (w>0) c = color_add(c, RGBW32(w, w, w, 0), false); // add white channel to RGB channels - color_add() will prevent over-saturation
-      buffer[pos++] = unGamma8(R(c)); //R
-      buffer[pos++] = unGamma8(G(c)); //G
-      buffer[pos++] = unGamma8(B(c)); //B
+      buffer[pos++] = fast_unGamma8(R(c)); //R
+      buffer[pos++] = fast_unGamma8(G(c)); //G
+      buffer[pos++] = fast_unGamma8(B(c)); //B
    } else {
    // WLEDMM end
      uint8_t w = W(c);  // WLEDMM small optimization