hot path optimization: make gamma correction inline

* make sure that gamma LUT is always initialized
* remove some unnecessary safety checks
* make gamma8() inline, for more speed
* use fast unGamma8 for preview
* add super-fast unGamma8 for HUB75 - old function lost 3-10 fps, this version does not reduce fps at all *grins*
This commit is contained in:
Frank
2025-11-10 23:17:16 +01:00
parent 4a323ba057
commit 36648d1936
6 changed files with 50 additions and 17 deletions

View File

@@ -610,6 +610,20 @@ uint8_t BusHub75Matrix::activeType = 0;
uint8_t BusHub75Matrix::instanceCount = 0;
uint8_t BusHub75Matrix::last_bri = 0;
#ifndef NO_CIE1931
// WLEDMM speedup: file-local variants of "unGamma8" / "unGamma24" that the compiler can inline.
// Both helpers are plain lookups into gammaTinv - no initialization and no range checks -
// so the table has to be filled before the first call.
extern uint8_t gammaTinv[256];                         // defined in colors.cpp
static const uint8_t* const myGammaTable = gammaTinv;  // local read-only alias for gammaTinv; the pointer itself is const so it can never be re-seated

// Undo gamma correction for a single 8-bit channel value.
static inline uint8_t unGamma8_bus(uint8_t value) {
  return myGammaTable[value];
}

// Undo gamma correction for the R, G and B channels of a packed color; the W channel is passed through as-is.
static inline uint32_t unGamma24_bus(uint32_t c) {
  return RGBW32(myGammaTable[R(c)], myGammaTable[G(c)], myGammaTable[B(c)], W(c));
}
#endif
// --------------------------
// Bitdepth reduction based on panel size
@@ -1080,6 +1094,13 @@ BusHub75Matrix::BusHub75Matrix(BusConfig &bc) : Bus(bc.type, bc.start, bc.autoWh
activeFourScanPanel = fourScanPanel;
if (newDisplay) memcpy(&activeMXconfig, &mxconfig, sizeof(mxconfig));
}
#ifndef NO_CIE1931
// force initial calculation of gamma correction tables
if ((gammaCorrectVal < 0.999f) || (gammaCorrectVal > 3.0f)) calcGammaTable(1.0f);
else calcGammaTable(gammaCorrectVal);
#endif
instanceCount++;
USER_PRINT(F("heap usage: ")); USER_PRINTLN(int(lastHeap - ESP.getFreeHeap()));
}
@@ -1142,13 +1163,12 @@ void __attribute__((hot)) IRAM_ATTR BusHub75Matrix::show(void) {
for (int y=0; y<height; y++) for (int x=0; x<width; x++) {
if (getBitFromArray(ledsDirty, pix) == true) { // only repaint the "dirty" pixels
#ifndef NO_CIE1931
uint32_t c = uint32_t(ledBuffer[pix]) & 0x00FFFFFF; // get RGB color, removing FastLED "alpha" component
c = unGamma24(c); // to use the driver linear brightness feature, we first need to undo WLED gamma correction
uint8_t r = R(c);
uint8_t g = G(c);
uint8_t b = B(c);
const CRGB& c = ledBuffer[pix]; // c is an alias for ledBuffer[pix] - avoid creation of a temporary CRGB object instance
uint8_t r = unGamma8_bus(c.r);
uint8_t g = unGamma8_bus(c.g);
uint8_t b = unGamma8_bus(c.b);
#else
const CRGB c = ledBuffer[pix]; // we stay on CRGB, instead of packing/unpacking the color value to uint32_t
const CRGB& c = ledBuffer[pix]; // we stay on CRGB, instead of packing/unpacking the color value to uint32_t
uint8_t r = c.r;
uint8_t g = c.g;
uint8_t b = c.b;

View File

@@ -392,7 +392,7 @@ uint16_t approximateKelvinFromRGB(uint32_t rgb) {
#if !defined(WLED_USE_CIE_BRIGHTNESS_TABLE)
//gamma 2.8 lookup table used for color correction
static byte gammaT[256] = {
byte DRAM_ATTR_YN gammaT[256] = { // WLEDMM: DRAM_ATTR to ensure that this table is in RAM (faster)
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
@@ -415,7 +415,7 @@ static byte gammaT[256] = {
// https://github.com/Aircoookie/WLED/issues/2767#issuecomment-1310961308
// unfortunately NeoPixelBus has its own internal table, that kills low brightness values similar to the original WLED table.
// see https://github.com/Makuna/NeoPixelBus/blob/master/src/internal/NeoGamma.h
static const byte gammaT[256] = {
const DRAM_ATTR_YN byte gammaT[256] = { // WLEDMM make sure this table is in RAM (faster)
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4,
4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6,
@@ -436,7 +436,7 @@ static const byte gammaT[256] = {
#endif
// WLEDMM begin
static uint8_t gammaTinv[256] = { 0 };
uint8_t DRAM_ATTR_YN gammaTinv[256] = { 0 };
static void calcInvGammaTable(float gamma)
{
float gammaInv = 1.0f / 2.4f; // surprise surprise: WLED palettes use a fixed gamma of 2.4 !!!
@@ -448,9 +448,8 @@ static void calcInvGammaTable(float gamma)
gammaTinv[255]=255;
}
IRAM_ATTR_YN uint8_t __attribute__((hot)) unGamma8(uint8_t value) {
//if (!gammaCorrectCol || (value == 0) || (value == 255)) return value;
if ((gammaCorrectVal < 0.999f) || (gammaCorrectVal > 3.0f)) return value;
if (gammaTinv[255] == 0) calcInvGammaTable(gammaCorrectVal);
//if ((gammaCorrectVal < 0.999f) || (gammaCorrectVal > 3.0f)) return value; // WLEDMM yes, looks stupid
return gammaTinv[value];
}
@@ -482,13 +481,13 @@ void calcGammaTable(float gamma)
}
// used for individual channel or brightness gamma correction
IRAM_ATTR_YN __attribute__((hot)) uint8_t gamma8(uint8_t b) // WLEDMM added IRAM_ATTR_YN
IRAM_ATTR_YN __attribute__((hot)) uint8_t gamma8_slow(uint8_t b) // WLEDMM added IRAM_ATTR_YN
{
return gammaT[b];
}
// used for color gamma correction
uint32_t __attribute__((hot)) gamma32(uint32_t color)
IRAM_ATTR_YN uint32_t __attribute__((hot)) gamma32(uint32_t color)
{
if (!gammaCorrectCol) return color;
uint8_t w = W(color);

View File

@@ -594,8 +594,10 @@
// error only in MM, not in upstream... tbd: find out why
// Optional placement attributes: on ESP32 builds they forward to IRAM_ATTR / DRAM_ATTR,
// on every other target they expand to nothing so the same source still compiles.
#if !defined(ARDUINO_ARCH_ESP32)
  // non-ESP32 target: no placement attributes available
  #define DRAM_ATTR_YN
  #define IRAM_ATTR_YN
#else
  // ESP32: forward to the platform attributes
  #define DRAM_ATTR_YN DRAM_ATTR
  #define IRAM_ATTR_YN IRAM_ATTR
#endif
#define WLED_O2_ATTR __attribute__((optimize("O2")))

View File

@@ -78,11 +78,17 @@ uint16_t __attribute__((const)) approximateKelvinFromRGB(uint32_t rgb);
void setRandomColor(byte* rgb);
uint8_t gamma8_cal(uint8_t b, float gamma);
void calcGammaTable(float gamma);
uint8_t __attribute__((pure)) gamma8(uint8_t b); // WLEDMM: added attribute pure
uint8_t __attribute__((pure)) gamma8_slow(uint8_t b); // WLEDMM: added attribute pure
uint32_t __attribute__((pure)) gamma32(uint32_t); // WLEDMM: added attribute pure
uint8_t unGamma8(uint8_t value); // WLEDMM revert gamma correction
uint32_t unGamma24(uint32_t c); // WLEDMM for 24bit color (white left as-is)
// WLEDMM: speedup - gamma lookups defined in the header so the compiler can inline them.
// Both tables live in colors.cpp. The helpers are bare table reads (no initialization,
// no range checks), so the tables must have been calculated before the first call.
extern uint8_t gammaTinv[256]; // colors.cpp - inverse gamma table, read by fast_unGamma8()
#if !defined(WLED_USE_CIE_BRIGHTNESS_TABLE)
extern uint8_t gammaT[256]; // colors.cpp - gamma table
#else
// NOTE(review): colors.cpp defines the CIE variant of gammaT as const; the declaration has to
// carry the same qualifier, otherwise the two conflict. Confirm against the #else branch in colors.cpp.
extern const uint8_t gammaT[256]; // colors.cpp - fixed CIE brightness table
#endif

// Apply gamma correction to one channel or brightness value. WLEDMM inlined for speed.
inline uint8_t gamma8(uint8_t value) { return gammaT[value]; }

// Revert gamma correction for one channel value, without any of the checks done in unGamma8().
inline uint8_t fast_unGamma8(uint8_t value) { return gammaTinv[value]; }
//dmx_output.cpp
void initDMXOutput();
void handleDMXOutput();

View File

@@ -886,6 +886,11 @@ void WLED::setup()
#endif
USER_PRINT(F("Free heap ")); USER_PRINTLN(ESP.getFreeHeap());USER_PRINTLN();
// WLEDMM force initial calculation of gamma correction LUT
if ((gammaCorrectVal < 0.999f) || (gammaCorrectVal > 3.0f)) calcGammaTable(1.0f); // no gamma => create linear LUT
else calcGammaTable(gammaCorrectVal);
USER_PRINTLN(F("WLED initialization done.\n"));
delay(50);
// repeat Ada prompt

View File

@@ -253,6 +253,7 @@ static bool sendLiveLedsWs(uint32_t wsClient) // WLEDMM added "static"
}
#endif
(void) unGamma8(127); // WLEDMM dummy call, just to make sure that gammaTinv is initialized, so we can use fast_unGamma8
uint8_t stripBrightness = strip.getBrightness();
for (size_t i = 0; pos < bufSize -2; i += n)
{
@@ -268,9 +269,9 @@ static bool sendLiveLedsWs(uint32_t wsClient) // WLEDMM added "static"
if (gammaCorrectPreview) {
uint8_t w = W(c); // not sure why, but it looks better if using "white" without corrections
if (w>0) c = color_add(c, RGBW32(w, w, w, 0), false); // add white channel to RGB channels - color_add() will prevent over-saturation
buffer[pos++] = unGamma8(R(c)); //R
buffer[pos++] = unGamma8(G(c)); //G
buffer[pos++] = unGamma8(B(c)); //B
buffer[pos++] = fast_unGamma8(R(c)); //R
buffer[pos++] = fast_unGamma8(G(c)); //G
buffer[pos++] = fast_unGamma8(B(c)); //B
} else {
// WLEDMM end
uint8_t w = W(c); // WLEDMM small optimization