effect math speedup - up to 3x faster

-> distortion waves 3x speedup -> hiphotic 2x speedup -> waving cell 1.5x speedup * replace sin8_t by lookup-table with pre-computed values * moved integer sin and cos to fcn_declare.h (inlined by the compiler) * moved gamma32 to fcn_declare.h (inlined by the compiler) * a few other small tweaks
2025-12-17 21:59:22 +01:00
parent 49e6de33c7
commit bc7cf062e8
7 changed files with 107 additions and 25 deletions
--- a/wled00/FX.cpp
+++ b/wled00/FX.cpp
@@ -5883,8 +5883,8 @@ static const char _data_FX_MODE_2DSNOWFALL[] PROGMEM = "Snow Fall ☾@!,Spawn Ra
 uint16_t mode_2DHiphotic() {                        //  By: ldirko  https://editor.soulmatelights.com/gallery/810 , Modified by: Andrew Tuline
  if (!strip.isMatrix) return mode_oops(); // not a 2D set-up
-  const uint16_t cols = SEGMENT.virtualWidth();
+  const uint_fast16_t cols = SEGMENT.virtualWidth();
-  const uint16_t rows = SEGMENT.virtualHeight();
+  const uint_fast16_t rows = SEGMENT.virtualHeight();
  const uint32_t a = strip.now / ((SEGMENT.custom3>>1)+1);
  for (int x = 0; x < cols; x++) {
@@ -8654,10 +8654,10 @@ static const char _data_FX_MODE_2DAKEMI[] PROGMEM = "Akemi@Color speed,Dance;Hea
 uint16_t mode_2Ddistortionwaves() {
  if (!strip.isMatrix) return mode_oops(); // not a 2D set-up
-  const uint16_t cols = SEGMENT.virtualWidth();
+  const uint_fast16_t cols = SEGMENT.virtualWidth();
-  const uint16_t rows = SEGMENT.virtualHeight();
+  const uint_fast16_t rows = SEGMENT.virtualHeight();
  if (SEGENV.call == 0) {
-    SEGMENT.setUpLeds();
+    //SEGMENT.setUpLeds();
    SEGMENT.fill(BLACK);
  }
--- a/wled00/colorTools.hpp
+++ b/wled00/colorTools.hpp
@@ -19,6 +19,7 @@
 #if !defined(FASTLED_VERSION) // pull in FastLED if we don't have it yet (we need the CRGB type)
  #define FASTLED_INTERNAL
  #define USE_GET_MILLISECOND_TIMER
  #include <FastLED.h>
 #endif
@@ -162,7 +163,8 @@ inline __attribute__((hot)) CRGB ColorFromPaletteWLED(const CRGBPalette16& pal,
    green1 = (green1 * scale) >> 8;
    blue1  = (blue1 * scale) >> 8;
  }
-  return RGBW32(red1,green1,blue1,0);
+  //return RGBW32(red1,green1,blue1,0);
  return CRGB(red1,green1,blue1);
 }
 #define ColorFromPalette ColorFromPaletteWLED // override fastled function
--- a/wled00/colors.cpp
+++ b/wled00/colors.cpp
@@ -486,6 +486,9 @@ IRAM_ATTR_YN __attribute__((hot)) uint8_t gamma8_slow(uint8_t b)   // WLEDMM add
  return gammaT[b];
 }
 #if defined(ARDUINO_ARCH_ESP32)
 // WLEDMM: gamma32() moved to fcn_declare.h (inlining for speed)
 #else
 // used for color gamma correction
 IRAM_ATTR_YN uint32_t __attribute__((hot)) gamma32(uint32_t color)
 {
@@ -500,3 +503,4 @@ IRAM_ATTR_YN uint32_t __attribute__((hot)) gamma32(uint32_t color)
  b = gammaT[b];
  return RGBW32(r, g, b, w);
 }
 #endif
--- a/wled00/fcn_declare.h
+++ b/wled00/fcn_declare.h
@@ -79,7 +79,6 @@ void setRandomColor(byte* rgb);
 uint8_t gamma8_cal(uint8_t b, float gamma);
 void calcGammaTable(float gamma);
 uint8_t __attribute__((pure)) gamma8_slow(uint8_t b);                                         // WLEDMM: added attribute pure
 uint32_t __attribute__((pure)) gamma32(uint32_t);                                             // WLEDMM: added attribute pure
 uint8_t unGamma8(uint8_t value);                                                              // WLEDMM revert gamma correction
 uint32_t unGamma24(uint32_t c);                                                               // WLEDMM for 24bit color (white left as-is)
@@ -89,6 +88,34 @@ extern uint8_t gammaT[256];    // colors.cpp
 inline uint8_t gamma8(uint8_t value) { return gammaT[value];}           // WLEDMM inlined for speed
 inline uint8_t fast_unGamma8(uint8_t value) { return gammaTinv[value];}
 #if defined(ARDUINO_ARCH_ESP32)
 #if !defined(RGBW32)           // WLEDMM define color macros in case they are missing
 // Pack four 8-bit channels into one 32-bit value: W in bits 31..24, R in 23..16, G in 15..8, B in 7..0.
 #define RGBW32(r,g,b,w) (uint32_t((byte(w) << 24) | (byte(r) << 16) | (byte(g) << 8) | (byte(b))))
 #endif
 #if !defined(W) && !defined(R) // WLEDMM define color macros in case they are missing
 // Extract one 8-bit channel from a packed color; the shifts mirror the layout used by RGBW32.
 #define R(c) (byte((c) >> 16))
 #define G(c) (byte((c) >> 8))
 #define B(c) (byte(c))
 #define W(c) (byte((c) >> 24))
 #endif
 extern bool gammaCorrectCol;   // wled.h
 inline uint32_t __attribute__((hot)) gamma32(uint32_t color) {          // WLEDMM: moved here for inlining
  // Runs every 8-bit channel (W, R, G, B) of a packed color through the gammaT
  // lookup table and re-packs the result. When gammaCorrectCol is false the
  // color is returned unchanged.
  if (gammaCorrectCol) {
    const uint8_t white = gammaT[W(color)];
    const uint8_t red   = gammaT[R(color)];
    const uint8_t green = gammaT[G(color)];
    const uint8_t blue  = gammaT[B(color)];
    return RGBW32(red, green, blue, white);
  }
  return color;
 }
 #else
 uint32_t __attribute__((pure)) gamma32(uint32_t);
 #endif
 #define gamma32inv(c) unGamma24(c)      // WLEDMM alias for upstream compatibility
 #define gamma8inv(c)  fast_unGamma8(c)  // WLEDMM alias for upstream compatibility
@@ -489,13 +516,48 @@ void clearEEPROM();
 #endif
 //wled_math.cpp
 void init_math();
 // WLEDMM: math functions inlined for speed
 // 16-bit, integer based Bhaskara I's sine approximation: 16*x*(pi - x) / (5*pi^2 - 4*x*(pi - x))
 // input is 16bit unsigned (0-65535), output is 16bit signed (-32767 to +32767)
 // optimized integer implementation by @dedehai
 inline int16_t sin16_t(uint16_t theta) {
  // The approximation is only evaluated on the first half period; angles in
  // the second half are mirrored onto it and the result is negated.
  const int sign = (theta > 0x7FFF) ? -1 : 1;
  if (sign < 0) {
    theta = 0xFFFF - theta;
  }
  // x * (pi - x), with pi represented as 0x7FFF
  const uint32_t product = theta * (0x7FFF - theta);
  // the scaled numerator does not fit in 32 bits, so widen before multiplying
  const uint64_t scaled = (uint64_t)product * (4 * 0x7FFF);
  const int32_t divisor = 1342095361 - product; // 1342095361 is 5 * 0x7FFF^2 / 4
  const int16_t magnitude = scaled / divisor;
  return magnitude * sign;
 }
 inline int16_t cos16_t(uint16_t theta) {
  // cos(x) = sin(x + pi/2); a quarter turn is 0x4000 and the sum wraps in 16 bits
  const uint16_t quarter_ahead = theta + 0x4000;
  return sin16_t(quarter_ahead);
 }
 #if defined(ARDUINO_ARCH_ESP32)
 // WLEDMM: use pre-calculated lookup-table for sin8_t
 extern uint8_t sinT[256];    // wled_math.cpp
 inline uint8_t sin8_t(uint8_t theta) { return sinT[theta];}           
 #else
 // no LUT on 8266, to save 256 bytes of RAM
 inline uint8_t sin8_t(uint8_t theta) {
  int32_t sin16 = sin16_t((uint16_t)theta * 257); // 255 * 257 = 0xFFFF
  sin16 += 0x7FFF + 128; //shift result to range 0-0xFFFF, +128 for rounding
  return min(sin16, int32_t(0xFFFF)) >> 8; // min performs saturation, and prevents overflow
 }
 #endif
 inline uint8_t cos8_t(uint8_t theta) {
  // cos(x) = sin(x + pi/2); a quarter turn is 64 and the sum wraps in 8 bits
  const uint8_t quarter_ahead = theta + 64;
  return sin8_t(quarter_ahead);
 }
 //float cos_t(float phi); // use float math
 //float sin_t(float phi);
 //float tan_t(float x);
 int16_t sin16_t(uint16_t theta);
 int16_t cos16_t(uint16_t theta);
 uint8_t sin8_t(uint8_t theta);
 uint8_t cos8_t(uint8_t theta);
 float sin_approx(float theta); // uses integer math (converted to float), accuracy +/-0.0015 (compared to sinf())
 float cos_approx(float theta);
--- a/wled00/util.cpp
+++ b/wled00/util.cpp
@@ -843,7 +843,7 @@ static inline int32_t lerpPerlin(int32_t a, int32_t b, int32_t t) {
 }
 // 1D Perlin noise function that returns a value in range of -24691 to 24689
-int32_t perlin1D_raw(uint32_t x, bool is16bit) {
+int32_t IRAM_ATTR_YN perlin1D_raw(uint32_t x, bool is16bit) {
  // integer and fractional part coordinates
  int32_t x0 = x >> 16;
  int32_t x1 = x0 + 1;
@@ -861,7 +861,7 @@ int32_t perlin1D_raw(uint32_t x, bool is16bit) {
 }
 // 2D Perlin noise function that returns a value in range of -20633 to 20629
-int32_t perlin2D_raw(uint32_t x, uint32_t y, bool is16bit) {
+int32_t IRAM_ATTR_YN perlin2D_raw(uint32_t x, uint32_t y, bool is16bit) {
  int32_t x0 = x >> 16;
  int32_t y0 = y >> 16;
  int32_t x1 = x0 + 1;
@@ -893,7 +893,7 @@ int32_t perlin2D_raw(uint32_t x, uint32_t y, bool is16bit) {
 }
 // 3D Perlin noise function that returns a value in range of -16788 to 16381
-int32_t perlin3D_raw(uint32_t x, uint32_t y, uint32_t z, bool is16bit) {
+int32_t IRAM_ATTR_YN perlin3D_raw(uint32_t x, uint32_t y, uint32_t z, bool is16bit) {
  int32_t x0 = x >> 16;
  int32_t y0 = y >> 16;
  int32_t z0 = z >> 16;
--- a/wled00/wled.cpp
+++ b/wled00/wled.cpp
@@ -474,6 +474,8 @@ void WLED::setup()
  if (!Serial) delay(300);  // just a tiny wait to avoid problems later when acessing serial
 #endif
  init_math();  // WLEDMM: pre-calculate some lookup tables
  #ifdef ARDUINO_ARCH_ESP32
  #if defined(WLED_DEBUG) && (defined(CONFIG_IDF_TARGET_ESP32S2) || defined(CONFIG_IDF_TARGET_ESP32C3) || ARDUINO_USB_CDC_ON_BOOT)
  if (!Serial) delay(2500);  // WLEDMM allow CDC USB serial to initialise (WLED_DEBUG only)
--- a/wled00/wled_math.cpp
+++ b/wled00/wled_math.cpp
@@ -59,10 +59,18 @@ float tan_t(float x) {
 }
 */
 // WLEDMM: sin16_t() moved to fcn_declare.h (inlining for speed)
 // WLEDMM: cos16_t() moved to fcn_declare.h (inlining for speed)
 // WLEDMM: sin8_t() moved to fcn_declare.h (inlining for speed)
 // WLEDMM: cos8_t() moved to fcn_declare.h (inlining for speed)
 // 16-bit, integer based Bhaskara I's sine approximation: 16*x*(pi - x) / (5*pi^2 - 4*x*(pi - x))
 // input is 16bit unsigned (0-65535), output is 16bit signed (-32767 to +32767)
 // optimized integer implementation by @dedehai
-int16_t sin16_t(uint16_t theta) {
+static int16_t sin16_calc(uint16_t theta) {
  int scale = 1;
  if (theta > 0x7FFF) {
    theta = 0xFFFF - theta;
@@ -75,30 +83,34 @@ int16_t sin16_t(uint16_t theta) {
  return result * scale;
 }
-int16_t cos16_t(uint16_t theta) {
+#if defined(ARDUINO_ARCH_ESP32)
-  return sin16_t(theta + 0x4000); //cos(x) = sin(x+pi/2)
+static uint8_t sin8_calc(uint8_t theta) {
-}
+  int32_t sin16 = sin16_calc((uint16_t)theta * 257); // 255 * 257 = 0xFFFF
 uint8_t sin8_t(uint8_t theta) {
  int32_t sin16 = sin16_t((uint16_t)theta * 257); // 255 * 257 = 0xFFFF
  sin16 += 0x7FFF + 128; //shift result to range 0-0xFFFF, +128 for rounding
  return min(sin16, int32_t(0xFFFF)) >> 8; // min performs saturation, and prevents overflow
 }
-uint8_t cos8_t(uint8_t theta) {
+// WLEDMM: pre-calculate lookup-table for sin8_t
-  return sin8_t(theta + 64); //cos(x) = sin(x+pi/2)
+uint8_t DRAM_ATTR sinT[256];
 void init_math(void) {
  // fill the sin8_t lookup table: one pre-computed entry per 8-bit angle
  unsigned angle = 0;
  while (angle < 256) {
    sinT[angle] = sin8_calc(angle);
    angle++;
  }
 }
 #else
 void init_math(void) { return;}  // dummy for 8266
 #endif
 // Float sine approximation built on the 16-bit integer sine.
 // theta is scaled so that 2*pi maps to 0xFFFF, the integer sine is evaluated on the
 // resulting 16-bit angle, and its signed result is divided by 0x7FFF to give a float.
 float sin_approx(float theta) {
  uint16_t scaled_theta = (int)(theta * (float)(0xFFFF / M_TWOPI)); // note: do not cast negative float to uint! cast to int first (undefined on C3)
-  int32_t result = sin16_t(scaled_theta);
+  int32_t result = sin16_calc(scaled_theta);
  float sin = float(result) / 0x7FFF;
  return sin;
 }
 // Float cosine approximation built on the 16-bit integer sine.
 // theta is scaled so that 2*pi maps to 0xFFFF, then a quarter turn (0x4000) is added
 // before the integer sine is evaluated; its signed result is divided by 0x7FFF.
 float cos_approx(float theta) {
  uint16_t scaled_theta = (int)(theta * (float)(0xFFFF / M_TWOPI)); // note: do not cast negative float to uint! cast to int first (undefined on C3)
-  int32_t result = sin16_t(scaled_theta + 0x4000);
+  int32_t result = sin16_calc(scaled_theta + 0x4000);
  float cos = float(result) / 0x7FFF;
  return cos;
 }