diff --git a/wled00/FX.cpp b/wled00/FX.cpp
index a4a6822c..26dfd110 100644
--- a/wled00/FX.cpp
+++ b/wled00/FX.cpp
@@ -5883,8 +5883,8 @@ static const char _data_FX_MODE_2DSNOWFALL[] PROGMEM = "Snow Fall ☾@!,Spawn Ra
 uint16_t mode_2DHiphotic() {                        //  By: ldirko  https://editor.soulmatelights.com/gallery/810 , Modified by: Andrew Tuline
   if (!strip.isMatrix) return mode_oops(); // not a 2D set-up
 
-  const uint16_t cols = SEGMENT.virtualWidth();
-  const uint16_t rows = SEGMENT.virtualHeight();
+  const uint_fast16_t cols = SEGMENT.virtualWidth();
+  const uint_fast16_t rows = SEGMENT.virtualHeight();
   const uint32_t a = strip.now / ((SEGMENT.custom3>>1)+1);
 
   for (int x = 0; x < cols; x++) {
@@ -8654,10 +8654,10 @@ static const char _data_FX_MODE_2DAKEMI[] PROGMEM = "Akemi@Color speed,Dance;Hea
 uint16_t mode_2Ddistortionwaves() {
   if (!strip.isMatrix) return mode_oops(); // not a 2D set-up
 
-  const uint16_t cols = SEGMENT.virtualWidth();
-  const uint16_t rows = SEGMENT.virtualHeight();
+  const uint_fast16_t cols = SEGMENT.virtualWidth();
+  const uint_fast16_t rows = SEGMENT.virtualHeight();
   if (SEGENV.call == 0) {
-    SEGMENT.setUpLeds();
+    //SEGMENT.setUpLeds();
     SEGMENT.fill(BLACK);
   }
 
diff --git a/wled00/colorTools.hpp b/wled00/colorTools.hpp
index 27249e31..a2986f94 100644
--- a/wled00/colorTools.hpp
+++ b/wled00/colorTools.hpp
@@ -19,6 +19,7 @@
 
 #if !defined(FASTLED_VERSION) // pull in FastLED if we don't have it yet (we need the CRGB type)
   #define FASTLED_INTERNAL
+  #define USE_GET_MILLISECOND_TIMER   // NOTE(review): presumably makes FastLED take its time base from a project-supplied get_millisecond_timer() - confirm that function is defined and linked
   #include <FastLED.h>
 #endif
 
@@ -162,7 +163,8 @@ inline __attribute__((hot)) CRGB ColorFromPaletteWLED(const CRGBPalette16& pal,
     green1 = (green1 * scale) >> 8;
     blue1  = (blue1 * scale) >> 8;
   }
-  return RGBW32(red1,green1,blue1,0);
+  //return RGBW32(red1,green1,blue1,0);
+  return CRGB(red1,green1,blue1);
 }
 #define ColorFromPalette ColorFromPaletteWLED // override fastled function
 
diff --git a/wled00/colors.cpp b/wled00/colors.cpp
index 33b37180..560e4855 100644
--- a/wled00/colors.cpp
+++ b/wled00/colors.cpp
@@ -486,6 +486,9 @@ IRAM_ATTR_YN __attribute__((hot)) uint8_t gamma8_slow(uint8_t b)   // WLEDMM add
   return gammaT[b];
 }
 
+#if defined(ARDUINO_ARCH_ESP32)
+// WLEDMM: gamma32() moved to fcn_declare.h (inlining for speed)
+#else
 // used for color gamma correction
 IRAM_ATTR_YN uint32_t __attribute__((hot)) gamma32(uint32_t color)
 {
@@ -500,3 +503,4 @@ IRAM_ATTR_YN uint32_t __attribute__((hot)) gamma32(uint32_t color)
   b = gammaT[b];
   return RGBW32(r, g, b, w);
 }
+#endif
\ No newline at end of file
diff --git a/wled00/fcn_declare.h b/wled00/fcn_declare.h
index e114bc9f..73b23566 100644
--- a/wled00/fcn_declare.h
+++ b/wled00/fcn_declare.h
@@ -79,7 +79,6 @@ void setRandomColor(byte* rgb);
 uint8_t gamma8_cal(uint8_t b, float gamma);
 void calcGammaTable(float gamma);
 uint8_t __attribute__((pure)) gamma8_slow(uint8_t b);                                         // WLEDMM: added attribute pure
-uint32_t __attribute__((pure)) gamma32(uint32_t);                                             // WLEDMM: added attribute pure
 uint8_t unGamma8(uint8_t value);                                                              // WLEDMM revert gamma correction
 uint32_t unGamma24(uint32_t c);                                                               // WLEDMM for 24bit color (white left as-is)
 
@@ -89,6 +88,34 @@ extern uint8_t gammaT[256];    // colors.cpp
 inline uint8_t gamma8(uint8_t value) { return gammaT[value];}           // WLEDMM inlined for speed
 inline uint8_t fast_unGamma8(uint8_t value) { return gammaTinv[value];}
 
+#if defined(ARDUINO_ARCH_ESP32)
+#if !defined(RGBW32)           // WLEDMM define color macros in case they are missing
+#define RGBW32(r,g,b,w) (uint32_t((byte(w) << 24) | (byte(r) << 16) | (byte(g) << 8) | (byte(b))))   // packs W into bits 24-31, R 16-23, G 8-15, B 0-7
+#endif
+#if !defined(W) && !defined(R) // WLEDMM define color macros in case they are missing (only W and R are tested by this guard)
+#define R(c) (byte((c) >> 16))   // red:   bits 16-23
+#define G(c) (byte((c) >> 8))    // green: bits 8-15
+#define B(c) (byte(c))           // blue:  bits 0-7
+#define W(c) (byte((c) >> 24))   // white: bits 24-31
+#endif
+
+extern bool gammaCorrectCol;   // wled.h
+inline uint32_t __attribute__((hot)) gamma32(uint32_t color) {          // WLEDMM: moved here for inlining; maps every channel of a packed color through gammaT
+  if (!gammaCorrectCol) return color;   // flag is false: hand the color back untouched
+  uint8_t w = W(color);                 // split the packed value into its four 8-bit channels
+  uint8_t r = R(color);
+  uint8_t g = G(color);
+  uint8_t b = B(color);
+  w = gammaT[w];                        // per-channel table lookup; the white channel is mapped as well
+  r = gammaT[r];
+  g = gammaT[g];
+  b = gammaT[b];
+  return RGBW32(r, g, b, w);            // repack the mapped channels into one 32-bit value
+}
+#else
+uint32_t __attribute__((pure)) gamma32(uint32_t);   // non-ESP32 builds: out-of-line definition remains in colors.cpp
+#endif
+
 #define gamma32inv(c) unGamma24(c)      // WLEDMM alias for upstream compatibility
 #define gamma8inv(c)  fast_unGamma8(c)  // WLEDMM alias for upstream compatibility
 
@@ -489,13 +516,48 @@ void clearEEPROM();
 #endif
 
 //wled_math.cpp
+void init_math();   // WLEDMM: one-time setup of math lookup tables (fills sinT on ESP32, empty stub on other targets); defined in wled_math.cpp
+
+// WLEDMM: math functions inlined for speed
+
+// 16-bit, integer based Bhaskara I's sine approximation: 16*x*(pi - x) / (5*pi^2 - 4*x*(pi - x))
+// input is 16bit unsigned (0-65535), output is 16bit signed (-32767 to +32767)
+// optimized integer implementation by @dedehai
+inline int16_t sin16_t(uint16_t theta) {
+  int scale = 1;                                // sign of the result; stays +1 for the lower half of the input range
+  if (theta > 0x7FFF) {
+    theta = 0xFFFF - theta;                     // fold the upper half of the input range onto 0..0x7FFF
+    scale = -1; // second half of the sine function is negative (pi to 2*pi)
+  }
+  uint32_t precal = theta * (0x7FFF - theta);   // the x*(pi - x) term of the formula, with 0x7FFF taking the place of pi
+  uint64_t numerator = (uint64_t)precal * (4 * 0x7FFF); // 64bit required, the product can exceed 32 bits
+  int32_t denominator = 1342095361 - precal; // 1342095361 is 5 * 0x7FFF^2 / 4
+  int16_t result = numerator / denominator;     // magnitude of the half-wave (non-negative)
+  return result * scale;                        // apply the sign chosen above
+}
+inline int16_t cos16_t(uint16_t theta) {
+  return sin16_t(theta + 0x4000); //cos(x) = sin(x+pi/2); 0x4000 is a quarter of the 16-bit period, and the sum wraps when narrowed back to uint16_t
+}
+
+#if defined(ARDUINO_ARCH_ESP32)
+// WLEDMM: use pre-calculated lookup-table for sin8_t
+extern uint8_t sinT[256];    // wled_math.cpp; filled by init_math()
+inline uint8_t sin8_t(uint8_t theta) { return sinT[theta];}           // plain table read - only meaningful after init_math() has filled sinT
+#else
+// no LUT on 8266, to save 256 bytes of RAM
+inline uint8_t sin8_t(uint8_t theta) {   // computed on every call from sin16_t(): 8-bit angle in, 8-bit value out
+  int32_t sin16 = sin16_t((uint16_t)theta * 257); // 255 * 257 = 0xFFFF, i.e. the 8-bit angle is stretched over the full 16-bit range
+  sin16 += 0x7FFF + 128; //shift result to range 0-0xFFFF, +128 for rounding
+  return min(sin16, int32_t(0xFFFF)) >> 8; // min performs saturation, and prevents overflow; >> 8 keeps the upper byte
+}
+#endif
+inline uint8_t cos8_t(uint8_t theta) {
+  return sin8_t(theta + 64); //cos(x) = sin(x+pi/2); 64 is a quarter of the 8-bit period, and the sum wraps when narrowed back to uint8_t
+}
+
 //float cos_t(float phi); // use float math
 //float sin_t(float phi);
 //float tan_t(float x);
-int16_t sin16_t(uint16_t theta);
-int16_t cos16_t(uint16_t theta);
-uint8_t sin8_t(uint8_t theta);
-uint8_t cos8_t(uint8_t theta);
 
 float sin_approx(float theta); // uses integer math (converted to float), accuracy +/-0.0015 (compared to sinf())
 float cos_approx(float theta);
diff --git a/wled00/util.cpp b/wled00/util.cpp
index c47a7845..63e8b131 100644
--- a/wled00/util.cpp
+++ b/wled00/util.cpp
@@ -843,7 +843,7 @@ static inline int32_t lerpPerlin(int32_t a, int32_t b, int32_t t) {
 }
 
 // 1D Perlin noise function that returns a value in range of -24691 to 24689
-int32_t perlin1D_raw(uint32_t x, bool is16bit) {
+int32_t IRAM_ATTR_YN perlin1D_raw(uint32_t x, bool is16bit) {
   // integer and fractional part coordinates
   int32_t x0 = x >> 16;
   int32_t x1 = x0 + 1;
@@ -861,7 +861,7 @@ int32_t perlin1D_raw(uint32_t x, bool is16bit) {
 }
 
 // 2D Perlin noise function that returns a value in range of -20633 to 20629
-int32_t perlin2D_raw(uint32_t x, uint32_t y, bool is16bit) {
+int32_t IRAM_ATTR_YN perlin2D_raw(uint32_t x, uint32_t y, bool is16bit) {
   int32_t x0 = x >> 16;
   int32_t y0 = y >> 16;
   int32_t x1 = x0 + 1;
@@ -893,7 +893,7 @@ int32_t perlin2D_raw(uint32_t x, uint32_t y, bool is16bit) {
 }
 
 // 3D Perlin noise function that returns a value in range of -16788 to 16381
-int32_t perlin3D_raw(uint32_t x, uint32_t y, uint32_t z, bool is16bit) {
+int32_t IRAM_ATTR_YN perlin3D_raw(uint32_t x, uint32_t y, uint32_t z, bool is16bit) {
   int32_t x0 = x >> 16;
   int32_t y0 = y >> 16;
   int32_t z0 = z >> 16;
diff --git a/wled00/wled.cpp b/wled00/wled.cpp
index a91280e5..7274f2aa 100644
--- a/wled00/wled.cpp
+++ b/wled00/wled.cpp
@@ -474,6 +474,8 @@ void WLED::setup()
   if (!Serial) delay(300);  // just a tiny wait to avoid problems later when acessing serial
 #endif
 
+  init_math();  // WLEDMM: pre-calculate some lookup tables
+
   #ifdef ARDUINO_ARCH_ESP32
   #if defined(WLED_DEBUG) && (defined(CONFIG_IDF_TARGET_ESP32S2) || defined(CONFIG_IDF_TARGET_ESP32C3) || ARDUINO_USB_CDC_ON_BOOT)
   if (!Serial) delay(2500);  // WLEDMM allow CDC USB serial to initialise (WLED_DEBUG only)
diff --git a/wled00/wled_math.cpp b/wled00/wled_math.cpp
index 1a6c9609..4905f194 100644
--- a/wled00/wled_math.cpp
+++ b/wled00/wled_math.cpp
@@ -59,10 +59,18 @@ float tan_t(float x) {
 }
 */
 
+// WLEDMM: sin16_t() moved to fcn_declare.h (inlining for speed)
+
+// WLEDMM: cos16_t() moved to fcn_declare.h (inlining for speed)
+
+// WLEDMM: sin8_t() moved to fcn_declare.h (inlining for speed)
+
+// WLEDMM: cos8_t() moved to fcn_declare.h (inlining for speed)
+
 // 16-bit, integer based Bhaskara I's sine approximation: 16*x*(pi - x) / (5*pi^2 - 4*x*(pi - x))
 // input is 16bit unsigned (0-65535), output is 16bit signed (-32767 to +32767)
 // optimized integer implementation by @dedehai
-int16_t sin16_t(uint16_t theta) {
+static int16_t sin16_calc(uint16_t theta) {
   int scale = 1;
   if (theta > 0x7FFF) {
     theta = 0xFFFF - theta;
@@ -75,30 +83,34 @@ int16_t sin16_t(uint16_t theta) {
   return result * scale;
 }
 
-int16_t cos16_t(uint16_t theta) {
-  return sin16_t(theta + 0x4000); //cos(x) = sin(x+pi/2)
-}
-
-uint8_t sin8_t(uint8_t theta) {
-  int32_t sin16 = sin16_t((uint16_t)theta * 257); // 255 * 257 = 0xFFFF
+#if defined(ARDUINO_ARCH_ESP32)
+static uint8_t sin8_calc(uint8_t theta) {   // 8-bit sine derived from sin16_calc(); used below to fill the sinT table
+  int32_t sin16 = sin16_calc((uint16_t)theta * 257); // 255 * 257 = 0xFFFF, i.e. the 8-bit angle is stretched over the full 16-bit range
   sin16 += 0x7FFF + 128; //shift result to range 0-0xFFFF, +128 for rounding
   return min(sin16, int32_t(0xFFFF)) >> 8; // min performs saturation, and prevents overflow
 }
 
-uint8_t cos8_t(uint8_t theta) {
-  return sin8_t(theta + 64); //cos(x) = sin(x+pi/2)
+// WLEDMM: pre-calculate lookup-table for sin8_t
+uint8_t DRAM_ATTR sinT[256];   // one entry per 8-bit angle; read by the inline sin8_t() in fcn_declare.h
+void init_math(void) {   // called once from WLED::setup()
+   for (unsigned i = 0; i < 256; i++)   // one pass over all 256 possible 8-bit inputs
+    sinT[i] = sin8_calc(i);
 }
 
+#else
+void init_math(void) { return;}  // dummy for 8266: no lookup table to fill, sin8_t() is computed on the fly there
+#endif
+
 float sin_approx(float theta) {
   uint16_t scaled_theta = (int)(theta * (float)(0xFFFF / M_TWOPI)); // note: do not cast negative float to uint! cast to int first (undefined on C3)
-  int32_t result = sin16_t(scaled_theta);
+  int32_t result = sin16_calc(scaled_theta);   // file-local 16-bit sine; NOTE(review): sin16_calc appears to duplicate the inline sin16_t() from fcn_declare.h - confirm both copies are meant to coexist
   float sin = float(result) / 0x7FFF;
   return sin;
 }
 
 float cos_approx(float theta) {
   uint16_t scaled_theta = (int)(theta * (float)(0xFFFF / M_TWOPI)); // note: do not cast negative float to uint! cast to int first (undefined on C3)
-  int32_t result = sin16_t(scaled_theta + 0x4000);
+  int32_t result = sin16_calc(scaled_theta + 0x4000);   // +0x4000 is a quarter-period phase shift: cos(x) = sin(x+pi/2)
   float cos = float(result) / 0x7FFF;
   return cos;
 }