From 532c9b762ce158e42c3967c14e5af0707fc831b4 Mon Sep 17 00:00:00 2001 From: Damian Schneider Date: Mon, 20 Jan 2025 05:51:04 +0100 Subject: [PATCH] replacement for fastled sqrt16() (#4426) * added bitwise operation based sqrt16 - replacement for fastled, it is about 10% slower for numbers smaller than 128 but faster for larger numbers. speed difference is irrelevant to WLED but it saves some flash. * updated to 32bit, improved for typical WLED use - making it 32bits allows for larger numbers - added another initial condition check for medium sized numbers - increased the "small number" optimization to larger numbers: the function is currently only used to calculate sqrt(x^2+y^2) which even for small segments is larger than the initially used 64, so optimizing for 1024 makes more sense, although the value is arbitrarily chosen --- wled00/FX.cpp | 10 +++++----- wled00/FX_fcn.cpp | 2 +- wled00/fcn_declare.h | 1 + wled00/wled_math.cpp | 26 +++++++++++++++++++++++++- 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/wled00/FX.cpp b/wled00/FX.cpp index cbe57823..34cd661e 100644 --- a/wled00/FX.cpp +++ b/wled00/FX.cpp @@ -6179,15 +6179,15 @@ uint16_t mode_2Dmetaballs(void) { // Metaballs by Stefan Petrick. Cannot have // and add them together with weightening uint16_t dx = abs(x - x1); uint16_t dy = abs(y - y1); - uint16_t dist = 2 * sqrt16((dx * dx) + (dy * dy)); + uint16_t dist = 2 * sqrt32_bw((dx * dx) + (dy * dy)); dx = abs(x - x2); dy = abs(y - y2); - dist += sqrt16((dx * dx) + (dy * dy)); + dist += sqrt32_bw((dx * dx) + (dy * dy)); dx = abs(x - x3); dy = abs(y - y3); - dist += sqrt16((dx * dx) + (dy * dy)); + dist += sqrt32_bw((dx * dx) + (dy * dy)); // inverse result byte color = dist ? 
1000 / dist : 255; @@ -11744,7 +11744,7 @@ uint16_t mode_particle1DsonicStream(void) { else PartSys->particles[i].ttl = 0; } if (SEGMENT.check1) { // modulate colors by mid frequencies - int mids = sqrt16((int)fftResult[5] + (int)fftResult[6] + (int)fftResult[7] + (int)fftResult[8] + (int)fftResult[9] + (int)fftResult[10]); // average the mids, bin 5 is ~500Hz, bin 10 is ~2kHz (see audio_reactive.h) + int mids = sqrt32_bw((int)fftResult[5] + (int)fftResult[6] + (int)fftResult[7] + (int)fftResult[8] + (int)fftResult[9] + (int)fftResult[10]); // average the mids, bin 5 is ~500Hz, bin 10 is ~2kHz (see audio_reactive.h) PartSys->particles[i].hue += (mids * perlin8(PartSys->particles[i].x << 2, SEGMENT.step << 2)) >> 9; // color by perlin noise from mid frequencies } } @@ -11832,7 +11832,7 @@ uint16_t mode_particle1DsonicBoom(void) { // particle manipulation for (uint32_t i = 0; i < PartSys->usedParticles; i++) { if (SEGMENT.check1) { // modulate colors by mid frequencies - int mids = sqrt16((int)fftResult[5] + (int)fftResult[6] + (int)fftResult[7] + (int)fftResult[8] + (int)fftResult[9] + (int)fftResult[10]); // average the mids, bin 5 is ~500Hz, bin 10 is ~2kHz (see audio_reactive.h) + int mids = sqrt32_bw((int)fftResult[5] + (int)fftResult[6] + (int)fftResult[7] + (int)fftResult[8] + (int)fftResult[9] + (int)fftResult[10]); // average the mids, bin 5 is ~500Hz, bin 10 is ~2kHz (see audio_reactive.h) PartSys->particles[i].hue += (mids * perlin8(PartSys->particles[i].x << 2, SEGMENT.step << 2)) >> 9; // color by perlin noise from mid frequencies } if (PartSys->particles[i].ttl > 16) { diff --git a/wled00/FX_fcn.cpp b/wled00/FX_fcn.cpp index 8de364a0..996bb57b 100644 --- a/wled00/FX_fcn.cpp +++ b/wled00/FX_fcn.cpp @@ -932,7 +932,7 @@ uint16_t Segment::calc_virtualLength() const { break; case M12_pArc: { unsigned vLen2 = vW * vW + vH * vH; // length ^2 - if (vLen2 < UINT16_MAX) vLen = sqrt16(vLen2); // use faster function for 16bit values + if (vLen2 < UINT16_MAX) vLen = 
sqrt32_bw(vLen2); // use faster integer sqrt for 16bit values (NOTE(review): sqrt32_bw accepts uint32_t, so this 16bit guard may no longer be needed, TODO confirm) else vLen = sqrtf(vLen2); // fall-back to float if bigger if (vW != vH) vLen++; // round up } diff --git a/wled00/fcn_declare.h b/wled00/fcn_declare.h index 5cfcff46..bb973dfc 100644 --- a/wled00/fcn_declare.h +++ b/wled00/fcn_declare.h @@ -588,6 +588,7 @@ float fmod_t(float num, float denom); #define cos_t cosf #define tan_t tanf */ +uint32_t sqrt32_bw(uint32_t x); //wled_serial.cpp void handleSerial(); diff --git a/wled00/wled_math.cpp b/wled00/wled_math.cpp index 4905f194..1563dee5 100644 --- a/wled00/wled_math.cpp +++ b/wled00/wled_math.cpp @@ -234,4 +234,28 @@ float fmod_t(float num, float denom) { return res; } -#endif // WLEDMM \ No newline at end of file +#endif // WLEDMM + +// bit-wise integer square root calculation (exact): returns the largest integer whose square does not exceed x, using only shifts, additions, subtractions and comparisons +uint32_t sqrt32_bw(uint32_t x) { + uint32_t res = 0; + uint32_t bit; + uint32_t num = x; // remainder: starts as x and is reduced each time a result bit is accepted + + if(num < 1 << 10) bit = 1 << 10; // speed optimization for small numbers < 32^2 (= 1 << 10): start the scan at a lower power of 4 + else if (num < 1 << 20) bit = 1 << 20; // speed optimization for medium numbers < 1024^2 (= 1 << 20) + else bit = 1 << 30; // otherwise start with the highest power of 4 that fits in 32 bits (4^15 = 1 << 30) + + while (bit > num) bit >>= 2; // reduce iterations: skip powers of 4 larger than num, they cannot pass the test below while res is still 0 + + while (bit != 0) { // one pass per remaining power of 4, from high to low + if (num >= res + bit) { // candidate bit fits into the remainder: accept it + num -= res + bit; + res = (res >> 1) + bit; + } else { // candidate bit does not fit: only shift the partial result + res >>= 1; + } + bit >>= 2; + } + return res; +}