diff --git a/wled00/p4_mul16x16.S b/wled00/p4_mul16x16.S
new file mode 100644
index 00000000..ea91165a
--- /dev/null
+++ b/wled00/p4_mul16x16.S
@@ -0,0 +1,55 @@
+#if defined(ARDUINO_ARCH_ESP32P4)
+# p4_mul16x16: scale a pixel buffer by one brightness byte, 16 bytes per pass.
+# Target: ESP32-P4 (RISC-V with the PIE vector extension), GNU as syntax.
+# ESP32-P4 needs -march rv32imafc_zicsr_zifencei_xesppie -mabi ilp32f
+#
+# Inputs:
+#   a0 = out_packet   destination pointer, advanced 16 bytes per pass
+#   a1 = brightness   address of the brightness byte; it is the memory
+#                     operand of esp.vldbc.8.ip, so it must be a pointer
+#   a2 = num_loops    number of 16-byte passes; zero stores nothing
+#   a3 = pixelbuffer  source pointer, advanced 16 bytes per pass
+# Result: each output byte is (pixel * brightness) >> 8, or the pixel
+#   unchanged when the brightness byte is 255.
+# Clobbers: t1, t4, t5, t6, q0, q1, q2 and the PIE CFG and SAR registers
+#   (CFG bit 1 stays set and SAR stays 8 on return).
+# NOTE(review): the 128-bit loads and stores presumably need 16-byte
+#   aligned buffers - confirm at the call site.
+
+.equ P4_MUL_SHIFT, 8              # right shift applied to every product
+.equ P4_MUL_FULL_BRIGHT, 255      # brightness that means copy unscaled
+
+.text
+.align 4
+.global p4_mul16x16
+.type p4_mul16x16,@function
+p4_mul16x16:
+    esp.movx.r.cfg  t6                      # t6 = current PIE CFG
+    ori             t6, t6, 2               # set CFG bit 1 (original note: enables aligned data access)
+    esp.movx.w.cfg  t6
+    li              t6, P4_MUL_SHIFT
+    esp.movx.w.sar  t6                      # SAR = 8, the shift used by esp.vmul.u8
+    lbu             t4, 0(a1)               # t4 = brightness value (a1 itself is only its address)
+    esp.vldbc.8.ip  q1, a1, 0               # q1 = brightness byte broadcast to all 16 lanes
+    beqz            a2, .Lp4_mul_done       # zero passes requested: store nothing
+    li              t1, 0                   # t1 = passes completed
+    li              t5, P4_MUL_FULL_BRIGHT
+    beq             t4, t5, .Lp4_mul_copy   # loop-invariant, so decide once up front
+
+.Lp4_mul_scale:                             # here t1 != a2, at least one pass remains
+    esp.vld.128.ip  q0, a3, 16              # q0 = next 16 pixel bytes, a3 += 16
+    esp.vmul.u8     q2, q0, q1              # q2 = (q0 * q1) >> SAR, lane by lane
+    esp.vst.128.ip  q2, a0, 16              # store scaled bytes, a0 += 16
+    addi            t1, t1, 1
+    bne             t1, a2, .Lp4_mul_scale
+    ret
+
+.Lp4_mul_copy:                              # full brightness: pass pixels through unscaled
+    esp.vld.128.ip  q0, a3, 16              # q0 = next 16 pixel bytes, a3 += 16
+    esp.vst.128.ip  q0, a0, 16              # store them unchanged, a0 += 16
+    addi            t1, t1, 1
+    bne             t1, a2, .Lp4_mul_copy
+.Lp4_mul_done:
+    ret
+.size p4_mul16x16, .-p4_mul16x16
+#endif