From ffd0c418e7fdbf14d94c04e99cb6eb85bc287811 Mon Sep 17 00:00:00 2001
From: KubaPro010
Date: Thu, 27 Mar 2025 21:07:20 +0100
Subject: [PATCH] optimize for arm

init_lpf: compute 1/e once per section and multiply the three
coefficients by it instead of dividing three times.

process_lpf: keep the new state of each section in a local instead of
storing it in filter->w0. The loop stays scalar on purpose: every
section filters the output of the previous one, so the sections cannot
be evaluated four at a time in NEON lanes, and with LPF_ORDER = 10 a
loop stepping by 4 would also load and store indices 10 and 11, beyond
the LPF_ORDER sections the scalar loop uses.

advance_oscillator: add a NEON path selected by USE_NEON, which is
defined in lib/oscillator.h. lib/filters.h needs no NEON block because
filters.c uses no intrinsics.
---
Notes (not part of the commit message):
- This patch was re-flowed from a copy whose line breaks and indentation
  had been collapsed. Context whitespace and the position of blank
  context lines are reconstructed; apply with
  `git apply --ignore-whitespace` and re-check the result.
- `index` lines are omitted because the post-image blobs no longer match
  the hashes recorded in the original patch.

 lib/filters.c    | 27 +++++++++++++++------------
 lib/oscillator.c | 22 +++++++++++++++++++---
 lib/oscillator.h |  7 +++++++
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/lib/filters.c b/lib/filters.c
--- a/lib/filters.c
+++ b/lib/filters.c
@@ -16,26 +16,29 @@ float hard_clip(float sample, float threshold) {
 }
 
 void init_lpf(LPFFilter *filter, float cutoff, int sample_rate) {
-    float a = tanf(M_PI*cutoff/sample_rate);
-    float a2 = a*a;
+    float a = tanf(M_PI * cutoff / sample_rate);
+    float a2 = a * a;
     float r, e;
 
-    for(int i = 0; i < LPF_ORDER; i++) {
-        r = sinf(M_PI*(2.0f*i+1.0f)/(4.0f*LPF_ORDER));
-        e = a2+2.0f*a*r+1.0f;
-        filter->A[i] = a2 / e;
-        filter->d1[i] = 2.0f*(1.0f-a2)/e;
-        filter->d2[i] = -(a2 - 2.0f * a * r + 1.0f) / e;
+    for (int i = 0; i < LPF_ORDER; i++) {
+        r = sinf(M_PI * (2.0f * i + 1.0f) / (4.0f * LPF_ORDER));
+        e = a2 + 2.0f * a * r + 1.0f;
+        float inv_e = 1.0f / e; // one division shared by the three coefficients
+
+        filter->A[i] = a2 * inv_e;
+        filter->d1[i] = 2.0f * (1.0f - a2) * inv_e;
+        filter->d2[i] = -(a2 - 2.0f * a * r + 1.0f) * inv_e;
     }
 }
 
 float process_lpf(LPFFilter *filter, float x) {
     float y = x;
-    for(int i = 0; i < LPF_ORDER; i++) {
-        filter->w0[i] = filter->d1[i] * filter->w1[i] + filter->d2[i] * filter->w2[i] + y;
-        y = filter->A[i] * (filter->w0[i] + 2.0f * filter->w1[i] + filter->w2[i]);
+    // Sections run in series: each one filters the previous section's output.
+    for (int i = 0; i < LPF_ORDER; i++) {
+        float w0_new = filter->d1[i] * filter->w1[i] + filter->d2[i] * filter->w2[i] + y;
+        y = filter->A[i] * (w0_new + 2.0f * filter->w1[i] + filter->w2[i]);
         filter->w2[i] = filter->w1[i];
-        filter->w1[i] = filter->w0[i];
+        filter->w1[i] = w0_new;
     }
     return y;
 }
\ No newline at end of file
diff --git a/lib/oscillator.c b/lib/oscillator.c
--- a/lib/oscillator.c
+++ b/lib/oscillator.c
@@ -35,7 +35,23 @@ float get_oscillator_cos_multiplier_ni(Oscillator *osc, float multiplier) {
 }
 
 void advance_oscillator(Oscillator *osc) {
-    osc->phase += osc->phase_increment;
-    osc->phase -= (osc->phase >= M_2PI) ? M_2PI : 0.0f;
-    osc->phase = (fabsf(osc->phase) < 1e-10f) ? 0.0f : osc->phase;
+#if USE_NEON // Use NEON if available
+    // Same add / compare / subtract as the scalar path on a broadcast copy; only lane 0 is kept.
+    float32x4_t v_phase = vdupq_n_f32(osc->phase);
+    float32x4_t v_increment = vdupq_n_f32(osc->phase_increment);
+    float32x4_t v_twopi = vdupq_n_f32(M_2PI);
+
+    v_phase = vaddq_f32(v_phase, v_increment);
+    uint32x4_t v_mask = vcgeq_f32(v_phase, v_twopi); // Check if phase >= 2π
+    float32x4_t v_wrapped = vsubq_f32(v_phase, v_twopi);
+    v_phase = vbslq_f32(v_mask, v_wrapped, v_phase);
+
+    osc->phase = vgetq_lane_f32(v_phase, 0);
+
+#else // Scalar fallback if NEON is not available
+    osc->phase += osc->phase_increment;
+    if (osc->phase >= M_2PI) {
+        osc->phase -= M_2PI;
+    }
+#endif
 }
diff --git a/lib/oscillator.h b/lib/oscillator.h
--- a/lib/oscillator.h
+++ b/lib/oscillator.h
@@ -1,3 +1,10 @@
 #pragma once
 
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+    #include <arm_neon.h>
+    #define USE_NEON 1
+#else
+    #define USE_NEON 0
+#endif
+
 #include "constants.h"