perf(audio): add ARM NEON SIMD optimizations for audio processing

Implement SIMD-optimized audio operations using ARM NEON for Cortex-A7 targets
Update Makefile and CI configuration to support NEON compilation flags
Add SIMD implementations for common audio operations including:
- Sample clearing and interleaving
- Volume scaling and format conversion
- Channel manipulation and balance adjustment
- Endianness swapping and prefetching
This commit is contained in:
Alex P 2025-09-16 18:18:19 +00:00
parent eca3c52513
commit 140a803ccf
3 changed files with 405 additions and 12 deletions

View File

@ -84,7 +84,10 @@ jobs:
version: v2.0.2 version: v2.0.2
env: env:
CGO_ENABLED: 1 CGO_ENABLED: 1
ALSA_VERSION: ${{ env.ALSA_VERSION }} GOOS: linux
OPUS_VERSION: ${{ env.OPUS_VERSION }} GOARCH: arm
CGO_CFLAGS: "-I${{ steps.build-env.outputs.cache_path }}/alsa-lib-${{ steps.build-env.outputs.alsa_version }}/include -I${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/include -I${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/celt" GOARM: 7
CC: ${{ steps.build-env.outputs.cache_path }}/../rv1106-system/tools/linux/toolchain/arm-rockchip830-linux-uclibcgnueabihf/bin/arm-rockchip830-linux-uclibcgnueabihf-gcc
PKG_CONFIG_PATH: ${{ steps.build-env.outputs.cache_path }}/alsa-lib-${{ steps.build-env.outputs.alsa_version }}/utils:${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}
CGO_CFLAGS: "-O3 -mfpu=neon -mtune=cortex-a7 -mfloat-abi=hard -ftree-vectorize -ffast-math -funroll-loops -mvectorize-with-neon-quad -marm -D__ARM_NEON -I${{ steps.build-env.outputs.cache_path }}/alsa-lib-${{ steps.build-env.outputs.alsa_version }}/include -I${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/include -I${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/celt"
CGO_LDFLAGS: "-L${{ steps.build-env.outputs.cache_path }}/alsa-lib-${{ steps.build-env.outputs.alsa_version }}/src/.libs -lasound -L${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/.libs -lopus -lm -ldl -static" CGO_LDFLAGS: "-L${{ steps.build-env.outputs.cache_path }}/alsa-lib-${{ steps.build-env.outputs.alsa_version }}/src/.libs -lasound -L${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/.libs -lopus -lm -ldl -static"

View File

@ -36,8 +36,8 @@ export PKG_CONFIG_PATH := $(AUDIO_LIBS_DIR)/alsa-lib-$(ALSA_VERSION)/utils:$(AUD
# Common command to clean Go cache with verbose output for all Go builds # Common command to clean Go cache with verbose output for all Go builds
CLEAN_GO_CACHE := @echo "Cleaning Go cache..."; go clean -cache -v CLEAN_GO_CACHE := @echo "Cleaning Go cache..."; go clean -cache -v
# Optimization flags for ARM Cortex-A7 with NEON # Optimization flags for ARM Cortex-A7 with NEON SIMD
OPTIM_CFLAGS := -O3 -mfpu=neon -mtune=cortex-a7 -mfloat-abi=hard -ftree-vectorize -ffast-math -funroll-loops OPTIM_CFLAGS := -O3 -mfpu=neon -mtune=cortex-a7 -mfloat-abi=hard -ftree-vectorize -ffast-math -funroll-loops -mvectorize-with-neon-quad -marm -D__ARM_NEON
# Cross-compilation environment for ARM - exported globally # Cross-compilation environment for ARM - exported globally
export GOOS := linux export GOOS := linux

View File

@ -14,6 +14,34 @@
#include <unistd.h> #include <unistd.h>
#include <errno.h> #include <errno.h>
// ARM NEON SIMD support for Cortex-A7
#ifdef __ARM_NEON
#include <arm_neon.h>
#define SIMD_ENABLED 1
#else
#define SIMD_ENABLED 0
#endif
// Performance optimization flags
static int trace_logging_enabled = 0; // Enable detailed trace logging
// SIMD feature detection and optimization macros
#if SIMD_ENABLED
#define SIMD_ALIGN __attribute__((aligned(16)))
#define SIMD_PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
#else
#define SIMD_ALIGN
#define SIMD_PREFETCH(addr, rw, locality)
#endif
// SIMD initialization and feature detection
static int simd_initialized = 0;
static void simd_init_once(void) {
if (simd_initialized) return;
simd_initialized = 1;
}
// ============================================================================ // ============================================================================
// GLOBAL STATE VARIABLES // GLOBAL STATE VARIABLES
// ============================================================================ // ============================================================================
@ -50,7 +78,7 @@ static int max_backoff_us_global = 500000; // Maximum backoff time
// Performance optimization flags // Performance optimization flags
static const int optimized_buffer_size = 1; // Use optimized buffer sizing static const int optimized_buffer_size = 1; // Use optimized buffer sizing
static int trace_logging_enabled = 0; // Enable detailed trace logging
// ============================================================================ // ============================================================================
// FUNCTION DECLARATIONS // FUNCTION DECLARATIONS
@ -111,6 +139,360 @@ void set_trace_logging(int enabled) {
trace_logging_enabled = enabled; trace_logging_enabled = enabled;
} }
// ============================================================================
// SIMD-OPTIMIZED BUFFER OPERATIONS
// ============================================================================
#if SIMD_ENABLED
/**
* SIMD-optimized buffer clearing for 16-bit audio samples
* Uses ARM NEON to clear 8 samples (16 bytes) per iteration
*
* @param buffer Pointer to 16-bit sample buffer (must be 16-byte aligned)
* @param samples Number of samples to clear
*/
static inline void simd_clear_samples_s16(short *buffer, int samples) {
simd_init_once();
const int16x8_t zero = vdupq_n_s16(0);
int simd_samples = samples & ~7; // Round down to multiple of 8
// Process 8 samples at a time with NEON
for (int i = 0; i < simd_samples; i += 8) {
vst1q_s16(&buffer[i], zero);
}
// Handle remaining samples with scalar operations
for (int i = simd_samples; i < samples; i++) {
buffer[i] = 0;
}
}
/**
* SIMD-optimized stereo sample interleaving
* Combines left and right channel data using NEON zip operations
*
* @param left Left channel samples
* @param right Right channel samples
* @param output Interleaved stereo output
* @param frames Number of frames to process
*/
static inline void simd_interleave_stereo_s16(const short *left, const short *right,
short *output, int frames) {
simd_init_once();
int simd_frames = frames & ~7; // Process 8 frames at a time
for (int i = 0; i < simd_frames; i += 8) {
int16x8_t left_vec = vld1q_s16(&left[i]);
int16x8_t right_vec = vld1q_s16(&right[i]);
// Interleave using zip operations
int16x8x2_t interleaved = vzipq_s16(left_vec, right_vec);
// Store interleaved data
vst1q_s16(&output[i * 2], interleaved.val[0]);
vst1q_s16(&output[i * 2 + 8], interleaved.val[1]);
}
// Handle remaining frames
for (int i = simd_frames; i < frames; i++) {
output[i * 2] = left[i];
output[i * 2 + 1] = right[i];
}
}
/**
* SIMD-optimized volume scaling for 16-bit samples
* Applies volume scaling using NEON multiply operations
*
* @param samples Input/output sample buffer
* @param count Number of samples to scale
* @param volume Volume factor (0.0 to 1.0, converted to fixed-point)
*/
static inline void simd_scale_volume_s16(short *samples, int count, float volume) {
simd_init_once();
// Convert volume to fixed-point (Q15 format)
int16_t vol_fixed = (int16_t)(volume * 32767.0f);
int16x8_t vol_vec = vdupq_n_s16(vol_fixed);
int simd_count = count & ~7;
for (int i = 0; i < simd_count; i += 8) {
int16x8_t samples_vec = vld1q_s16(&samples[i]);
// Multiply and shift right by 15 to maintain Q15 format
int32x4_t low_result = vmull_s16(vget_low_s16(samples_vec), vget_low_s16(vol_vec));
int32x4_t high_result = vmull_s16(vget_high_s16(samples_vec), vget_high_s16(vol_vec));
// Shift right by 15 and narrow back to 16-bit
int16x4_t low_narrow = vshrn_n_s32(low_result, 15);
int16x4_t high_narrow = vshrn_n_s32(high_result, 15);
int16x8_t result = vcombine_s16(low_narrow, high_narrow);
vst1q_s16(&samples[i], result);
}
// Handle remaining samples
for (int i = simd_count; i < count; i++) {
samples[i] = (short)((samples[i] * vol_fixed) >> 15);
}
}
/**
* SIMD-optimized endianness conversion for 16-bit samples
* Swaps byte order using NEON reverse operations
*/
static inline void simd_swap_endian_s16(short *samples, int count) {
int simd_count = count & ~7;
for (int i = 0; i < simd_count; i += 8) {
uint16x8_t samples_vec = vld1q_u16((uint16_t*)&samples[i]);
// Reverse bytes within each 16-bit element
uint8x16_t samples_u8 = vreinterpretq_u8_u16(samples_vec);
uint8x16_t swapped_u8 = vrev16q_u8(samples_u8);
uint16x8_t swapped = vreinterpretq_u16_u8(swapped_u8);
vst1q_u16((uint16_t*)&samples[i], swapped);
}
// Handle remaining samples
for (int i = simd_count; i < count; i++) {
samples[i] = __builtin_bswap16(samples[i]);
}
}
/**
* Convert 16-bit signed samples to 32-bit float samples using NEON
*/
static inline void simd_s16_to_float(const short *input, float *output, int count) {
const float scale = 1.0f / 32768.0f;
float32x4_t scale_vec = vdupq_n_f32(scale);
// Process 4 samples at a time
int simd_count = count & ~3;
for (int i = 0; i < simd_count; i += 4) {
int16x4_t s16_data = vld1_s16(input + i);
int32x4_t s32_data = vmovl_s16(s16_data);
float32x4_t float_data = vcvtq_f32_s32(s32_data);
float32x4_t scaled = vmulq_f32(float_data, scale_vec);
vst1q_f32(output + i, scaled);
}
// Handle remaining samples
for (int i = simd_count; i < count; i++) {
output[i] = (float)input[i] * scale;
}
}
/**
* Convert 32-bit float samples to 16-bit signed samples using NEON
*/
static inline void simd_float_to_s16(const float *input, short *output, int count) {
const float scale = 32767.0f;
float32x4_t scale_vec = vdupq_n_f32(scale);
// Process 4 samples at a time
int simd_count = count & ~3;
for (int i = 0; i < simd_count; i += 4) {
float32x4_t float_data = vld1q_f32(input + i);
float32x4_t scaled = vmulq_f32(float_data, scale_vec);
int32x4_t s32_data = vcvtq_s32_f32(scaled);
int16x4_t s16_data = vqmovn_s32(s32_data);
vst1_s16(output + i, s16_data);
}
// Handle remaining samples
for (int i = simd_count; i < count; i++) {
float scaled = input[i] * scale;
output[i] = (short)__builtin_fmaxf(__builtin_fminf(scaled, 32767.0f), -32768.0f);
}
}
/**
* Convert mono to stereo by duplicating samples using NEON
*/
static inline void simd_mono_to_stereo_s16(const short *mono, short *stereo, int frames) {
// Process 4 frames at a time
int simd_frames = frames & ~3;
for (int i = 0; i < simd_frames; i += 4) {
int16x4_t mono_data = vld1_s16(mono + i);
int16x4x2_t stereo_data = {mono_data, mono_data};
vst2_s16(stereo + i * 2, stereo_data);
}
// Handle remaining frames
for (int i = simd_frames; i < frames; i++) {
stereo[i * 2] = mono[i];
stereo[i * 2 + 1] = mono[i];
}
}
/**
* Convert stereo to mono by averaging channels using NEON
*/
static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int frames) {
// Process 4 frames at a time
int simd_frames = frames & ~3;
for (int i = 0; i < simd_frames; i += 4) {
int16x4x2_t stereo_data = vld2_s16(stereo + i * 2);
int32x4_t left_wide = vmovl_s16(stereo_data.val[0]);
int32x4_t right_wide = vmovl_s16(stereo_data.val[1]);
int32x4_t sum = vaddq_s32(left_wide, right_wide);
int32x4_t avg = vshrq_n_s32(sum, 1);
int16x4_t mono_data = vqmovn_s32(avg);
vst1_s16(mono + i, mono_data);
}
// Handle remaining frames
for (int i = simd_frames; i < frames; i++) {
mono[i] = (stereo[i * 2] + stereo[i * 2 + 1]) / 2;
}
}
/**
* Apply stereo balance adjustment using NEON
*/
static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, float balance) {
// Balance: -1.0 = full left, 0.0 = center, 1.0 = full right
float left_gain = balance <= 0.0f ? 1.0f : 1.0f - balance;
float right_gain = balance >= 0.0f ? 1.0f : 1.0f + balance;
float32x4_t left_gain_vec = vdupq_n_f32(left_gain);
float32x4_t right_gain_vec = vdupq_n_f32(right_gain);
// Process 4 frames at a time
int simd_frames = frames & ~3;
for (int i = 0; i < simd_frames; i += 4) {
int16x4x2_t stereo_data = vld2_s16(stereo + i * 2);
// Convert to float for processing
int32x4_t left_wide = vmovl_s16(stereo_data.val[0]);
int32x4_t right_wide = vmovl_s16(stereo_data.val[1]);
float32x4_t left_float = vcvtq_f32_s32(left_wide);
float32x4_t right_float = vcvtq_f32_s32(right_wide);
// Apply balance
left_float = vmulq_f32(left_float, left_gain_vec);
right_float = vmulq_f32(right_float, right_gain_vec);
// Convert back to int16
int32x4_t left_result = vcvtq_s32_f32(left_float);
int32x4_t right_result = vcvtq_s32_f32(right_float);
stereo_data.val[0] = vqmovn_s32(left_result);
stereo_data.val[1] = vqmovn_s32(right_result);
vst2_s16(stereo + i * 2, stereo_data);
}
// Handle remaining frames
for (int i = simd_frames; i < frames; i++) {
stereo[i * 2] = (short)(stereo[i * 2] * left_gain);
stereo[i * 2 + 1] = (short)(stereo[i * 2 + 1] * right_gain);
}
}
/**
* Deinterleave stereo samples into separate left/right channels using NEON
*/
static inline void simd_deinterleave_stereo_s16(const short *interleaved, short *left,
short *right, int frames) {
// Process 4 frames at a time
int simd_frames = frames & ~3;
for (int i = 0; i < simd_frames; i += 4) {
int16x4x2_t stereo_data = vld2_s16(interleaved + i * 2);
vst1_s16(left + i, stereo_data.val[0]);
vst1_s16(right + i, stereo_data.val[1]);
}
// Handle remaining frames
for (int i = simd_frames; i < frames; i++) {
left[i] = interleaved[i * 2];
right[i] = interleaved[i * 2 + 1];
}
}
#else
// Fallback implementations for non-SIMD builds
static inline void simd_clear_samples_s16(short *buffer, int samples) {
simd_init_once();
memset(buffer, 0, samples * sizeof(short));
}
static inline void simd_interleave_stereo_s16(const short *left, const short *right,
short *output, int frames) {
simd_init_once();
for (int i = 0; i < frames; i++) {
output[i * 2] = left[i];
output[i * 2 + 1] = right[i];
}
}
static inline void simd_scale_volume_s16(short *samples, int count, float volume) {
simd_init_once();
for (int i = 0; i < count; i++) {
samples[i] = (short)(samples[i] * volume);
}
}
static inline void simd_swap_endian_s16(short *samples, int count) {
for (int i = 0; i < count; i++) {
samples[i] = __builtin_bswap16(samples[i]);
}
}
static inline void simd_s16_to_float(const short *input, float *output, int count) {
const float scale = 1.0f / 32768.0f;
for (int i = 0; i < count; i++) {
output[i] = (float)input[i] * scale;
}
}
static inline void simd_float_to_s16(const float *input, short *output, int count) {
const float scale = 32767.0f;
for (int i = 0; i < count; i++) {
float scaled = input[i] * scale;
output[i] = (short)__builtin_fmaxf(__builtin_fminf(scaled, 32767.0f), -32768.0f);
}
}
static inline void simd_mono_to_stereo_s16(const short *mono, short *stereo, int frames) {
for (int i = 0; i < frames; i++) {
stereo[i * 2] = mono[i];
stereo[i * 2 + 1] = mono[i];
}
}
static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int frames) {
for (int i = 0; i < frames; i++) {
mono[i] = (stereo[i * 2] + stereo[i * 2 + 1]) / 2;
}
}
static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, float balance) {
float left_gain = balance <= 0.0f ? 1.0f : 1.0f - balance;
float right_gain = balance >= 0.0f ? 1.0f : 1.0f + balance;
for (int i = 0; i < frames; i++) {
stereo[i * 2] = (short)(stereo[i * 2] * left_gain);
stereo[i * 2 + 1] = (short)(stereo[i * 2 + 1] * right_gain);
}
}
static inline void simd_deinterleave_stereo_s16(const short *interleaved, short *left,
short *right, int frames) {
for (int i = 0; i < frames; i++) {
left[i] = interleaved[i * 2];
right[i] = interleaved[i * 2 + 1];
}
}
#endif
// ============================================================================ // ============================================================================
// INITIALIZATION STATE TRACKING // INITIALIZATION STATE TRACKING
// ============================================================================ // ============================================================================
@ -300,6 +682,9 @@ static int configure_alsa_device(snd_pcm_t *handle, const char *device_name) {
int jetkvm_audio_capture_init() { int jetkvm_audio_capture_init() {
int err; int err;
// Initialize SIMD capabilities early
simd_init_once();
// Prevent concurrent initialization // Prevent concurrent initialization
if (__sync_bool_compare_and_swap(&capture_initializing, 0, 1) == 0) { if (__sync_bool_compare_and_swap(&capture_initializing, 0, 1) == 0) {
return -EBUSY; // Already initializing return -EBUSY; // Already initializing
@ -390,11 +775,12 @@ int jetkvm_audio_capture_init() {
* -1: Initialization error or unrecoverable failure * -1: Initialization error or unrecoverable failure
*/ */
__attribute__((hot)) int jetkvm_audio_read_encode(void * __restrict__ opus_buf) { __attribute__((hot)) int jetkvm_audio_read_encode(void * __restrict__ opus_buf) {
static short __attribute__((aligned(16))) pcm_buffer[1920]; // max 2ch*960, aligned for SIMD static short SIMD_ALIGN pcm_buffer[1920]; // max 2ch*960, aligned for SIMD
unsigned char * __restrict__ out = (unsigned char*)opus_buf; unsigned char * __restrict__ out = (unsigned char*)opus_buf;
// Prefetch output buffer for better cache performance // Prefetch output buffer and PCM buffer for better cache performance
__builtin_prefetch(out, 1, 3); SIMD_PREFETCH(out, 1, 3);
SIMD_PREFETCH(pcm_buffer, 0, 3);
int err = 0; int err = 0;
int recovery_attempts = 0; int recovery_attempts = 0;
const int max_recovery_attempts = 3; const int max_recovery_attempts = 3;
@ -479,9 +865,10 @@ retry_read:
} }
} }
// If we got fewer frames than expected, pad with silence // If we got fewer frames than expected, pad with silence using SIMD
if (__builtin_expect(pcm_rc < frame_size, 0)) { if (__builtin_expect(pcm_rc < frame_size, 0)) {
__builtin_memset(&pcm_buffer[pcm_rc * channels], 0, (frame_size - pcm_rc) * channels * sizeof(short)); int remaining_samples = (frame_size - pcm_rc) * channels;
simd_clear_samples_s16(&pcm_buffer[pcm_rc * channels], remaining_samples);
} }
int nb_bytes = opus_encode(encoder, pcm_buffer, frame_size, out, max_packet_size); int nb_bytes = opus_encode(encoder, pcm_buffer, frame_size, out, max_packet_size);
@ -511,6 +898,9 @@ retry_read:
int jetkvm_audio_playback_init() { int jetkvm_audio_playback_init() {
int err; int err;
// Initialize SIMD capabilities early
simd_init_once();
// Prevent concurrent initialization // Prevent concurrent initialization
if (__sync_bool_compare_and_swap(&playback_initializing, 0, 1) == 0) { if (__sync_bool_compare_and_swap(&playback_initializing, 0, 1) == 0) {
return -EBUSY; // Already initializing return -EBUSY; // Already initializing
@ -596,7 +986,7 @@ __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf,
unsigned char * __restrict__ in = (unsigned char*)opus_buf; unsigned char * __restrict__ in = (unsigned char*)opus_buf;
// Prefetch input buffer for better cache performance // Prefetch input buffer for better cache performance
__builtin_prefetch(in, 0, 3); SIMD_PREFETCH(in, 0, 3);
int err = 0; int err = 0;
int recovery_attempts = 0; int recovery_attempts = 0;
const int max_recovery_attempts = 3; const int max_recovery_attempts = 3;