diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index 4c08b85b..0b29b25b 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -84,7 +84,10 @@ jobs: version: v2.0.2 env: CGO_ENABLED: 1 - ALSA_VERSION: ${{ env.ALSA_VERSION }} - OPUS_VERSION: ${{ env.OPUS_VERSION }} - CGO_CFLAGS: "-I${{ steps.build-env.outputs.cache_path }}/alsa-lib-${{ steps.build-env.outputs.alsa_version }}/include -I${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/include -I${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/celt" + GOOS: linux + GOARCH: arm + GOARM: 7 + CC: ${{ steps.build-env.outputs.cache_path }}/../rv1106-system/tools/linux/toolchain/arm-rockchip830-linux-uclibcgnueabihf/bin/arm-rockchip830-linux-uclibcgnueabihf-gcc + PKG_CONFIG_PATH: ${{ steps.build-env.outputs.cache_path }}/alsa-lib-${{ steps.build-env.outputs.alsa_version }}/utils:${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }} + CGO_CFLAGS: "-O3 -mfpu=neon -mtune=cortex-a7 -mfloat-abi=hard -ftree-vectorize -ffast-math -funroll-loops -mvectorize-with-neon-quad -marm -D__ARM_NEON -I${{ steps.build-env.outputs.cache_path }}/alsa-lib-${{ steps.build-env.outputs.alsa_version }}/include -I${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/include -I${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/celt" CGO_LDFLAGS: "-L${{ steps.build-env.outputs.cache_path }}/alsa-lib-${{ steps.build-env.outputs.alsa_version }}/src/.libs -lasound -L${{ steps.build-env.outputs.cache_path }}/opus-${{ steps.build-env.outputs.opus_version }}/.libs -lopus -lm -ldl -static" diff --git a/Makefile b/Makefile index d831b8e9..6ca1dbb8 100644 --- a/Makefile +++ b/Makefile @@ -36,8 +36,8 @@ export PKG_CONFIG_PATH := $(AUDIO_LIBS_DIR)/alsa-lib-$(ALSA_VERSION)/utils:$(AUD # Common command to clean Go cache with verbose output for all Go builds CLEAN_GO_CACHE := @echo "Cleaning Go cache..."; go clean -cache -v -# Optimization flags for ARM Cortex-A7 with NEON -OPTIM_CFLAGS := -O3 -mfpu=neon -mtune=cortex-a7 -mfloat-abi=hard -ftree-vectorize -ffast-math -funroll-loops +# Optimization flags for ARM Cortex-A7 with NEON SIMD +OPTIM_CFLAGS := -O3 -mfpu=neon -mtune=cortex-a7 -mfloat-abi=hard -ftree-vectorize -ffast-math -funroll-loops -mvectorize-with-neon-quad -marm -D__ARM_NEON # Cross-compilation environment for ARM - exported globally export GOOS := linux diff --git a/internal/audio/c/audio.c b/internal/audio/c/audio.c index f68386fe..66725cea 100644 --- a/internal/audio/c/audio.c +++ b/internal/audio/c/audio.c @@ -14,6 +14,34 @@ #include #include +// ARM NEON SIMD support for Cortex-A7 +#ifdef __ARM_NEON +#include +#define SIMD_ENABLED 1 +#else +#define SIMD_ENABLED 0 +#endif + +// Performance optimization flags +static int trace_logging_enabled = 0; // Enable detailed trace logging + +// SIMD feature detection and optimization macros +#if SIMD_ENABLED +#define SIMD_ALIGN __attribute__((aligned(16))) +#define SIMD_PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) +#else +#define SIMD_ALIGN +#define SIMD_PREFETCH(addr, rw, locality) +#endif + +// SIMD initialization and feature detection +static int simd_initialized = 0; + +static void simd_init_once(void) { + if (simd_initialized) return; + simd_initialized = 1; +} + // ============================================================================ // GLOBAL STATE VARIABLES // ============================================================================ @@ -50,7 +78,7 @@ static int max_backoff_us_global = 500000; // Maximum backoff time // Performance optimization flags static const int optimized_buffer_size = 1; // Use optimized buffer sizing -static int trace_logging_enabled = 0; // Enable detailed trace logging + // ============================================================================ // FUNCTION DECLARATIONS @@ -111,6 +139,360 @@ void set_trace_logging(int enabled) { trace_logging_enabled = enabled; } +// ============================================================================ +// SIMD-OPTIMIZED BUFFER OPERATIONS +// ============================================================================ + +#if SIMD_ENABLED +/** + * SIMD-optimized buffer clearing for 16-bit audio samples + * Uses ARM NEON to clear 8 samples (16 bytes) per iteration + * + * @param buffer Pointer to 16-bit sample buffer (must be 16-byte aligned) + * @param samples Number of samples to clear + */ +static inline void simd_clear_samples_s16(short *buffer, int samples) { + simd_init_once(); + + const int16x8_t zero = vdupq_n_s16(0); + int simd_samples = samples & ~7; // Round down to multiple of 8 + + // Process 8 samples at a time with NEON + for (int i = 0; i < simd_samples; i += 8) { + vst1q_s16(&buffer[i], zero); + } + + // Handle remaining samples with scalar operations + for (int i = simd_samples; i < samples; i++) { + buffer[i] = 0; + } +} + +/** + * SIMD-optimized stereo sample interleaving + * Combines left and right channel data using NEON zip operations + * + * @param left Left channel samples + * @param right Right channel samples + * @param output Interleaved stereo output + * @param frames Number of frames to process + */ +static inline void simd_interleave_stereo_s16(const short *left, const short *right, + short *output, int frames) { + simd_init_once(); + + int simd_frames = frames & ~7; // Process 8 frames at a time + + for (int i = 0; i < simd_frames; i += 8) { + int16x8_t left_vec = vld1q_s16(&left[i]); + int16x8_t right_vec = vld1q_s16(&right[i]); + + // Interleave using zip operations + int16x8x2_t interleaved = vzipq_s16(left_vec, right_vec); + + // Store interleaved data + vst1q_s16(&output[i * 2], interleaved.val[0]); + vst1q_s16(&output[i * 2 + 8], interleaved.val[1]); + } + + // Handle remaining frames + for (int i = simd_frames; i < frames; i++) { + output[i * 2] = left[i]; + output[i * 2 + 1] = right[i]; + } +} + +/** + * SIMD-optimized volume scaling for 16-bit samples + * Applies volume scaling using NEON multiply operations + * + * @param samples Input/output sample buffer + * @param count Number of samples to scale + * @param volume Volume factor (0.0 to 1.0, converted to fixed-point) + */ +static inline void simd_scale_volume_s16(short *samples, int count, float volume) { + simd_init_once(); + + // Convert volume to fixed-point (Q15 format) + int16_t vol_fixed = (int16_t)(volume * 32767.0f); + int16x8_t vol_vec = vdupq_n_s16(vol_fixed); + + int simd_count = count & ~7; + + for (int i = 0; i < simd_count; i += 8) { + int16x8_t samples_vec = vld1q_s16(&samples[i]); + + // Multiply and shift right by 15 to maintain Q15 format + int32x4_t low_result = vmull_s16(vget_low_s16(samples_vec), vget_low_s16(vol_vec)); + int32x4_t high_result = vmull_s16(vget_high_s16(samples_vec), vget_high_s16(vol_vec)); + + // Shift right by 15 and narrow back to 16-bit + int16x4_t low_narrow = vshrn_n_s32(low_result, 15); + int16x4_t high_narrow = vshrn_n_s32(high_result, 15); + + int16x8_t result = vcombine_s16(low_narrow, high_narrow); + vst1q_s16(&samples[i], result); + } + + // Handle remaining samples + for (int i = simd_count; i < count; i++) { + samples[i] = (short)((samples[i] * vol_fixed) >> 15); + } +} + +/** + * SIMD-optimized endianness conversion for 16-bit samples + * Swaps byte order using NEON reverse operations + */ +static inline void simd_swap_endian_s16(short *samples, int count) { + int simd_count = count & ~7; + + for (int i = 0; i < simd_count; i += 8) { + uint16x8_t samples_vec = vld1q_u16((uint16_t*)&samples[i]); + + // Reverse bytes within each 16-bit element + uint8x16_t samples_u8 = vreinterpretq_u8_u16(samples_vec); + uint8x16_t swapped_u8 = vrev16q_u8(samples_u8); + uint16x8_t swapped = vreinterpretq_u16_u8(swapped_u8); + + vst1q_u16((uint16_t*)&samples[i], swapped); + } + + // Handle remaining samples + for (int i = simd_count; i < count; i++) { + samples[i] = __builtin_bswap16(samples[i]); + } +} + +/** + * Convert 16-bit signed samples to 32-bit float samples using NEON + */ +static inline void simd_s16_to_float(const short *input, float *output, int count) { + const float scale = 1.0f / 32768.0f; + float32x4_t scale_vec = vdupq_n_f32(scale); + + // Process 4 samples at a time + int simd_count = count & ~3; + for (int i = 0; i < simd_count; i += 4) { + int16x4_t s16_data = vld1_s16(input + i); + int32x4_t s32_data = vmovl_s16(s16_data); + float32x4_t float_data = vcvtq_f32_s32(s32_data); + float32x4_t scaled = vmulq_f32(float_data, scale_vec); + vst1q_f32(output + i, scaled); + } + + // Handle remaining samples + for (int i = simd_count; i < count; i++) { + output[i] = (float)input[i] * scale; + } +} + +/** + * Convert 32-bit float samples to 16-bit signed samples using NEON + */ +static inline void simd_float_to_s16(const float *input, short *output, int count) { + const float scale = 32767.0f; + float32x4_t scale_vec = vdupq_n_f32(scale); + + // Process 4 samples at a time + int simd_count = count & ~3; + for (int i = 0; i < simd_count; i += 4) { + float32x4_t float_data = vld1q_f32(input + i); + float32x4_t scaled = vmulq_f32(float_data, scale_vec); + int32x4_t s32_data = vcvtq_s32_f32(scaled); + int16x4_t s16_data = vqmovn_s32(s32_data); + vst1_s16(output + i, s16_data); + } + + // Handle remaining samples + for (int i = simd_count; i < count; i++) { + float scaled = input[i] * scale; + output[i] = (short)__builtin_fmaxf(__builtin_fminf(scaled, 32767.0f), -32768.0f); + } +} + +/** + * Convert mono to stereo by duplicating samples using NEON + */ +static inline void simd_mono_to_stereo_s16(const short *mono, short *stereo, int frames) { + // Process 4 frames at a time + int simd_frames = frames & ~3; + for (int i = 0; i < simd_frames; i += 4) { + int16x4_t mono_data = vld1_s16(mono + i); + int16x4x2_t stereo_data = {mono_data, mono_data}; + vst2_s16(stereo + i * 2, stereo_data); + } + + // Handle remaining frames + for (int i = simd_frames; i < frames; i++) { + stereo[i * 2] = mono[i]; + stereo[i * 2 + 1] = mono[i]; + } +} + +/** + * Convert stereo to mono by averaging channels using NEON + */ +static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int frames) { + // Process 4 frames at a time + int simd_frames = frames & ~3; + for (int i = 0; i < simd_frames; i += 4) { + int16x4x2_t stereo_data = vld2_s16(stereo + i * 2); + int32x4_t left_wide = vmovl_s16(stereo_data.val[0]); + int32x4_t right_wide = vmovl_s16(stereo_data.val[1]); + int32x4_t sum = vaddq_s32(left_wide, right_wide); + int32x4_t avg = vshrq_n_s32(sum, 1); + int16x4_t mono_data = vqmovn_s32(avg); + vst1_s16(mono + i, mono_data); + } + + // Handle remaining frames + for (int i = simd_frames; i < frames; i++) { + mono[i] = (stereo[i * 2] + stereo[i * 2 + 1]) / 2; + } +} + +/** + * Apply stereo balance adjustment using NEON + */ +static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, float balance) { + // Balance: -1.0 = full left, 0.0 = center, 1.0 = full right + float left_gain = balance <= 0.0f ? 1.0f : 1.0f - balance; + float right_gain = balance >= 0.0f ? 1.0f : 1.0f + balance; + + float32x4_t left_gain_vec = vdupq_n_f32(left_gain); + float32x4_t right_gain_vec = vdupq_n_f32(right_gain); + + // Process 4 frames at a time + int simd_frames = frames & ~3; + for (int i = 0; i < simd_frames; i += 4) { + int16x4x2_t stereo_data = vld2_s16(stereo + i * 2); + + // Convert to float for processing + int32x4_t left_wide = vmovl_s16(stereo_data.val[0]); + int32x4_t right_wide = vmovl_s16(stereo_data.val[1]); + float32x4_t left_float = vcvtq_f32_s32(left_wide); + float32x4_t right_float = vcvtq_f32_s32(right_wide); + + // Apply balance + left_float = vmulq_f32(left_float, left_gain_vec); + right_float = vmulq_f32(right_float, right_gain_vec); + + // Convert back to int16 + int32x4_t left_result = vcvtq_s32_f32(left_float); + int32x4_t right_result = vcvtq_s32_f32(right_float); + stereo_data.val[0] = vqmovn_s32(left_result); + stereo_data.val[1] = vqmovn_s32(right_result); + + vst2_s16(stereo + i * 2, stereo_data); + } + + // Handle remaining frames + for (int i = simd_frames; i < frames; i++) { + stereo[i * 2] = (short)(stereo[i * 2] * left_gain); + stereo[i * 2 + 1] = (short)(stereo[i * 2 + 1] * right_gain); + } +} + +/** + * Deinterleave stereo samples into separate left/right channels using NEON + */ +static inline void simd_deinterleave_stereo_s16(const short *interleaved, short *left, + short *right, int frames) { + // Process 4 frames at a time + int simd_frames = frames & ~3; + for (int i = 0; i < simd_frames; i += 4) { + int16x4x2_t stereo_data = vld2_s16(interleaved + i * 2); + vst1_s16(left + i, stereo_data.val[0]); + vst1_s16(right + i, stereo_data.val[1]); + } + + // Handle remaining frames + for (int i = simd_frames; i < frames; i++) { + left[i] = interleaved[i * 2]; + right[i] = interleaved[i * 2 + 1]; + } +} + +#else +// Fallback implementations for non-SIMD builds +static inline void simd_clear_samples_s16(short *buffer, int samples) { + simd_init_once(); + + memset(buffer, 0, samples * sizeof(short)); +} + +static inline void simd_interleave_stereo_s16(const short *left, const short *right, + short *output, int frames) { + simd_init_once(); + + for (int i = 0; i < frames; i++) { + output[i * 2] = left[i]; + output[i * 2 + 1] = right[i]; + } +} + +static inline void simd_scale_volume_s16(short *samples, int count, float volume) { + simd_init_once(); + + for (int i = 0; i < count; i++) { + samples[i] = (short)(samples[i] * volume); + } +} + +static inline void simd_swap_endian_s16(short *samples, int count) { + for (int i = 0; i < count; i++) { + samples[i] = __builtin_bswap16(samples[i]); + } +} + +static inline void simd_s16_to_float(const short *input, float *output, int count) { + const float scale = 1.0f / 32768.0f; + for (int i = 0; i < count; i++) { + output[i] = (float)input[i] * scale; + } +} + +static inline void simd_float_to_s16(const float *input, short *output, int count) { + const float scale = 32767.0f; + for (int i = 0; i < count; i++) { + float scaled = input[i] * scale; + output[i] = (short)__builtin_fmaxf(__builtin_fminf(scaled, 32767.0f), -32768.0f); + } +} + +static inline void simd_mono_to_stereo_s16(const short *mono, short *stereo, int frames) { + for (int i = 0; i < frames; i++) { + stereo[i * 2] = mono[i]; + stereo[i * 2 + 1] = mono[i]; + } +} + +static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int frames) { + for (int i = 0; i < frames; i++) { + mono[i] = (stereo[i * 2] + stereo[i * 2 + 1]) / 2; + } +} + +static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, float balance) { + float left_gain = balance <= 0.0f ? 1.0f : 1.0f - balance; + float right_gain = balance >= 0.0f ? 1.0f : 1.0f + balance; + + for (int i = 0; i < frames; i++) { + stereo[i * 2] = (short)(stereo[i * 2] * left_gain); + stereo[i * 2 + 1] = (short)(stereo[i * 2 + 1] * right_gain); + } +} + +static inline void simd_deinterleave_stereo_s16(const short *interleaved, short *left, + short *right, int frames) { + for (int i = 0; i < frames; i++) { + left[i] = interleaved[i * 2]; + right[i] = interleaved[i * 2 + 1]; + } +} +#endif + // ============================================================================ // INITIALIZATION STATE TRACKING // ============================================================================ @@ -300,6 +682,9 @@ static int configure_alsa_device(snd_pcm_t *handle, const char *device_name) { int jetkvm_audio_capture_init() { int err; + // Initialize SIMD capabilities early + simd_init_once(); + // Prevent concurrent initialization if (__sync_bool_compare_and_swap(&capture_initializing, 0, 1) == 0) { return -EBUSY; // Already initializing @@ -390,11 +775,12 @@ int jetkvm_audio_capture_init() { * -1: Initialization error or unrecoverable failure */ __attribute__((hot)) int jetkvm_audio_read_encode(void * __restrict__ opus_buf) { - static short __attribute__((aligned(16))) pcm_buffer[1920]; // max 2ch*960, aligned for SIMD + static short SIMD_ALIGN pcm_buffer[1920]; // max 2ch*960, aligned for SIMD unsigned char * __restrict__ out = (unsigned char*)opus_buf; - // Prefetch output buffer for better cache performance - __builtin_prefetch(out, 1, 3); + // Prefetch output buffer and PCM buffer for better cache performance + SIMD_PREFETCH(out, 1, 3); + SIMD_PREFETCH(pcm_buffer, 0, 3); int err = 0; int recovery_attempts = 0; const int max_recovery_attempts = 3; @@ -479,9 +865,10 @@ retry_read: } } - // If we got fewer frames than expected, pad with silence + // If we got fewer frames than expected, pad with silence using SIMD if (__builtin_expect(pcm_rc < frame_size, 0)) { - __builtin_memset(&pcm_buffer[pcm_rc * channels], 0, (frame_size - pcm_rc) * channels * sizeof(short)); + int remaining_samples = (frame_size - pcm_rc) * channels; + simd_clear_samples_s16(&pcm_buffer[pcm_rc * channels], remaining_samples); } int nb_bytes = opus_encode(encoder, pcm_buffer, frame_size, out, max_packet_size); @@ -511,6 +898,9 @@ retry_read: int jetkvm_audio_playback_init() { int err; + // Initialize SIMD capabilities early + simd_init_once(); + // Prevent concurrent initialization if (__sync_bool_compare_and_swap(&playback_initializing, 0, 1) == 0) { return -EBUSY; // Already initializing @@ -596,7 +986,7 @@ __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf, unsigned char * __restrict__ in = (unsigned char*)opus_buf; // Prefetch input buffer for better cache performance - __builtin_prefetch(in, 0, 3); + SIMD_PREFETCH(in, 0, 3); int err = 0; int recovery_attempts = 0; const int max_recovery_attempts = 3;