[WIP] Cleanup: cleanup audio code after HDMI switch

2025-09-30 13:32:56 +00:00 · 2025-09-30 13:32:56 +00:00 · 35b5dbd034
parent 7dc57bcdf3
commit 35b5dbd034
1 changed files with 129 additions and 34 deletions
--- a/internal/audio/c/audio.c
+++ b/internal/audio/c/audio.c
@ -126,17 +126,21 @@ void set_trace_logging(int enabled) {
 /**
 * Clear audio buffer using NEON (8 samples/iteration)
 * @param buffer Audio buffer to clear
 * @param samples Number of samples to zero out
 */
 static inline void simd_clear_samples_s16(short *buffer, int samples) {
    simd_init_once();
    const int16x8_t zero = vdupq_n_s16(0);
    int simd_samples = samples & ~7;
    const int16x8_t zero = vdupq_n_s16(0);
    // SIMD path: zero 8 samples per iteration
    for (int i = 0; i < simd_samples; i += 8) {
        vst1q_s16(&buffer[i], zero);
    }
    // Scalar path: handle remaining samples
    for (int i = simd_samples; i < samples; i++) {
        buffer[i] = 0;
    }
@ -144,12 +148,19 @@ static inline void simd_clear_samples_s16(short *buffer, int samples) {
 /**
 * Interleave L/R channels using NEON (8 frames/iteration)
 * Converts separate left/right buffers to interleaved stereo (LRLRLR...)
 * @param left Left channel samples
 * @param right Right channel samples
 * @param output Interleaved stereo output buffer
 * @param frames Number of stereo frames to process
 */
 static inline void simd_interleave_stereo_s16(const short *left, const short *right,
                                             short *output, int frames) {
    simd_init_once();
    int simd_frames = frames & ~7;
    // SIMD path: interleave 8 frames (16 samples) per iteration
    for (int i = 0; i < simd_frames; i += 8) {
        int16x8_t left_vec = vld1q_s16(&left[i]);
        int16x8_t right_vec = vld1q_s16(&right[i]);
@ -158,6 +169,7 @@ static inline void simd_interleave_stereo_s16(const short *left, const short *ri
        vst1q_s16(&output[i * 2 + 8], interleaved.val[1]);
    }
    // Scalar path: handle remaining frames
    for (int i = simd_frames; i < frames; i++) {
        output[i * 2] = left[i];
        output[i * 2 + 1] = right[i];
@ -166,21 +178,28 @@ static inline void simd_interleave_stereo_s16(const short *left, const short *ri
 /**
 * Apply gain using NEON Q15 fixed-point math (8 samples/iteration)
- * Uses vqrdmulhq_s16: single-instruction saturating rounded multiply-high
+ * Uses vqrdmulhq_s16 for single-instruction saturating rounded multiply-high
 * @param samples Audio buffer to scale in-place
 * @param count Number of samples to process
 * @param volume Gain multiplier (e.g., 2.5 for 2.5x gain)
 */
 static inline void simd_scale_volume_s16(short *samples, int count, float volume) {
    simd_init_once();
-    // For vqrdmulhq_s16, multiply volume by 2 since it extracts bits [30:15] not [31:16]
+
    // Convert float gain to Q14 fixed-point for vqrdmulhq_s16
    // vqrdmulhq_s16 extracts bits [30:15], so multiply by 16384 (2^14) instead of 32768 (2^15)
    int16_t vol_fixed = (int16_t)(volume * 16384.0f);
    int16x8_t vol_vec = vdupq_n_s16(vol_fixed);
    int simd_count = count & ~7;
    // SIMD path: process 8 samples per iteration
    for (int i = 0; i < simd_count; i += 8) {
        int16x8_t samples_vec = vld1q_s16(&samples[i]);
        int16x8_t result = vqrdmulhq_s16(samples_vec, vol_vec);
        vst1q_s16(&samples[i], result);
    }
    // Scalar path: handle remaining samples
    for (int i = simd_count; i < count; i++) {
        samples[i] = (short)((samples[i] * vol_fixed) >> 14);
    }
@ -188,10 +207,14 @@ static inline void simd_scale_volume_s16(short *samples, int count, float volume
 /**
 * Byte-swap 16-bit samples using NEON (8 samples/iteration)
 * Converts between little-endian and big-endian formats
 * @param samples Audio buffer to byte-swap in-place
 * @param count Number of samples to process
 */
 static inline void simd_swap_endian_s16(short *samples, int count) {
    int simd_count = count & ~7;
    // SIMD path: swap 8 samples per iteration
    for (int i = 0; i < simd_count; i += 8) {
        uint16x8_t samples_vec = vld1q_u16((uint16_t*)&samples[i]);
        uint8x16_t samples_u8 = vreinterpretq_u8_u16(samples_vec);
@ -200,6 +223,7 @@ static inline void simd_swap_endian_s16(short *samples, int count) {
        vst1q_u16((uint16_t*)&samples[i], swapped);
    }
    // Scalar path: handle remaining samples
    for (int i = simd_count; i < count; i++) {
        samples[i] = __builtin_bswap16(samples[i]);
    }
@ -207,12 +231,17 @@ static inline void simd_swap_endian_s16(short *samples, int count) {
 /**
 * Convert S16 to float using NEON (4 samples/iteration)
 * Converts 16-bit signed integers to normalized float [-1.0, 1.0]
 * @param input S16 audio samples
 * @param output Float output buffer
 * @param count Number of samples to convert
 */
 static inline void simd_s16_to_float(const short *input, float *output, int count) {
    const float scale = 1.0f / 32768.0f;
    float32x4_t scale_vec = vdupq_n_f32(scale);
    int simd_count = count & ~3;
    float32x4_t scale_vec = vdupq_n_f32(scale);
    // SIMD path: convert 4 samples per iteration
    for (int i = 0; i < simd_count; i += 4) {
        int16x4_t s16_data = vld1_s16(input + i);
        int32x4_t s32_data = vmovl_s16(s16_data);
@ -221,6 +250,7 @@ static inline void simd_s16_to_float(const short *input, float *output, int coun
        vst1q_f32(output + i, scaled);
    }
    // Scalar path: handle remaining samples
    for (int i = simd_count; i < count; i++) {
        output[i] = (float)input[i] * scale;
    }
@ -228,12 +258,17 @@ static inline void simd_s16_to_float(const short *input, float *output, int coun
 /**
 * Convert float to S16 using NEON (4 samples/iteration)
 * Converts normalized float [-1.0, 1.0] to 16-bit signed integers with saturation
 * @param input Float audio samples
 * @param output S16 output buffer
 * @param count Number of samples to convert
 */
 static inline void simd_float_to_s16(const float *input, short *output, int count) {
    const float scale = 32767.0f;
    float32x4_t scale_vec = vdupq_n_f32(scale);
    int simd_count = count & ~3;
    float32x4_t scale_vec = vdupq_n_f32(scale);
    // SIMD path: convert 4 samples per iteration with saturation
    for (int i = 0; i < simd_count; i += 4) {
        float32x4_t float_data = vld1q_f32(input + i);
        float32x4_t scaled = vmulq_f32(float_data, scale_vec);
@ -242,6 +277,7 @@ static inline void simd_float_to_s16(const float *input, short *output, int coun
        vst1_s16(output + i, s16_data);
    }
    // Scalar path: handle remaining samples with clamping
    for (int i = simd_count; i < count; i++) {
        float scaled = input[i] * scale;
        output[i] = (short)__builtin_fmaxf(__builtin_fminf(scaled, 32767.0f), -32768.0f);
@ -250,15 +286,22 @@ static inline void simd_float_to_s16(const float *input, short *output, int coun
 /**
 * Mono → stereo (duplicate samples) using NEON (4 frames/iteration)
 * Duplicates mono samples to both L and R channels
 * @param mono Mono input buffer
 * @param stereo Stereo output buffer
 * @param frames Number of frames to process
 */
 static inline void simd_mono_to_stereo_s16(const short *mono, short *stereo, int frames) {
 	int simd_frames = frames & ~3;
 	// SIMD path: duplicate 4 frames (8 samples) per iteration
 	for (int i = 0; i < simd_frames; i += 4) {
 		int16x4_t mono_data = vld1_s16(mono + i);
 		int16x4x2_t stereo_data = {mono_data, mono_data};
 		vst2_s16(stereo + i * 2, stereo_data);
 	}
 	// Scalar path: handle remaining frames
 	for (int i = simd_frames; i < frames; i++) {
 		stereo[i * 2] = mono[i];
 		stereo[i * 2 + 1] = mono[i];
@ -267,9 +310,15 @@ static inline void simd_mono_to_stereo_s16(const short *mono, short *stereo, int
 /**
 * Stereo → mono (average L+R) using NEON (4 frames/iteration)
 * Downmixes stereo to mono by averaging left and right channels
 * @param stereo Interleaved stereo input buffer
 * @param mono Mono output buffer
 * @param frames Number of frames to process
 */
 static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int frames) {
 	int simd_frames = frames & ~3;
 	// SIMD path: average 4 stereo frames per iteration
 	for (int i = 0; i < simd_frames; i += 4) {
 		int16x4x2_t stereo_data = vld2_s16(stereo + i * 2);
 		int32x4_t left_wide = vmovl_s16(stereo_data.val[0]);
@ -280,6 +329,7 @@ static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int
 		vst1_s16(mono + i, mono_data);
 	}
 	// Scalar path: handle remaining frames
 	for (int i = simd_frames; i < frames; i++) {
 		mono[i] = (stereo[i * 2] + stereo[i * 2 + 1]) / 2;
 	}
@ -287,14 +337,19 @@ static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int
 /**
 * Apply L/R balance using NEON (4 frames/iteration)
 * Adjusts stereo balance: negative = more left, positive = more right
 * @param stereo Interleaved stereo buffer to modify in-place
 * @param frames Number of stereo frames to process
 * @param balance Balance factor [-1.0 = full left, 0.0 = center, 1.0 = full right]
 */
 static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, float balance) {
 	int simd_frames = frames & ~3;
 	float left_gain = balance <= 0.0f ? 1.0f : 1.0f - balance;
 	float right_gain = balance >= 0.0f ? 1.0f : 1.0f + balance;
 	float32x4_t left_gain_vec = vdupq_n_f32(left_gain);
 	float32x4_t right_gain_vec = vdupq_n_f32(right_gain);
 	int simd_frames = frames & ~3;
 	// SIMD path: apply balance to 4 stereo frames per iteration
 	for (int i = 0; i < simd_frames; i += 4) {
 		int16x4x2_t stereo_data = vld2_s16(stereo + i * 2);
 		int32x4_t left_wide = vmovl_s16(stereo_data.val[0]);
@ -310,6 +365,7 @@ static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, floa
 		vst2_s16(stereo + i * 2, stereo_data);
 	}
 	// Scalar path: handle remaining frames
 	for (int i = simd_frames; i < frames; i++) {
 		stereo[i * 2] = (short)(stereo[i * 2] * left_gain);
 		stereo[i * 2 + 1] = (short)(stereo[i * 2 + 1] * right_gain);
@ -318,16 +374,24 @@ static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, floa
 /**
 * Deinterleave stereo → L/R channels using NEON (4 frames/iteration)
 * Separates interleaved stereo (LRLRLR...) into separate L and R buffers
 * @param interleaved Interleaved stereo input buffer
 * @param left Left channel output buffer
 * @param right Right channel output buffer
 * @param frames Number of stereo frames to process
 */
 static inline void simd_deinterleave_stereo_s16(const short *interleaved, short *left,
                                               short *right, int frames) {
 	int simd_frames = frames & ~3;
 	// SIMD path: deinterleave 4 frames (8 samples) per iteration
 	for (int i = 0; i < simd_frames; i += 4) {
 		int16x4x2_t stereo_data = vld2_s16(interleaved + i * 2);
 		vst1_s16(left + i, stereo_data.val[0]);
 		vst1_s16(right + i, stereo_data.val[1]);
 	}
 	// Scalar path: handle remaining frames
 	for (int i = simd_frames; i < frames; i++) {
 		left[i] = interleaved[i * 2];
 		right[i] = interleaved[i * 2 + 1];
@ -336,23 +400,29 @@ static inline void simd_deinterleave_stereo_s16(const short *interleaved, short
 /**
 * Find max absolute sample value for silence detection using NEON (8 samples/iteration)
- * Used to detect silence (threshold < 50 = ~0.15% max volume)
+ * Used to detect silence (threshold < 50 = ~0.15% max volume) and audio discontinuities
 * @param samples Audio buffer to analyze
 * @param count Number of samples to process
 * @return Maximum absolute sample value in the buffer
 */
 static inline short simd_find_max_abs_s16(const short *samples, int count) {
 	int16x8_t max_vec = vdupq_n_s16(0);
 	int simd_count = count & ~7;
 	int16x8_t max_vec = vdupq_n_s16(0);
 	// SIMD path: find max of 8 samples per iteration
 	for (int i = 0; i < simd_count; i += 8) {
 		int16x8_t samples_vec = vld1q_s16(&samples[i]);
 		int16x8_t abs_vec = vabsq_s16(samples_vec);
 		max_vec = vmaxq_s16(max_vec, abs_vec);
 	}
 	// Horizontal reduction: extract single max value from vector
 	int16x4_t max_half = vmax_s16(vget_low_s16(max_vec), vget_high_s16(max_vec));
 	int16x4_t max_folded = vpmax_s16(max_half, max_half);
 	max_folded = vpmax_s16(max_folded, max_folded);
 	short max_sample = vget_lane_s16(max_folded, 0);
 	// Scalar path: handle remaining samples
 	for (int i = simd_count; i < count; i++) {
 		short abs_sample = samples[i] < 0 ? -samples[i] : samples[i];
 		if (abs_sample > max_sample) {
@ -580,19 +650,28 @@ int jetkvm_audio_capture_init() {
 /**
 * Read HDMI audio, encode to Opus (OUTPUT path hot function)
- * Process: ALSA capture → silence detection → 2.5x gain → Opus encode
+ * Processing pipeline: ALSA capture → silence detection → discontinuity detection → 2.5x gain → Opus encode
- * @return >0 = Opus bytes, 0 = silence/no data, -1 = error
+ * @param opus_buf Output buffer for encoded Opus packet
 * @return >0 = Opus packet size in bytes, 0 = silence/no data, -1 = error
 */
 __attribute__((hot)) int jetkvm_audio_read_encode(void * __restrict__ opus_buf) {
-	static short SIMD_ALIGN pcm_buffer[1920];
+	// Static buffers persist across calls for better cache locality
-	static short prev_max_sample = 0;  // Track previous frame's peak for discontinuity detection
+	static short SIMD_ALIGN pcm_buffer[1920];  // 960 frames × 2 channels
-	unsigned char * __restrict__ out = (unsigned char*)opus_buf;
+	static short prev_max_sample = 0;          // Previous frame peak for discontinuity detection
-	SIMD_PREFETCH(out, 1, 3);
+	// Local variables
-	SIMD_PREFETCH(pcm_buffer, 0, 3);
+	unsigned char * __restrict__ out = (unsigned char*)opus_buf;
 	int pcm_rc;
 	int err = 0;
 	int recovery_attempts = 0;
 	const int max_recovery_attempts = 3;
 	int total_samples;
 	short max_sample;
 	int nb_bytes;
 	// Prefetch output buffer for write
 	SIMD_PREFETCH(out, 1, 3);
 	SIMD_PREFETCH(pcm_buffer, 0, 3);
 	if (__builtin_expect(!capture_initialized || !pcm_capture_handle || !encoder || !opus_buf, 0)) {
 		if (trace_logging_enabled) {
@ -603,8 +682,8 @@ __attribute__((hot)) int jetkvm_audio_read_encode(void * __restrict__ opus_buf)
 	}
 retry_read:
-	;
+	// Read 960 frames (20ms) from ALSA capture device
-	int pcm_rc = snd_pcm_readi(pcm_capture_handle, pcm_buffer, frame_size);
+	pcm_rc = snd_pcm_readi(pcm_capture_handle, pcm_buffer, frame_size);
 	if (__builtin_expect(pcm_rc < 0, 0)) {
 		if (pcm_rc == -EPIPE) {
@ -660,24 +739,26 @@ retry_read:
 		}
 	}
 	// Zero-pad if we got a short read
 	if (__builtin_expect(pcm_rc < frame_size, 0)) {
 		int remaining_samples = (frame_size - pcm_rc) * channels;
 		simd_clear_samples_s16(&pcm_buffer[pcm_rc * channels], remaining_samples);
 	}
-	// Silence detection: only skip true silence (< 50 = ~0.15% of max volume)
+	// Silence detection: skip frames below ~0.15% of maximum volume
-	int total_samples = frame_size * channels;
+	total_samples = frame_size * channels;
-	short max_sample = simd_find_max_abs_s16(pcm_buffer, total_samples);
+	max_sample = simd_find_max_abs_s16(pcm_buffer, total_samples);
 	if (max_sample < 50) {
-		prev_max_sample = 0;  // Reset on silence
+		prev_max_sample = 0;  // Reset discontinuity tracker on silence
 		if (trace_logging_enabled) {
 			printf("[AUDIO_OUTPUT] jetkvm_audio_read_encode: Silence detected (max=%d), skipping frame\n", max_sample);
 		}
 		return 0;
 	}
-	// Detect discontinuity (video seek): abrupt level change >5x
+	// Discontinuity detection: reset encoder on abrupt level changes (video seeks)
 	// Prevents crackling when audio stream jumps due to video seeking
 	if (prev_max_sample > 0) {
 		int level_ratio = (max_sample > prev_max_sample * 5) || (prev_max_sample > max_sample * 5);
 		if (level_ratio) {
@ -689,11 +770,12 @@ retry_read:
 	}
 	prev_max_sample = max_sample;
-	// Apply moderate 2.5x gain to prevent quantization noise on transients
+	// Apply 2.5x gain boost to prevent quantization noise at low volumes
-	// Balances between being audible at low volumes and not overdriving at high volumes
+	// HDMI audio typically transmitted at -6 to -12dB; boost prevents Opus noise floor artifacts
 	simd_scale_volume_s16(pcm_buffer, frame_size * channels, 2.5f);
-	int nb_bytes = opus_encode(encoder, pcm_buffer, frame_size, out, max_packet_size);
+	// Encode PCM to Opus (20ms frame → ~200 bytes at 96kbps)
 	nb_bytes = opus_encode(encoder, pcm_buffer, frame_size, out, max_packet_size);
 	if (trace_logging_enabled && nb_bytes > 0) {
 		printf("[AUDIO_OUTPUT] jetkvm_audio_read_encode: Successfully encoded %d PCM frames to %d Opus bytes\n", pcm_rc, nb_bytes);
@ -767,18 +849,26 @@ int jetkvm_audio_playback_init() {
 /**
 * Decode Opus, write to device speakers (INPUT path hot function)
- * Process: Opus decode → ALSA write with packet loss concealment
+ * Processing pipeline: Opus decode (with FEC) → ALSA playback with error recovery
 * @param opus_buf Encoded Opus packet from client
 * @param opus_size Size of Opus packet in bytes
 * @return >0 = PCM frames written, 0 = frame skipped, -1/-2 = error
 */
 __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf, int opus_size) {
-	static short __attribute__((aligned(16))) pcm_buffer[1920];
+	// Static buffer persists across calls for better cache locality
-	unsigned char * __restrict__ in = (unsigned char*)opus_buf;
+	static short SIMD_ALIGN pcm_buffer[1920];  // 960 frames × 2 channels
-	SIMD_PREFETCH(in, 0, 3);
+	// Local variables
 	unsigned char * __restrict__ in = (unsigned char*)opus_buf;
 	int pcm_frames;
 	int pcm_rc;
 	int err = 0;
 	int recovery_attempts = 0;
 	const int max_recovery_attempts = 3;
 	// Prefetch input buffer for read
 	SIMD_PREFETCH(in, 0, 3);
 	if (__builtin_expect(!playback_initialized || !pcm_playback_handle || !decoder || !opus_buf || opus_size <= 0, 0)) {
 		if (trace_logging_enabled) {
 			printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Failed safety checks - playback_initialized=%d, pcm_playback_handle=%p, decoder=%p, opus_buf=%p, opus_size=%d\n",
@ -798,13 +888,17 @@ __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf,
 		printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Processing Opus packet - size=%d bytes\n", opus_size);
 	}
-	// Decode normally (FEC is automatically used if available in the packet)
+	// Decode Opus packet to PCM (FEC automatically applied if embedded in packet)
-	int pcm_frames = opus_decode(decoder, in, opus_size, pcm_buffer, frame_size, 0);
+	// decode_fec=0 means normal decode (FEC data is used automatically when present)
 	pcm_frames = opus_decode(decoder, in, opus_size, pcm_buffer, frame_size, 0);
 	if (__builtin_expect(pcm_frames < 0, 0)) {
 		// Decode failed - attempt packet loss concealment using FEC from previous packet
 		if (trace_logging_enabled) {
 			printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Opus decode failed with error %d, attempting packet loss concealment\n", pcm_frames);
 		}
-		// Packet loss concealment: decode using FEC from next packet (if available)
+
 		// decode_fec=1 means use FEC data from the NEXT packet to reconstruct THIS lost packet
 		pcm_frames = opus_decode(decoder, NULL, 0, pcm_buffer, frame_size, 1);
 		if (pcm_frames < 0) {
 			if (trace_logging_enabled) {
@ -812,6 +906,7 @@ __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf,
 			}
 			return -1;
 		}
 		if (trace_logging_enabled) {
 			printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Packet loss concealment succeeded, recovered %d frames\n", pcm_frames);
 		}
@ -820,8 +915,8 @@ __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf,
 	}
 retry_write:
-	;
+	// Write decoded PCM to ALSA playback device
-	int pcm_rc = snd_pcm_writei(pcm_playback_handle, pcm_buffer, pcm_frames);
+	pcm_rc = snd_pcm_writei(pcm_playback_handle, pcm_buffer, pcm_frames);
 	if (__builtin_expect(pcm_rc < 0, 0)) {
 		if (trace_logging_enabled) {
 			printf("[AUDIO_INPUT] jetkvm_audio_decode_write: ALSA write failed with error %d (%s), attempt %d/%d\n",