[WIP] Cleanup: cleanup audio code after HDMI switch

This commit is contained in:
Alex P 2025-09-30 13:32:56 +00:00
parent 7dc57bcdf3
commit 35b5dbd034
1 changed file with 129 additions and 34 deletions

View File

@ -126,17 +126,21 @@ void set_trace_logging(int enabled) {
/** /**
* Clear audio buffer using NEON (8 samples/iteration) * Clear audio buffer using NEON (8 samples/iteration)
* @param buffer Audio buffer to clear
* @param samples Number of samples to zero out
*/ */
static inline void simd_clear_samples_s16(short *buffer, int samples) { static inline void simd_clear_samples_s16(short *buffer, int samples) {
simd_init_once(); simd_init_once();
const int16x8_t zero = vdupq_n_s16(0);
int simd_samples = samples & ~7; int simd_samples = samples & ~7;
const int16x8_t zero = vdupq_n_s16(0);
// SIMD path: zero 8 samples per iteration
for (int i = 0; i < simd_samples; i += 8) { for (int i = 0; i < simd_samples; i += 8) {
vst1q_s16(&buffer[i], zero); vst1q_s16(&buffer[i], zero);
} }
// Scalar path: handle remaining samples
for (int i = simd_samples; i < samples; i++) { for (int i = simd_samples; i < samples; i++) {
buffer[i] = 0; buffer[i] = 0;
} }
@ -144,12 +148,19 @@ static inline void simd_clear_samples_s16(short *buffer, int samples) {
/** /**
* Interleave L/R channels using NEON (8 frames/iteration) * Interleave L/R channels using NEON (8 frames/iteration)
* Converts separate left/right buffers to interleaved stereo (LRLRLR...)
* @param left Left channel samples
* @param right Right channel samples
* @param output Interleaved stereo output buffer
* @param frames Number of stereo frames to process
*/ */
static inline void simd_interleave_stereo_s16(const short *left, const short *right, static inline void simd_interleave_stereo_s16(const short *left, const short *right,
short *output, int frames) { short *output, int frames) {
simd_init_once(); simd_init_once();
int simd_frames = frames & ~7; int simd_frames = frames & ~7;
// SIMD path: interleave 8 frames (16 samples) per iteration
for (int i = 0; i < simd_frames; i += 8) { for (int i = 0; i < simd_frames; i += 8) {
int16x8_t left_vec = vld1q_s16(&left[i]); int16x8_t left_vec = vld1q_s16(&left[i]);
int16x8_t right_vec = vld1q_s16(&right[i]); int16x8_t right_vec = vld1q_s16(&right[i]);
@ -158,6 +169,7 @@ static inline void simd_interleave_stereo_s16(const short *left, const short *ri
vst1q_s16(&output[i * 2 + 8], interleaved.val[1]); vst1q_s16(&output[i * 2 + 8], interleaved.val[1]);
} }
// Scalar path: handle remaining frames
for (int i = simd_frames; i < frames; i++) { for (int i = simd_frames; i < frames; i++) {
output[i * 2] = left[i]; output[i * 2] = left[i];
output[i * 2 + 1] = right[i]; output[i * 2 + 1] = right[i];
@ -166,21 +178,28 @@ static inline void simd_interleave_stereo_s16(const short *left, const short *ri
/** /**
* Apply gain using NEON Q15 fixed-point math (8 samples/iteration) * Apply gain using NEON Q15 fixed-point math (8 samples/iteration)
* Uses vqrdmulhq_s16: single-instruction saturating rounded multiply-high * Uses vqrdmulhq_s16 for single-instruction saturating rounded multiply-high
* @param samples Audio buffer to scale in-place
* @param count Number of samples to process
* @param volume Gain multiplier (e.g., 2.5 for 2.5x gain)
*/ */
static inline void simd_scale_volume_s16(short *samples, int count, float volume) { static inline void simd_scale_volume_s16(short *samples, int count, float volume) {
simd_init_once(); simd_init_once();
// For vqrdmulhq_s16, multiply volume by 2 since it extracts bits [30:15] not [31:16]
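// Convert float gain to Q14 fixed-point (gain * 2^14) for the multiply below
// NOTE(review): vqrdmulhq_s16 returns the rounded high half of the DOUBLED product, i.e. (a * b) >> 15, so a Q14 factor gives half the gain of the scalar tail (>> 14); gains >= 2.0 (e.g. 2.5 * 16384 = 40960) also exceed the int16_t range - confirm intended scaling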
int16_t vol_fixed = (int16_t)(volume * 16384.0f); int16_t vol_fixed = (int16_t)(volume * 16384.0f);
int16x8_t vol_vec = vdupq_n_s16(vol_fixed); int16x8_t vol_vec = vdupq_n_s16(vol_fixed);
int simd_count = count & ~7; int simd_count = count & ~7;
// SIMD path: process 8 samples per iteration
for (int i = 0; i < simd_count; i += 8) { for (int i = 0; i < simd_count; i += 8) {
int16x8_t samples_vec = vld1q_s16(&samples[i]); int16x8_t samples_vec = vld1q_s16(&samples[i]);
int16x8_t result = vqrdmulhq_s16(samples_vec, vol_vec); int16x8_t result = vqrdmulhq_s16(samples_vec, vol_vec);
vst1q_s16(&samples[i], result); vst1q_s16(&samples[i], result);
} }
// Scalar path: handle remaining samples
for (int i = simd_count; i < count; i++) { for (int i = simd_count; i < count; i++) {
samples[i] = (short)((samples[i] * vol_fixed) >> 14); samples[i] = (short)((samples[i] * vol_fixed) >> 14);
} }
@ -188,10 +207,14 @@ static inline void simd_scale_volume_s16(short *samples, int count, float volume
/** /**
* Byte-swap 16-bit samples using NEON (8 samples/iteration) * Byte-swap 16-bit samples using NEON (8 samples/iteration)
* Converts between little-endian and big-endian formats
* @param samples Audio buffer to byte-swap in-place
* @param count Number of samples to process
*/ */
static inline void simd_swap_endian_s16(short *samples, int count) { static inline void simd_swap_endian_s16(short *samples, int count) {
int simd_count = count & ~7; int simd_count = count & ~7;
// SIMD path: swap 8 samples per iteration
for (int i = 0; i < simd_count; i += 8) { for (int i = 0; i < simd_count; i += 8) {
uint16x8_t samples_vec = vld1q_u16((uint16_t*)&samples[i]); uint16x8_t samples_vec = vld1q_u16((uint16_t*)&samples[i]);
uint8x16_t samples_u8 = vreinterpretq_u8_u16(samples_vec); uint8x16_t samples_u8 = vreinterpretq_u8_u16(samples_vec);
@ -200,6 +223,7 @@ static inline void simd_swap_endian_s16(short *samples, int count) {
vst1q_u16((uint16_t*)&samples[i], swapped); vst1q_u16((uint16_t*)&samples[i], swapped);
} }
// Scalar path: handle remaining samples
for (int i = simd_count; i < count; i++) { for (int i = simd_count; i < count; i++) {
samples[i] = __builtin_bswap16(samples[i]); samples[i] = __builtin_bswap16(samples[i]);
} }
@ -207,12 +231,17 @@ static inline void simd_swap_endian_s16(short *samples, int count) {
/** /**
* Convert S16 to float using NEON (4 samples/iteration) * Convert S16 to float using NEON (4 samples/iteration)
* Converts 16-bit signed integers to normalized float [-1.0, 1.0]
* @param input S16 audio samples
* @param output Float output buffer
* @param count Number of samples to convert
*/ */
static inline void simd_s16_to_float(const short *input, float *output, int count) { static inline void simd_s16_to_float(const short *input, float *output, int count) {
const float scale = 1.0f / 32768.0f; const float scale = 1.0f / 32768.0f;
float32x4_t scale_vec = vdupq_n_f32(scale);
int simd_count = count & ~3; int simd_count = count & ~3;
float32x4_t scale_vec = vdupq_n_f32(scale);
// SIMD path: convert 4 samples per iteration
for (int i = 0; i < simd_count; i += 4) { for (int i = 0; i < simd_count; i += 4) {
int16x4_t s16_data = vld1_s16(input + i); int16x4_t s16_data = vld1_s16(input + i);
int32x4_t s32_data = vmovl_s16(s16_data); int32x4_t s32_data = vmovl_s16(s16_data);
@ -221,6 +250,7 @@ static inline void simd_s16_to_float(const short *input, float *output, int coun
vst1q_f32(output + i, scaled); vst1q_f32(output + i, scaled);
} }
// Scalar path: handle remaining samples
for (int i = simd_count; i < count; i++) { for (int i = simd_count; i < count; i++) {
output[i] = (float)input[i] * scale; output[i] = (float)input[i] * scale;
} }
@ -228,12 +258,17 @@ static inline void simd_s16_to_float(const short *input, float *output, int coun
/** /**
* Convert float to S16 using NEON (4 samples/iteration) * Convert float to S16 using NEON (4 samples/iteration)
* Converts normalized float [-1.0, 1.0] to 16-bit signed integers with saturation
* @param input Float audio samples
* @param output S16 output buffer
* @param count Number of samples to convert
*/ */
static inline void simd_float_to_s16(const float *input, short *output, int count) { static inline void simd_float_to_s16(const float *input, short *output, int count) {
const float scale = 32767.0f; const float scale = 32767.0f;
float32x4_t scale_vec = vdupq_n_f32(scale);
int simd_count = count & ~3; int simd_count = count & ~3;
float32x4_t scale_vec = vdupq_n_f32(scale);
// SIMD path: convert 4 samples per iteration with saturation
for (int i = 0; i < simd_count; i += 4) { for (int i = 0; i < simd_count; i += 4) {
float32x4_t float_data = vld1q_f32(input + i); float32x4_t float_data = vld1q_f32(input + i);
float32x4_t scaled = vmulq_f32(float_data, scale_vec); float32x4_t scaled = vmulq_f32(float_data, scale_vec);
@ -242,6 +277,7 @@ static inline void simd_float_to_s16(const float *input, short *output, int coun
vst1_s16(output + i, s16_data); vst1_s16(output + i, s16_data);
} }
// Scalar path: handle remaining samples with clamping
for (int i = simd_count; i < count; i++) { for (int i = simd_count; i < count; i++) {
float scaled = input[i] * scale; float scaled = input[i] * scale;
output[i] = (short)__builtin_fmaxf(__builtin_fminf(scaled, 32767.0f), -32768.0f); output[i] = (short)__builtin_fmaxf(__builtin_fminf(scaled, 32767.0f), -32768.0f);
@ -250,15 +286,22 @@ static inline void simd_float_to_s16(const float *input, short *output, int coun
/** /**
* Mono → stereo (duplicate samples) using NEON (4 frames/iteration) * Mono → stereo (duplicate samples) using NEON (4 frames/iteration)
* Duplicates mono samples to both L and R channels
* @param mono Mono input buffer
* @param stereo Stereo output buffer
* @param frames Number of frames to process
*/ */
static inline void simd_mono_to_stereo_s16(const short *mono, short *stereo, int frames) { static inline void simd_mono_to_stereo_s16(const short *mono, short *stereo, int frames) {
int simd_frames = frames & ~3; int simd_frames = frames & ~3;
// SIMD path: duplicate 4 frames (8 samples) per iteration
for (int i = 0; i < simd_frames; i += 4) { for (int i = 0; i < simd_frames; i += 4) {
int16x4_t mono_data = vld1_s16(mono + i); int16x4_t mono_data = vld1_s16(mono + i);
int16x4x2_t stereo_data = {mono_data, mono_data}; int16x4x2_t stereo_data = {mono_data, mono_data};
vst2_s16(stereo + i * 2, stereo_data); vst2_s16(stereo + i * 2, stereo_data);
} }
// Scalar path: handle remaining frames
for (int i = simd_frames; i < frames; i++) { for (int i = simd_frames; i < frames; i++) {
stereo[i * 2] = mono[i]; stereo[i * 2] = mono[i];
stereo[i * 2 + 1] = mono[i]; stereo[i * 2 + 1] = mono[i];
@ -267,9 +310,15 @@ static inline void simd_mono_to_stereo_s16(const short *mono, short *stereo, int
/** /**
* Stereo → mono (average L+R) using NEON (4 frames/iteration) * Stereo → mono (average L+R) using NEON (4 frames/iteration)
* Downmixes stereo to mono by averaging left and right channels
* @param stereo Interleaved stereo input buffer
* @param mono Mono output buffer
* @param frames Number of frames to process
*/ */
static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int frames) { static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int frames) {
int simd_frames = frames & ~3; int simd_frames = frames & ~3;
// SIMD path: average 4 stereo frames per iteration
for (int i = 0; i < simd_frames; i += 4) { for (int i = 0; i < simd_frames; i += 4) {
int16x4x2_t stereo_data = vld2_s16(stereo + i * 2); int16x4x2_t stereo_data = vld2_s16(stereo + i * 2);
int32x4_t left_wide = vmovl_s16(stereo_data.val[0]); int32x4_t left_wide = vmovl_s16(stereo_data.val[0]);
@ -280,6 +329,7 @@ static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int
vst1_s16(mono + i, mono_data); vst1_s16(mono + i, mono_data);
} }
// Scalar path: handle remaining frames
for (int i = simd_frames; i < frames; i++) { for (int i = simd_frames; i < frames; i++) {
mono[i] = (stereo[i * 2] + stereo[i * 2 + 1]) / 2; mono[i] = (stereo[i * 2] + stereo[i * 2 + 1]) / 2;
} }
@ -287,14 +337,19 @@ static inline void simd_stereo_to_mono_s16(const short *stereo, short *mono, int
/** /**
* Apply L/R balance using NEON (4 frames/iteration) * Apply L/R balance using NEON (4 frames/iteration)
* Adjusts stereo balance: negative = more left, positive = more right
* @param stereo Interleaved stereo buffer to modify in-place
* @param frames Number of stereo frames to process
* @param balance Balance factor [-1.0 = full left, 0.0 = center, 1.0 = full right]
*/ */
static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, float balance) { static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, float balance) {
int simd_frames = frames & ~3;
float left_gain = balance <= 0.0f ? 1.0f : 1.0f - balance; float left_gain = balance <= 0.0f ? 1.0f : 1.0f - balance;
float right_gain = balance >= 0.0f ? 1.0f : 1.0f + balance; float right_gain = balance >= 0.0f ? 1.0f : 1.0f + balance;
float32x4_t left_gain_vec = vdupq_n_f32(left_gain); float32x4_t left_gain_vec = vdupq_n_f32(left_gain);
float32x4_t right_gain_vec = vdupq_n_f32(right_gain); float32x4_t right_gain_vec = vdupq_n_f32(right_gain);
int simd_frames = frames & ~3;
// SIMD path: apply balance to 4 stereo frames per iteration
for (int i = 0; i < simd_frames; i += 4) { for (int i = 0; i < simd_frames; i += 4) {
int16x4x2_t stereo_data = vld2_s16(stereo + i * 2); int16x4x2_t stereo_data = vld2_s16(stereo + i * 2);
int32x4_t left_wide = vmovl_s16(stereo_data.val[0]); int32x4_t left_wide = vmovl_s16(stereo_data.val[0]);
@ -310,6 +365,7 @@ static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, floa
vst2_s16(stereo + i * 2, stereo_data); vst2_s16(stereo + i * 2, stereo_data);
} }
// Scalar path: handle remaining frames
for (int i = simd_frames; i < frames; i++) { for (int i = simd_frames; i < frames; i++) {
stereo[i * 2] = (short)(stereo[i * 2] * left_gain); stereo[i * 2] = (short)(stereo[i * 2] * left_gain);
stereo[i * 2 + 1] = (short)(stereo[i * 2 + 1] * right_gain); stereo[i * 2 + 1] = (short)(stereo[i * 2 + 1] * right_gain);
@ -318,16 +374,24 @@ static inline void simd_apply_stereo_balance_s16(short *stereo, int frames, floa
/** /**
* Deinterleave stereo L/R channels using NEON (4 frames/iteration) * Deinterleave stereo L/R channels using NEON (4 frames/iteration)
* Separates interleaved stereo (LRLRLR...) into separate L and R buffers
* @param interleaved Interleaved stereo input buffer
* @param left Left channel output buffer
* @param right Right channel output buffer
* @param frames Number of stereo frames to process
*/ */
static inline void simd_deinterleave_stereo_s16(const short *interleaved, short *left, static inline void simd_deinterleave_stereo_s16(const short *interleaved, short *left,
short *right, int frames) { short *right, int frames) {
int simd_frames = frames & ~3; int simd_frames = frames & ~3;
// SIMD path: deinterleave 4 frames (8 samples) per iteration
for (int i = 0; i < simd_frames; i += 4) { for (int i = 0; i < simd_frames; i += 4) {
int16x4x2_t stereo_data = vld2_s16(interleaved + i * 2); int16x4x2_t stereo_data = vld2_s16(interleaved + i * 2);
vst1_s16(left + i, stereo_data.val[0]); vst1_s16(left + i, stereo_data.val[0]);
vst1_s16(right + i, stereo_data.val[1]); vst1_s16(right + i, stereo_data.val[1]);
} }
// Scalar path: handle remaining frames
for (int i = simd_frames; i < frames; i++) { for (int i = simd_frames; i < frames; i++) {
left[i] = interleaved[i * 2]; left[i] = interleaved[i * 2];
right[i] = interleaved[i * 2 + 1]; right[i] = interleaved[i * 2 + 1];
@ -336,23 +400,29 @@ static inline void simd_deinterleave_stereo_s16(const short *interleaved, short
/** /**
* Find max absolute sample value for silence detection using NEON (8 samples/iteration) * Find max absolute sample value for silence detection using NEON (8 samples/iteration)
* Used to detect silence (threshold < 50 = ~0.15% max volume) * Used to detect silence (threshold < 50 = ~0.15% max volume) and audio discontinuities
* @param samples Audio buffer to analyze
* @param count Number of samples to process
* @return Maximum absolute sample value in the buffer
*/ */
static inline short simd_find_max_abs_s16(const short *samples, int count) { static inline short simd_find_max_abs_s16(const short *samples, int count) {
int16x8_t max_vec = vdupq_n_s16(0);
int simd_count = count & ~7; int simd_count = count & ~7;
int16x8_t max_vec = vdupq_n_s16(0);
// SIMD path: find max of 8 samples per iteration
for (int i = 0; i < simd_count; i += 8) { for (int i = 0; i < simd_count; i += 8) {
int16x8_t samples_vec = vld1q_s16(&samples[i]); int16x8_t samples_vec = vld1q_s16(&samples[i]);
int16x8_t abs_vec = vabsq_s16(samples_vec); int16x8_t abs_vec = vabsq_s16(samples_vec);
max_vec = vmaxq_s16(max_vec, abs_vec); max_vec = vmaxq_s16(max_vec, abs_vec);
} }
// Horizontal reduction: extract single max value from vector
int16x4_t max_half = vmax_s16(vget_low_s16(max_vec), vget_high_s16(max_vec)); int16x4_t max_half = vmax_s16(vget_low_s16(max_vec), vget_high_s16(max_vec));
int16x4_t max_folded = vpmax_s16(max_half, max_half); int16x4_t max_folded = vpmax_s16(max_half, max_half);
max_folded = vpmax_s16(max_folded, max_folded); max_folded = vpmax_s16(max_folded, max_folded);
short max_sample = vget_lane_s16(max_folded, 0); short max_sample = vget_lane_s16(max_folded, 0);
// Scalar path: handle remaining samples
for (int i = simd_count; i < count; i++) { for (int i = simd_count; i < count; i++) {
short abs_sample = samples[i] < 0 ? -samples[i] : samples[i]; short abs_sample = samples[i] < 0 ? -samples[i] : samples[i];
if (abs_sample > max_sample) { if (abs_sample > max_sample) {
@ -580,19 +650,28 @@ int jetkvm_audio_capture_init() {
/** /**
* Read HDMI audio, encode to Opus (OUTPUT path hot function) * Read HDMI audio, encode to Opus (OUTPUT path hot function)
* Process: ALSA capture → silence detection → 2.5x gain → Opus encode * Processing pipeline: ALSA capture → silence detection → discontinuity detection → 2.5x gain → Opus encode
* @return >0 = Opus bytes, 0 = silence/no data, -1 = error * @param opus_buf Output buffer for encoded Opus packet
* @return >0 = Opus packet size in bytes, 0 = silence/no data, -1 = error
*/ */
__attribute__((hot)) int jetkvm_audio_read_encode(void * __restrict__ opus_buf) { __attribute__((hot)) int jetkvm_audio_read_encode(void * __restrict__ opus_buf) {
static short SIMD_ALIGN pcm_buffer[1920]; // Static buffers persist across calls for better cache locality
static short prev_max_sample = 0; // Track previous frame's peak for discontinuity detection static short SIMD_ALIGN pcm_buffer[1920]; // 960 frames × 2 channels
unsigned char * __restrict__ out = (unsigned char*)opus_buf; static short prev_max_sample = 0; // Previous frame peak for discontinuity detection
SIMD_PREFETCH(out, 1, 3); // Local variables
SIMD_PREFETCH(pcm_buffer, 0, 3); unsigned char * __restrict__ out = (unsigned char*)opus_buf;
int pcm_rc;
int err = 0; int err = 0;
int recovery_attempts = 0; int recovery_attempts = 0;
const int max_recovery_attempts = 3; const int max_recovery_attempts = 3;
int total_samples;
short max_sample;
int nb_bytes;
// Prefetch output buffer for write
SIMD_PREFETCH(out, 1, 3);
SIMD_PREFETCH(pcm_buffer, 0, 3);
if (__builtin_expect(!capture_initialized || !pcm_capture_handle || !encoder || !opus_buf, 0)) { if (__builtin_expect(!capture_initialized || !pcm_capture_handle || !encoder || !opus_buf, 0)) {
if (trace_logging_enabled) { if (trace_logging_enabled) {
@ -603,8 +682,8 @@ __attribute__((hot)) int jetkvm_audio_read_encode(void * __restrict__ opus_buf)
} }
retry_read: retry_read:
; // Read 960 frames (20ms) from ALSA capture device
int pcm_rc = snd_pcm_readi(pcm_capture_handle, pcm_buffer, frame_size); pcm_rc = snd_pcm_readi(pcm_capture_handle, pcm_buffer, frame_size);
if (__builtin_expect(pcm_rc < 0, 0)) { if (__builtin_expect(pcm_rc < 0, 0)) {
if (pcm_rc == -EPIPE) { if (pcm_rc == -EPIPE) {
@ -660,24 +739,26 @@ retry_read:
} }
} }
// Zero-pad if we got a short read
if (__builtin_expect(pcm_rc < frame_size, 0)) { if (__builtin_expect(pcm_rc < frame_size, 0)) {
int remaining_samples = (frame_size - pcm_rc) * channels; int remaining_samples = (frame_size - pcm_rc) * channels;
simd_clear_samples_s16(&pcm_buffer[pcm_rc * channels], remaining_samples); simd_clear_samples_s16(&pcm_buffer[pcm_rc * channels], remaining_samples);
} }
// Silence detection: only skip true silence (< 50 = ~0.15% of max volume) // Silence detection: skip frames below ~0.15% of maximum volume
int total_samples = frame_size * channels; total_samples = frame_size * channels;
short max_sample = simd_find_max_abs_s16(pcm_buffer, total_samples); max_sample = simd_find_max_abs_s16(pcm_buffer, total_samples);
if (max_sample < 50) { if (max_sample < 50) {
prev_max_sample = 0; // Reset on silence prev_max_sample = 0; // Reset discontinuity tracker on silence
if (trace_logging_enabled) { if (trace_logging_enabled) {
printf("[AUDIO_OUTPUT] jetkvm_audio_read_encode: Silence detected (max=%d), skipping frame\n", max_sample); printf("[AUDIO_OUTPUT] jetkvm_audio_read_encode: Silence detected (max=%d), skipping frame\n", max_sample);
} }
return 0; return 0;
} }
// Detect discontinuity (video seek): abrupt level change >5x // Discontinuity detection: reset encoder on abrupt level changes (video seeks)
// Prevents crackling when audio stream jumps due to video seeking
if (prev_max_sample > 0) { if (prev_max_sample > 0) {
int level_ratio = (max_sample > prev_max_sample * 5) || (prev_max_sample > max_sample * 5); int level_ratio = (max_sample > prev_max_sample * 5) || (prev_max_sample > max_sample * 5);
if (level_ratio) { if (level_ratio) {
@ -689,11 +770,12 @@ retry_read:
} }
prev_max_sample = max_sample; prev_max_sample = max_sample;
// Apply moderate 2.5x gain to prevent quantization noise on transients // Apply 2.5x gain boost to prevent quantization noise at low volumes
// Balances between being audible at low volumes and not overdriving at high volumes // HDMI audio typically transmitted at -6 to -12dB; boost prevents Opus noise floor artifacts
simd_scale_volume_s16(pcm_buffer, frame_size * channels, 2.5f); simd_scale_volume_s16(pcm_buffer, frame_size * channels, 2.5f);
int nb_bytes = opus_encode(encoder, pcm_buffer, frame_size, out, max_packet_size); // Encode PCM to Opus (20ms frame → ~200 bytes at 96kbps)
nb_bytes = opus_encode(encoder, pcm_buffer, frame_size, out, max_packet_size);
if (trace_logging_enabled && nb_bytes > 0) { if (trace_logging_enabled && nb_bytes > 0) {
printf("[AUDIO_OUTPUT] jetkvm_audio_read_encode: Successfully encoded %d PCM frames to %d Opus bytes\n", pcm_rc, nb_bytes); printf("[AUDIO_OUTPUT] jetkvm_audio_read_encode: Successfully encoded %d PCM frames to %d Opus bytes\n", pcm_rc, nb_bytes);
@ -767,18 +849,26 @@ int jetkvm_audio_playback_init() {
/** /**
* Decode Opus, write to device speakers (INPUT path hot function) * Decode Opus, write to device speakers (INPUT path hot function)
* Process: Opus decode → ALSA write with packet loss concealment * Processing pipeline: Opus decode (with FEC) → ALSA playback with error recovery
* @param opus_buf Encoded Opus packet from client
* @param opus_size Size of Opus packet in bytes
* @return >0 = PCM frames written, 0 = frame skipped, -1/-2 = error * @return >0 = PCM frames written, 0 = frame skipped, -1/-2 = error
*/ */
__attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf, int opus_size) { __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf, int opus_size) {
static short __attribute__((aligned(16))) pcm_buffer[1920]; // Static buffer persists across calls for better cache locality
unsigned char * __restrict__ in = (unsigned char*)opus_buf; static short SIMD_ALIGN pcm_buffer[1920]; // 960 frames × 2 channels
SIMD_PREFETCH(in, 0, 3); // Local variables
unsigned char * __restrict__ in = (unsigned char*)opus_buf;
int pcm_frames;
int pcm_rc;
int err = 0; int err = 0;
int recovery_attempts = 0; int recovery_attempts = 0;
const int max_recovery_attempts = 3; const int max_recovery_attempts = 3;
// Prefetch input buffer for read
SIMD_PREFETCH(in, 0, 3);
if (__builtin_expect(!playback_initialized || !pcm_playback_handle || !decoder || !opus_buf || opus_size <= 0, 0)) { if (__builtin_expect(!playback_initialized || !pcm_playback_handle || !decoder || !opus_buf || opus_size <= 0, 0)) {
if (trace_logging_enabled) { if (trace_logging_enabled) {
printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Failed safety checks - playback_initialized=%d, pcm_playback_handle=%p, decoder=%p, opus_buf=%p, opus_size=%d\n", printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Failed safety checks - playback_initialized=%d, pcm_playback_handle=%p, decoder=%p, opus_buf=%p, opus_size=%d\n",
@ -798,13 +888,17 @@ __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf,
printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Processing Opus packet - size=%d bytes\n", opus_size); printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Processing Opus packet - size=%d bytes\n", opus_size);
} }
// Decode normally (FEC is automatically used if available in the packet) // Decode Opus packet to PCM (FEC automatically applied if embedded in packet)
int pcm_frames = opus_decode(decoder, in, opus_size, pcm_buffer, frame_size, 0); // decode_fec=0 means normal decode (FEC data is used automatically when present)
pcm_frames = opus_decode(decoder, in, opus_size, pcm_buffer, frame_size, 0);
if (__builtin_expect(pcm_frames < 0, 0)) { if (__builtin_expect(pcm_frames < 0, 0)) {
// Decode failed - fall back to packet loss concealment (PLC) for this frame
if (trace_logging_enabled) { if (trace_logging_enabled) {
printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Opus decode failed with error %d, attempting packet loss concealment\n", pcm_frames); printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Opus decode failed with error %d, attempting packet loss concealment\n", pcm_frames);
} }
// Packet loss concealment: passing NULL data tells the decoder this packet is lost, so it synthesizes the frame from its own state
// decode_fec=1 can only recover in-band FEC when the NEXT packet's data is supplied; no packet is passed here, so no FEC is used
pcm_frames = opus_decode(decoder, NULL, 0, pcm_buffer, frame_size, 1); pcm_frames = opus_decode(decoder, NULL, 0, pcm_buffer, frame_size, 1);
if (pcm_frames < 0) { if (pcm_frames < 0) {
if (trace_logging_enabled) { if (trace_logging_enabled) {
@ -812,6 +906,7 @@ __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf,
} }
return -1; return -1;
} }
if (trace_logging_enabled) { if (trace_logging_enabled) {
printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Packet loss concealment succeeded, recovered %d frames\n", pcm_frames); printf("[AUDIO_INPUT] jetkvm_audio_decode_write: Packet loss concealment succeeded, recovered %d frames\n", pcm_frames);
} }
@ -820,8 +915,8 @@ __attribute__((hot)) int jetkvm_audio_decode_write(void * __restrict__ opus_buf,
} }
retry_write: retry_write:
; // Write decoded PCM to ALSA playback device
int pcm_rc = snd_pcm_writei(pcm_playback_handle, pcm_buffer, pcm_frames); pcm_rc = snd_pcm_writei(pcm_playback_handle, pcm_buffer, pcm_frames);
if (__builtin_expect(pcm_rc < 0, 0)) { if (__builtin_expect(pcm_rc < 0, 0)) {
if (trace_logging_enabled) { if (trace_logging_enabled) {
printf("[AUDIO_INPUT] jetkvm_audio_decode_write: ALSA write failed with error %d (%s), attempt %d/%d\n", printf("[AUDIO_INPUT] jetkvm_audio_decode_write: ALSA write failed with error %d (%s), attempt %d/%d\n",