diff --git a/internal/audio/c/audio.c b/internal/audio/c/audio.c
index fded47bf..957004f5 100644
--- a/internal/audio/c/audio.c
+++ b/internal/audio/c/audio.c
@@ -166,25 +166,40 @@ static inline void simd_interleave_stereo_s16(const short *left, const short *ri
 
 /**
  * Apply gain using NEON Q15 fixed-point math (8 samples/iteration)
+ *
+ * Uses vqrdmulhq_s16: a single-instruction saturating rounding doubling
+ * multiply-high, sat((2*a*b + 0x8000) >> 16), which equals
+ * (a*b + 0x4000) >> 15 -- a rounded Q15 multiply. Samples left over after
+ * the last full group of 8 go through a scalar loop with the same rounding.
+ *
+ * @param samples  buffer of `count` samples, scaled in place
+ * @param count    number of samples in the buffer
+ * @param volume   gain factor; clamped to [-1.0, 1.0] before conversion to Q15
  */
 static inline void simd_scale_volume_s16(short *samples, int count, float volume) {
     simd_init_once();
-    int16_t vol_fixed = (int16_t)(volume * 32767.0f);
+    // Clamp first: converting an out-of-range float to int16_t is undefined.
+    float gain = volume;
+    if (gain > 1.0f) {
+        gain = 1.0f;
+    } else if (gain < -1.0f) {
+        gain = -1.0f;
+    }
+    // vqrdmulhq_s16 doubles the product before taking the high half, so the
+    // net shift is 15 and the gain stays in Q15 (scale 32767, not 16384).
+    int16_t vol_fixed = (int16_t)(gain * 32767.0f);
     int16x8_t vol_vec = vdupq_n_s16(vol_fixed);
     int simd_count = count & ~7;
 
     for (int i = 0; i < simd_count; i += 8) {
         int16x8_t samples_vec = vld1q_s16(&samples[i]);
-        int32x4_t low_result = vmull_s16(vget_low_s16(samples_vec), vget_low_s16(vol_vec));
-        int32x4_t high_result = vmull_s16(vget_high_s16(samples_vec), vget_high_s16(vol_vec));
-        int16x4_t low_narrow = vshrn_n_s32(low_result, 15);
-        int16x4_t high_narrow = vshrn_n_s32(high_result, 15);
-        int16x8_t result = vcombine_s16(low_narrow, high_narrow);
+        int16x8_t result = vqrdmulhq_s16(samples_vec, vol_vec);
         vst1q_s16(&samples[i], result);
     }
 
+    // Scalar tail: same rounded Q15 multiply as the vector path.
     for (int i = simd_count; i < count; i++) {
-        samples[i] = (short)((samples[i] * vol_fixed) >> 15);
+        samples[i] = (short)((samples[i] * vol_fixed + (1 << 14)) >> 15);
     }
 }