feat(audio): enhance error handling and add device health monitoring

- Implement robust error recovery with progressive backoff in audio streaming
- Add comprehensive device health monitoring system
- Improve ALSA device handling with enhanced retry logic
- Refactor IPC message handling to use shared pools
- Add validation utilities for audio frames and configuration
- Introduce atomic utilities for thread-safe metrics tracking
- Update latency histogram to use configurable buckets
- Add documentation for new metrics and configuration options
This commit is contained in:
Alex P 2025-08-26 12:51:11 +00:00
parent e4ed2b8fad
commit b1f85db7de
11 changed files with 1469 additions and 220 deletions

View File

@ -45,7 +45,7 @@ func DefaultOptimizerConfig() OptimizerConfig {
CooldownPeriod: GetConfig().CooldownPeriod, CooldownPeriod: GetConfig().CooldownPeriod,
Aggressiveness: GetConfig().OptimizerAggressiveness, Aggressiveness: GetConfig().OptimizerAggressiveness,
RollbackThreshold: GetConfig().RollbackThreshold, RollbackThreshold: GetConfig().RollbackThreshold,
StabilityPeriod: 10 * time.Second, StabilityPeriod: GetConfig().AdaptiveOptimizerStability,
} }
} }

View File

@ -0,0 +1,204 @@
package audio
import (
"sync/atomic"
"time"
)
// AtomicCounter is a signed 64-bit counter whose methods all go through
// sync/atomic, so one instance can be shared between goroutines without a lock.
type AtomicCounter struct {
	value int64
}

// NewAtomicCounter returns a pointer to a counter that starts at zero.
func NewAtomicCounter() *AtomicCounter {
	return new(AtomicCounter)
}

// Add atomically adds delta (which may be negative) and returns the updated value.
func (c *AtomicCounter) Add(delta int64) int64 {
	updated := atomic.AddInt64(&c.value, delta)
	return updated
}

// Increment atomically adds 1 and returns the updated value.
func (c *AtomicCounter) Increment() int64 {
	return c.Add(1)
}

// Load atomically reads the current value.
func (c *AtomicCounter) Load() int64 {
	current := atomic.LoadInt64(&c.value)
	return current
}

// Store atomically overwrites the counter with value.
func (c *AtomicCounter) Store(value int64) {
	atomic.StoreInt64(&c.value, value)
}

// Reset atomically sets the counter back to zero.
func (c *AtomicCounter) Reset() {
	c.Store(0)
}

// Swap atomically replaces the counter with new and returns the value it held before.
func (c *AtomicCounter) Swap(new int64) int64 {
	previous := atomic.SwapInt64(&c.value, new)
	return previous
}
// FrameMetrics bundles three counters used for frame accounting: frames
// recorded, frames dropped and bytes recorded.
type FrameMetrics struct {
	Total   *AtomicCounter
	Dropped *AtomicCounter
	Bytes   *AtomicCounter
}

// NewFrameMetrics returns a FrameMetrics whose three counters are freshly
// allocated and start at zero.
func NewFrameMetrics() *FrameMetrics {
	fm := new(FrameMetrics)
	fm.Total = NewAtomicCounter()
	fm.Dropped = NewAtomicCounter()
	fm.Bytes = NewAtomicCounter()
	return fm
}

// RecordFrame counts one frame and adds size to the byte total. Each counter
// is updated atomically, but the two updates are separate operations.
func (fm *FrameMetrics) RecordFrame(size int64) {
	fm.Total.Add(1)
	fm.Bytes.Add(size)
}

// RecordDrop counts one dropped frame.
func (fm *FrameMetrics) RecordDrop() {
	fm.Dropped.Add(1)
}

// GetStats returns the current values of the Total, Dropped and Bytes counters.
func (fm *FrameMetrics) GetStats() (total, dropped, bytes int64) {
	total = fm.Total.Load()
	dropped = fm.Dropped.Load()
	bytes = fm.Bytes.Load()
	return total, dropped, bytes
}

// Reset sets all three counters back to zero, one after another.
func (fm *FrameMetrics) Reset() {
	for _, counter := range [...]*AtomicCounter{fm.Total, fm.Dropped, fm.Bytes} {
		counter.Reset()
	}
}

// GetDropRate returns Dropped as a percentage of Total. It returns 0 while
// Total is zero, even if drops have been recorded. Within this type only
// RecordFrame advances Total, so the result is relative to recorded frames.
func (fm *FrameMetrics) GetDropRate() float64 {
	frames := fm.Total.Load()
	if frames == 0 {
		return 0.0
	}
	return float64(fm.Dropped.Load()) / float64(frames) * 100.0
}
// unsetMinLatencyNanos is the sentinel held by LatencyTracker.min until the
// first sample is recorded. It is the largest int64, so any smaller sample
// replaces it.
const unsetMinLatencyNanos = int64(^uint64(0) >> 1)

// LatencyTracker keeps the most recent latency, the smallest and largest
// latency seen, a moving average and a sample count. All values are stored as
// nanoseconds in AtomicCounters.
type LatencyTracker struct {
	current *AtomicCounter
	min     *AtomicCounter
	max     *AtomicCounter
	average *AtomicCounter
	samples *AtomicCounter
}

// NewLatencyTracker returns a tracker with no samples. min starts at the
// unset sentinel so the first recorded sample becomes the minimum.
func NewLatencyTracker() *LatencyTracker {
	lt := &LatencyTracker{
		current: NewAtomicCounter(),
		min:     NewAtomicCounter(),
		max:     NewAtomicCounter(),
		average: NewAtomicCounter(),
		samples: NewAtomicCounter(),
	}
	lt.min.Store(unsetMinLatencyNanos)
	return lt
}

// RecordLatency records one latency measurement.
//
// current is overwritten, samples is incremented, and min, max and average are
// each updated with a compare-and-swap retry loop so that concurrent callers
// cannot overwrite each other's update. The first sample seeds the average
// directly; later samples are blended in with a weight of 1/8.
func (lt *LatencyTracker) RecordLatency(latency time.Duration) {
	latencyNanos := latency.Nanoseconds()
	lt.current.Store(latencyNanos)
	sampleIndex := lt.samples.Increment()

	// Lower min if this sample is smaller; retry when another writer got in first.
	for {
		oldMin := lt.min.Load()
		if latencyNanos >= oldMin {
			break
		}
		if atomic.CompareAndSwapInt64(&lt.min.value, oldMin, latencyNanos) {
			break
		}
	}

	// Raise max if this sample is larger; same retry scheme.
	for {
		oldMax := lt.max.Load()
		if latencyNanos <= oldMax {
			break
		}
		if atomic.CompareAndSwapInt64(&lt.max.value, oldMax, latencyNanos) {
			break
		}
	}

	// Exponential moving average (7/8 old, 1/8 new). Seeding from the first
	// sample avoids starting the average at zero, and the CAS loop avoids
	// losing an update when two goroutines record at the same time.
	for {
		oldAvg := lt.average.Load()
		newAvg := latencyNanos
		if sampleIndex > 1 {
			newAvg = (oldAvg*7 + latencyNanos) / 8
		}
		if atomic.CompareAndSwapInt64(&lt.average.value, oldAvg, newAvg) {
			break
		}
	}
}

// GetLatencyStats returns the latest, minimum, maximum and average latency and
// the number of samples recorded. The five values are loaded one after another,
// not as a single snapshot. min is reported as 0 while no sample has replaced
// the unset sentinel.
func (lt *LatencyTracker) GetLatencyStats() (current, min, max, average time.Duration, samples int64) {
	currentNanos := lt.current.Load()
	minNanos := lt.min.Load()
	if minNanos == unsetMinLatencyNanos {
		minNanos = 0
	}
	maxNanos := lt.max.Load()
	averageNanos := lt.average.Load()
	return time.Duration(currentNanos),
		time.Duration(minNanos),
		time.Duration(maxNanos),
		time.Duration(averageNanos),
		lt.samples.Load()
}
// PoolMetrics counts pool hits and misses.
type PoolMetrics struct {
	Hits   *AtomicCounter
	Misses *AtomicCounter
}

// NewPoolMetrics returns a PoolMetrics whose two counters start at zero.
func NewPoolMetrics() *PoolMetrics {
	pm := new(PoolMetrics)
	pm.Hits = NewAtomicCounter()
	pm.Misses = NewAtomicCounter()
	return pm
}

// RecordHit counts one pool hit.
func (pm *PoolMetrics) RecordHit() {
	pm.Hits.Add(1)
}

// RecordMiss counts one pool miss.
func (pm *PoolMetrics) RecordMiss() {
	pm.Misses.Add(1)
}

// GetHitRate returns hits as a percentage of hits plus misses, or 0 when
// nothing has been recorded.
func (pm *PoolMetrics) GetHitRate() float64 {
	hits, misses := pm.Hits.Load(), pm.Misses.Load()
	lookups := hits + misses
	if lookups == 0 {
		return 0.0
	}
	return float64(hits) / float64(lookups) * 100.0
}

// GetStats returns the hit count, the miss count and the hit rate. The rate
// comes from GetHitRate, which loads the counters again, so under concurrent
// updates it may reflect slightly newer counts than the two returned values.
func (pm *PoolMetrics) GetStats() (hits, misses int64, hitRate float64) {
	return pm.Hits.Load(), pm.Misses.Load(), pm.GetHitRate()
}

View File

@ -61,12 +61,15 @@ static volatile int capture_initialized = 0;
static volatile int playback_initializing = 0; static volatile int playback_initializing = 0;
static volatile int playback_initialized = 0; static volatile int playback_initialized = 0;
// Safe ALSA device opening with retry logic // Enhanced ALSA device opening with exponential backoff retry logic
static int safe_alsa_open(snd_pcm_t **handle, const char *device, snd_pcm_stream_t stream) { static int safe_alsa_open(snd_pcm_t **handle, const char *device, snd_pcm_stream_t stream) {
int attempts = 3; int max_attempts = 5; // Increased from 3 to 5
int attempt = 0;
int err; int err;
int backoff_us = sleep_microseconds; // Start with base sleep time
const int max_backoff_us = 500000; // Max 500ms backoff
while (attempts-- > 0) { while (attempt < max_attempts) {
err = snd_pcm_open(handle, device, stream, SND_PCM_NONBLOCK); err = snd_pcm_open(handle, device, stream, SND_PCM_NONBLOCK);
if (err >= 0) { if (err >= 0) {
// Switch to blocking mode after successful open // Switch to blocking mode after successful open
@ -74,12 +77,26 @@ static int safe_alsa_open(snd_pcm_t **handle, const char *device, snd_pcm_stream
return 0; return 0;
} }
if (err == -EBUSY && attempts > 0) { attempt++;
// Device busy, wait and retry if (attempt >= max_attempts) break;
usleep(sleep_microseconds); // 50ms
continue; // Enhanced error handling with specific retry strategies
if (err == -EBUSY || err == -EAGAIN) {
// Device busy or temporarily unavailable - retry with backoff
usleep(backoff_us);
backoff_us = (backoff_us * 2 < max_backoff_us) ? backoff_us * 2 : max_backoff_us;
} else if (err == -ENODEV || err == -ENOENT) {
// Device not found - longer wait as device might be initializing
usleep(backoff_us * 2);
backoff_us = (backoff_us * 2 < max_backoff_us) ? backoff_us * 2 : max_backoff_us;
} else if (err == -EPERM || err == -EACCES) {
// Permission denied - shorter wait, likely persistent issue
usleep(backoff_us / 2);
} else {
// Other errors - standard backoff
usleep(backoff_us);
backoff_us = (backoff_us * 2 < max_backoff_us) ? backoff_us * 2 : max_backoff_us;
} }
break;
} }
return err; return err;
} }
@ -217,43 +234,90 @@ int jetkvm_audio_init() {
return 0; return 0;
} }
// Read and encode one frame with enhanced error handling // Read and encode one frame with robust error handling and recovery
int jetkvm_audio_read_encode(void *opus_buf) { int jetkvm_audio_read_encode(void *opus_buf) {
short pcm_buffer[1920]; // max 2ch*960 short pcm_buffer[1920]; // max 2ch*960
unsigned char *out = (unsigned char*)opus_buf; unsigned char *out = (unsigned char*)opus_buf;
int err = 0; int err = 0;
int recovery_attempts = 0;
const int max_recovery_attempts = 3;
// Safety checks // Safety checks
if (!capture_initialized || !pcm_handle || !encoder || !opus_buf) { if (!capture_initialized || !pcm_handle || !encoder || !opus_buf) {
return -1; return -1;
} }
retry_read:
;
int pcm_rc = snd_pcm_readi(pcm_handle, pcm_buffer, frame_size); int pcm_rc = snd_pcm_readi(pcm_handle, pcm_buffer, frame_size);
// Handle ALSA errors with enhanced recovery // Handle ALSA errors with robust recovery strategies
if (pcm_rc < 0) { if (pcm_rc < 0) {
if (pcm_rc == -EPIPE) { if (pcm_rc == -EPIPE) {
// Buffer underrun - try to recover // Buffer underrun - implement progressive recovery
recovery_attempts++;
if (recovery_attempts > max_recovery_attempts) {
return -1; // Give up after max attempts
}
// Try to recover with prepare
err = snd_pcm_prepare(pcm_handle);
if (err < 0) {
// If prepare fails, try drop and prepare
snd_pcm_drop(pcm_handle);
err = snd_pcm_prepare(pcm_handle); err = snd_pcm_prepare(pcm_handle);
if (err < 0) return -1; if (err < 0) return -1;
}
pcm_rc = snd_pcm_readi(pcm_handle, pcm_buffer, frame_size); // Wait before retry to allow device to stabilize
if (pcm_rc < 0) return -1; usleep(sleep_microseconds * recovery_attempts);
goto retry_read;
} else if (pcm_rc == -EAGAIN) { } else if (pcm_rc == -EAGAIN) {
// No data available - return 0 to indicate no frame // No data available - return 0 to indicate no frame
return 0; return 0;
} else if (pcm_rc == -ESTRPIPE) { } else if (pcm_rc == -ESTRPIPE) {
// Device suspended, try to resume // Device suspended, implement robust resume logic
while ((err = snd_pcm_resume(pcm_handle)) == -EAGAIN) { recovery_attempts++;
usleep(sleep_microseconds); // Use centralized constant if (recovery_attempts > max_recovery_attempts) {
return -1;
}
// Try to resume with timeout
int resume_attempts = 0;
while ((err = snd_pcm_resume(pcm_handle)) == -EAGAIN && resume_attempts < 10) {
usleep(sleep_microseconds);
resume_attempts++;
} }
if (err < 0) { if (err < 0) {
// Resume failed, try prepare as fallback
err = snd_pcm_prepare(pcm_handle); err = snd_pcm_prepare(pcm_handle);
if (err < 0) return -1; if (err < 0) return -1;
} }
return 0; // Skip this frame // Wait before retry to allow device to stabilize
usleep(sleep_microseconds * recovery_attempts);
return 0; // Skip this frame but don't fail
} else if (pcm_rc == -ENODEV) {
// Device disconnected - critical error
return -1;
} else if (pcm_rc == -EIO) {
// I/O error - try recovery once
recovery_attempts++;
if (recovery_attempts <= max_recovery_attempts) {
snd_pcm_drop(pcm_handle);
err = snd_pcm_prepare(pcm_handle);
if (err >= 0) {
usleep(sleep_microseconds);
goto retry_read;
}
}
return -1;
} else { } else {
// Other error - return error code // Other errors - limited retry for transient issues
recovery_attempts++;
if (recovery_attempts <= 1 && (pcm_rc == -EINTR || pcm_rc == -EBUSY)) {
usleep(sleep_microseconds / 2);
goto retry_read;
}
return -1; return -1;
} }
} }
@ -327,11 +391,13 @@ int jetkvm_audio_playback_init() {
return 0; return 0;
} }
// Decode Opus and write PCM with enhanced error handling // Decode Opus and write PCM with robust error handling and recovery
int jetkvm_audio_decode_write(void *opus_buf, int opus_size) { int jetkvm_audio_decode_write(void *opus_buf, int opus_size) {
short pcm_buffer[1920]; // max 2ch*960 short pcm_buffer[1920]; // max 2ch*960
unsigned char *in = (unsigned char*)opus_buf; unsigned char *in = (unsigned char*)opus_buf;
int err = 0; int err = 0;
int recovery_attempts = 0;
const int max_recovery_attempts = 3;
// Safety checks // Safety checks
if (!playback_initialized || !pcm_playback_handle || !decoder || !opus_buf || opus_size <= 0) { if (!playback_initialized || !pcm_playback_handle || !decoder || !opus_buf || opus_size <= 0) {
@ -343,31 +409,91 @@ int jetkvm_audio_decode_write(void *opus_buf, int opus_size) {
return -1; return -1;
} }
// Decode Opus to PCM // Decode Opus to PCM with error handling
int pcm_frames = opus_decode(decoder, in, opus_size, pcm_buffer, frame_size, 0); int pcm_frames = opus_decode(decoder, in, opus_size, pcm_buffer, frame_size, 0);
if (pcm_frames < 0) {
// Try packet loss concealment on decode error
pcm_frames = opus_decode(decoder, NULL, 0, pcm_buffer, frame_size, 0);
if (pcm_frames < 0) return -1; if (pcm_frames < 0) return -1;
}
// Write PCM to playback device with enhanced recovery retry_write:
;
// Write PCM to playback device with robust recovery
int pcm_rc = snd_pcm_writei(pcm_playback_handle, pcm_buffer, pcm_frames); int pcm_rc = snd_pcm_writei(pcm_playback_handle, pcm_buffer, pcm_frames);
if (pcm_rc < 0) { if (pcm_rc < 0) {
if (pcm_rc == -EPIPE) { if (pcm_rc == -EPIPE) {
// Buffer underrun - try to recover // Buffer underrun - implement progressive recovery
recovery_attempts++;
if (recovery_attempts > max_recovery_attempts) {
return -2;
}
// Try to recover with prepare
err = snd_pcm_prepare(pcm_playback_handle);
if (err < 0) {
// If prepare fails, try drop and prepare
snd_pcm_drop(pcm_playback_handle);
err = snd_pcm_prepare(pcm_playback_handle); err = snd_pcm_prepare(pcm_playback_handle);
if (err < 0) return -2; if (err < 0) return -2;
}
pcm_rc = snd_pcm_writei(pcm_playback_handle, pcm_buffer, pcm_frames); // Wait before retry to allow device to stabilize
usleep(sleep_microseconds * recovery_attempts);
goto retry_write;
} else if (pcm_rc == -ESTRPIPE) { } else if (pcm_rc == -ESTRPIPE) {
// Device suspended, try to resume // Device suspended, implement robust resume logic
while ((err = snd_pcm_resume(pcm_playback_handle)) == -EAGAIN) { recovery_attempts++;
usleep(sleep_microseconds); // Use centralized constant if (recovery_attempts > max_recovery_attempts) {
return -2;
}
// Try to resume with timeout
int resume_attempts = 0;
while ((err = snd_pcm_resume(pcm_playback_handle)) == -EAGAIN && resume_attempts < 10) {
usleep(sleep_microseconds);
resume_attempts++;
} }
if (err < 0) { if (err < 0) {
// Resume failed, try prepare as fallback
err = snd_pcm_prepare(pcm_playback_handle); err = snd_pcm_prepare(pcm_playback_handle);
if (err < 0) return -2; if (err < 0) return -2;
} }
return 0; // Skip this frame // Wait before retry to allow device to stabilize
usleep(sleep_microseconds * recovery_attempts);
return 0; // Skip this frame but don't fail
} else if (pcm_rc == -ENODEV) {
// Device disconnected - critical error
return -2;
} else if (pcm_rc == -EIO) {
// I/O error - try recovery once
recovery_attempts++;
if (recovery_attempts <= max_recovery_attempts) {
snd_pcm_drop(pcm_playback_handle);
err = snd_pcm_prepare(pcm_playback_handle);
if (err >= 0) {
usleep(sleep_microseconds);
goto retry_write;
}
}
return -2;
} else if (pcm_rc == -EAGAIN) {
// Device not ready - brief wait and retry
recovery_attempts++;
if (recovery_attempts <= max_recovery_attempts) {
usleep(sleep_microseconds / 4);
goto retry_write;
}
return -2;
} else {
// Other errors - limited retry for transient issues
recovery_attempts++;
if (recovery_attempts <= 1 && (pcm_rc == -EINTR || pcm_rc == -EBUSY)) {
usleep(sleep_microseconds / 2);
goto retry_write;
}
return -2;
} }
if (pcm_rc < 0) return -2;
} }
return pcm_frames; return pcm_frames;

View File

@ -1540,6 +1540,49 @@ type AudioConfigConstants struct {
// Impact: Prevents excessive channel counts that could impact performance. // Impact: Prevents excessive channel counts that could impact performance.
// Default 8 channels provides reasonable upper bound for multi-channel audio. // Default 8 channels provides reasonable upper bound for multi-channel audio.
MaxChannels int MaxChannels int
// Device Health Monitoring Configuration
// Used in: device_health.go for proactive device monitoring and recovery
// Impact: Controls health check frequency and recovery thresholds
// HealthCheckIntervalMS defines interval between device health checks in milliseconds.
// Used in: DeviceHealthMonitor for periodic health assessment
// Impact: Lower values provide faster detection but increase CPU usage.
// Default 5000ms (5s) provides good balance between responsiveness and overhead.
HealthCheckIntervalMS int
// HealthRecoveryThreshold defines number of consecutive successful operations
// required to mark a device as healthy after being unhealthy.
// Used in: DeviceHealthMonitor for recovery state management
// Impact: Higher values prevent premature recovery declarations.
// Default 3 consecutive successes ensures stable recovery.
HealthRecoveryThreshold int
// HealthLatencyThresholdMS defines maximum acceptable latency in milliseconds
// before considering a device unhealthy.
// Used in: DeviceHealthMonitor for latency-based health assessment
// Impact: Lower values trigger recovery sooner but may cause false positives.
// Default 100ms provides reasonable threshold for real-time audio.
HealthLatencyThresholdMS int
// HealthErrorRateLimit defines maximum error rate (0.0-1.0) before
// considering a device unhealthy.
// Used in: DeviceHealthMonitor for error rate assessment
// Impact: Lower values trigger recovery sooner for error-prone devices.
// Default 0.1 (10%) allows some transient errors while detecting problems.
HealthErrorRateLimit float64
// Latency Histogram Bucket Configuration
// Used in: LatencyHistogram for granular latency measurement buckets
// Impact: Defines the boundaries for latency distribution analysis
LatencyBucket10ms time.Duration // 10ms latency bucket
LatencyBucket25ms time.Duration // 25ms latency bucket
LatencyBucket50ms time.Duration // 50ms latency bucket
LatencyBucket100ms time.Duration // 100ms latency bucket
LatencyBucket250ms time.Duration // 250ms latency bucket
LatencyBucket500ms time.Duration // 500ms latency bucket
LatencyBucket1s time.Duration // 1s latency bucket
LatencyBucket2s time.Duration // 2s latency bucket
} }
// DefaultAudioConfig returns the default configuration constants // DefaultAudioConfig returns the default configuration constants
@ -2563,6 +2606,22 @@ func DefaultAudioConfig() *AudioConfigConstants {
MinSampleRate: 8000, // 8kHz minimum sample rate MinSampleRate: 8000, // 8kHz minimum sample rate
MaxSampleRate: 48000, // 48kHz maximum sample rate MaxSampleRate: 48000, // 48kHz maximum sample rate
MaxChannels: 8, // 8 maximum audio channels MaxChannels: 8, // 8 maximum audio channels
// Device Health Monitoring Configuration
HealthCheckIntervalMS: 5000, // 5000ms (5s) health check interval
HealthRecoveryThreshold: 3, // 3 consecutive successes for recovery
HealthLatencyThresholdMS: 100, // 100ms latency threshold for health
HealthErrorRateLimit: 0.1, // 10% error rate limit for health
// Latency Histogram Bucket Configuration
LatencyBucket10ms: 10 * time.Millisecond, // 10ms latency bucket
LatencyBucket25ms: 25 * time.Millisecond, // 25ms latency bucket
LatencyBucket50ms: 50 * time.Millisecond, // 50ms latency bucket
LatencyBucket100ms: 100 * time.Millisecond, // 100ms latency bucket
LatencyBucket250ms: 250 * time.Millisecond, // 250ms latency bucket
LatencyBucket500ms: 500 * time.Millisecond, // 500ms latency bucket
LatencyBucket1s: 1 * time.Second, // 1s latency bucket
LatencyBucket2s: 2 * time.Second, // 2s latency bucket
} }
} }

View File

@ -0,0 +1,514 @@
package audio
import (
"context"
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/jetkvm/kvm/internal/logging"
"github.com/rs/zerolog"
)
// DeviceHealthStatus is the health classification assigned to an audio device.
type DeviceHealthStatus int

// Health levels, starting with DeviceHealthUnknown as the zero value.
const (
	DeviceHealthUnknown DeviceHealthStatus = iota
	DeviceHealthHealthy
	DeviceHealthDegraded
	DeviceHealthFailing
	DeviceHealthCritical
)

// deviceHealthStatusNames holds the text form of every status that has its
// own name; anything absent from the map is rendered as "unknown".
var deviceHealthStatusNames = map[DeviceHealthStatus]string{
	DeviceHealthHealthy:  "healthy",
	DeviceHealthDegraded: "degraded",
	DeviceHealthFailing:  "failing",
	DeviceHealthCritical: "critical",
}

// String returns the lowercase name of the status. DeviceHealthUnknown and
// any value outside the declared constants yield "unknown".
func (s DeviceHealthStatus) String() string {
	if name, ok := deviceHealthStatusNames[s]; ok {
		return name
	}
	return "unknown"
}
// DeviceHealthMetrics holds the health-related measurements kept for one audio
// device.
//
// Field order matters: several int64 counters are updated with the
// sync/atomic functions, and on 32-bit platforms (ARM, 386, 32-bit MIPS) only
// the first word of an allocated struct is guaranteed to be 64-bit aligned.
// All int64 counters are therefore grouped at the very start of the struct, so
// each one sits at a multiple of 8 bytes from that aligned first word. Do not
// insert fields of another size in front of or between them.
type DeviceHealthMetrics struct {
	// 64-bit counters (kept first for atomic alignment, see above).
	ConsecutiveErrors    int64 `json:"consecutive_errors"`    // errors since the last recorded success
	TotalErrors          int64 `json:"total_errors"`          // errors recorded overall
	LatencySpikes        int64 `json:"latency_spikes"`        // latencies above the monitor's threshold
	Underruns            int64 `json:"underruns"`             // recorded underrun events
	Overruns             int64 `json:"overruns"`              // recorded overrun events
	DeviceDisconnects    int64 `json:"device_disconnects"`    // recorded disconnects
	RecoveryAttempts     int64 `json:"recovery_attempts"`     // times recovery was triggered
	SuccessfulRecoveries int64 `json:"successful_recoveries"` // recovery callbacks that returned nil

	// Error tracking
	LastErrorTime time.Time `json:"last_error_time"`
	ErrorRate     float64   `json:"error_rate"` // errors per minute

	// Performance metrics
	AverageLatency time.Duration `json:"average_latency"`
	MaxLatency     time.Duration `json:"max_latency"`

	// Device availability
	LastSuccessfulOp time.Time `json:"last_successful_op"`

	// Health assessment
	CurrentStatus     DeviceHealthStatus `json:"current_status"`
	StatusLastChanged time.Time          `json:"status_last_changed"`
	HealthScore       float64            `json:"health_score"` // 0.0 to 1.0
}
// DeviceHealthMonitor tracks health metrics for the capture and playback
// devices, re-assesses them on a timer and invokes registered recovery
// callbacks when a device is assessed as critical.
type DeviceHealthMonitor struct {
	// Flags accessed through sync/atomic: running guards Start/Stop,
	// monitoringEnabled gates the Record* methods.
	running, monitoringEnabled int32

	// Thresholds copied from the audio configuration when the monitor is built.
	checkInterval     time.Duration // period of the monitoring loop's ticker
	recoveryThreshold int           // consecutive errors at which a device is critical
	latencyThreshold  time.Duration // latency above which a spike is counted
	errorRateLimit    float64       // max errors per minute

	// Per-device metrics; mutex guards both.
	captureMetrics, playbackMetrics *DeviceHealthMetrics
	mutex                           sync.RWMutex

	// Lifecycle of the monitoring goroutine.
	ctx                context.Context
	cancel             context.CancelFunc
	stopChan, doneChan chan struct{}

	// Recovery callbacks keyed by component name; callbackMutex guards the map.
	recoveryCallbacks map[string]func() error
	callbackMutex     sync.RWMutex

	// Logger and the configuration the thresholds were read from.
	logger zerolog.Logger
	config *AudioConfigConstants
}
// NewDeviceHealthMonitor builds a monitor from the values returned by
// GetConfig. Both devices start with status DeviceHealthUnknown and a health
// score of 1.0. The monitor is not running until Start is called.
func NewDeviceHealthMonitor() *DeviceHealthMonitor {
	ctx, cancel := context.WithCancel(context.Background())
	config := GetConfig()

	// Each device gets its own metrics value with the same starting state.
	freshMetrics := func() *DeviceHealthMetrics {
		return &DeviceHealthMetrics{
			CurrentStatus: DeviceHealthUnknown,
			HealthScore:   1.0,
		}
	}

	monitor := &DeviceHealthMonitor{
		// The configuration stores these two values as millisecond counts.
		checkInterval:     time.Duration(config.HealthCheckIntervalMS) * time.Millisecond,
		recoveryThreshold: config.HealthRecoveryThreshold,
		latencyThreshold:  time.Duration(config.HealthLatencyThresholdMS) * time.Millisecond,
		errorRateLimit:    config.HealthErrorRateLimit,
		captureMetrics:    freshMetrics(),
		playbackMetrics:   freshMetrics(),
		ctx:               ctx,
		cancel:            cancel,
		stopChan:          make(chan struct{}),
		doneChan:          make(chan struct{}),
		recoveryCallbacks: make(map[string]func() error),
		logger:            logging.GetDefaultLogger().With().Str("component", "device-health-monitor").Logger(),
		config:            config,
	}
	return monitor
}
// Start begins health monitoring by launching the monitoring loop in its own
// goroutine. It returns an error if the monitor is already running.
//
// The stop/done channels and the context are recreated on every start. A
// previous Stop closes stopChan and cancels the context, and the previous loop
// closes doneChan when it exits. Reusing them would make the new loop exit
// immediately and close doneChan a second time, which panics.
func (dhm *DeviceHealthMonitor) Start() error {
	// The mutex serialises the lifecycle fields against a concurrent Stop.
	dhm.mutex.Lock()
	if !atomic.CompareAndSwapInt32(&dhm.running, 0, 1) {
		dhm.mutex.Unlock()
		return fmt.Errorf("device health monitor already running")
	}
	if dhm.cancel != nil {
		// Release the context being replaced (a no-op if it was already cancelled).
		dhm.cancel()
	}
	dhm.ctx, dhm.cancel = context.WithCancel(context.Background())
	dhm.stopChan = make(chan struct{})
	dhm.doneChan = make(chan struct{})
	dhm.mutex.Unlock()

	dhm.logger.Info().Msg("starting device health monitor")
	atomic.StoreInt32(&dhm.monitoringEnabled, 1)
	go dhm.monitoringLoop()
	return nil
}

// Stop stops health monitoring. It is a no-op when the monitor is not running.
// It signals the loop through both stopChan and the context, then waits for
// the loop to finish, giving up after config.SupervisorTimeout.
func (dhm *DeviceHealthMonitor) Stop() {
	// Snapshot the lifecycle fields under the mutex so a later Start, which
	// replaces them, cannot change what this call closes and waits on.
	dhm.mutex.Lock()
	if !atomic.CompareAndSwapInt32(&dhm.running, 1, 0) {
		dhm.mutex.Unlock()
		return
	}
	stopChan, cancel, doneChan := dhm.stopChan, dhm.cancel, dhm.doneChan
	dhm.mutex.Unlock()

	dhm.logger.Info().Msg("stopping device health monitor")
	atomic.StoreInt32(&dhm.monitoringEnabled, 0)
	close(stopChan)
	cancel()

	// Wait for monitoring loop to finish
	select {
	case <-doneChan:
		dhm.logger.Info().Msg("device health monitor stopped")
	case <-time.After(time.Duration(dhm.config.SupervisorTimeout)):
		dhm.logger.Warn().Msg("device health monitor stop timeout")
	}
}
// RegisterRecoveryCallback stores callback under the given component name,
// replacing any callback already stored under that name, and logs the
// registration. The callback map is written while holding callbackMutex.
func (dhm *DeviceHealthMonitor) RegisterRecoveryCallback(component string, callback func() error) {
	dhm.callbackMutex.Lock()
	defer dhm.callbackMutex.Unlock()

	dhm.recoveryCallbacks[component] = callback

	event := dhm.logger.Info().Str("component", component)
	event.Msg("registered recovery callback")
}
// RecordError records one error for the given device ("capture" or
// "playback") and immediately re-assesses that device's health. It does
// nothing while monitoring is disabled, and logs a warning for any other
// device type.
//
// ErrorRate (errors per minute) is derived from the gap between this error and
// the previous one, smoothed with an exponential moving average. The first
// error for a device has no previous timestamp, so it leaves the rate as is.
// NOTE(review): the rate only changes when an error is recorded, so it does not
// decay during error-free periods — confirm that suits the thresholds.
func (dhm *DeviceHealthMonitor) RecordError(deviceType string, err error) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	var metrics *DeviceHealthMetrics
	switch deviceType {
	case "capture":
		metrics = dhm.captureMetrics
	case "playback":
		metrics = dhm.playbackMetrics
	default:
		dhm.logger.Warn().Str("device_type", deviceType).Msg("unknown device type for error recording")
		return
	}

	atomic.AddInt64(&metrics.ConsecutiveErrors, 1)
	atomic.AddInt64(&metrics.TotalErrors, 1)

	// Read the previous timestamp before overwriting it. Measuring against
	// the value just written would give a window of a few nanoseconds.
	now := time.Now()
	previousErrorTime := metrics.LastErrorTime
	metrics.LastErrorTime = now

	// Update error rate (errors per minute)
	if !previousErrorTime.IsZero() {
		if gap := now.Sub(previousErrorTime); gap > 0 {
			instantaneous := 1.0 / gap.Minutes()
			if metrics.ErrorRate == 0 {
				metrics.ErrorRate = instantaneous
			} else {
				metrics.ErrorRate = metrics.ErrorRate*0.9 + instantaneous*0.1
			}
		}
	}

	dhm.logger.Debug().
		Str("device_type", deviceType).
		Err(err).
		Int64("consecutive_errors", metrics.ConsecutiveErrors).
		Float64("error_rate", metrics.ErrorRate).
		Msg("recorded device error")

	// Trigger immediate health assessment
	dhm.assessDeviceHealth(deviceType, metrics)
}
// RecordSuccess notes a successful operation on the given device ("capture"
// or "playback"): the consecutive-error count is cleared, the time of the
// last successful operation is set to now, and a health score below 1.0 is
// raised by 0.1, capped at 1.0. It does nothing while monitoring is disabled
// or for any other device type.
func (dhm *DeviceHealthMonitor) RecordSuccess(deviceType string) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	var metrics *DeviceHealthMetrics
	if deviceType == "capture" {
		metrics = dhm.captureMetrics
	} else if deviceType == "playback" {
		metrics = dhm.playbackMetrics
	} else {
		return
	}

	// A success ends the current run of consecutive errors.
	atomic.StoreInt64(&metrics.ConsecutiveErrors, 0)
	metrics.LastSuccessfulOp = time.Now()

	// Nudge the score upwards, never past 1.0.
	if score := metrics.HealthScore; score < 1.0 {
		metrics.HealthScore = min(1.0, score+0.1)
	}
}
// RecordLatency feeds one latency measurement for the given device ("capture"
// or "playback") into its metrics: the moving average, the maximum and the
// spike counter. It does nothing while monitoring is disabled or for any
// other device type.
func (dhm *DeviceHealthMonitor) RecordLatency(deviceType string, latency time.Duration) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	var metrics *DeviceHealthMetrics
	if deviceType == "capture" {
		metrics = dhm.captureMetrics
	} else if deviceType == "playback" {
		metrics = dhm.playbackMetrics
	} else {
		return
	}

	// An average of zero is treated as "no average yet" and is seeded with
	// this sample; otherwise blend 90% old with 10% new.
	switch previous := metrics.AverageLatency; previous {
	case 0:
		metrics.AverageLatency = latency
	default:
		metrics.AverageLatency = time.Duration(float64(previous)*0.9 + float64(latency)*0.1)
	}

	if metrics.MaxLatency < latency {
		metrics.MaxLatency = latency
	}

	// Anything above the configured threshold counts as a spike.
	if latency > dhm.latencyThreshold {
		atomic.AddInt64(&metrics.LatencySpikes, 1)
	}
}
// RecordUnderrun counts one underrun for the given device ("capture" or
// "playback") and logs it at debug level. It does nothing while monitoring is
// disabled or for any other device type.
func (dhm *DeviceHealthMonitor) RecordUnderrun(deviceType string) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	var metrics *DeviceHealthMetrics
	if deviceType == "capture" {
		metrics = dhm.captureMetrics
	} else if deviceType == "playback" {
		metrics = dhm.playbackMetrics
	} else {
		return
	}

	atomic.AddInt64(&metrics.Underruns, 1)

	event := dhm.logger.Debug().Str("device_type", deviceType)
	event.Msg("recorded audio underrun")
}
// RecordOverrun counts one overrun for the given device ("capture" or
// "playback") and logs it at debug level. It does nothing while monitoring is
// disabled or for any other device type.
func (dhm *DeviceHealthMonitor) RecordOverrun(deviceType string) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	var metrics *DeviceHealthMetrics
	if deviceType == "capture" {
		metrics = dhm.captureMetrics
	} else if deviceType == "playback" {
		metrics = dhm.playbackMetrics
	} else {
		return
	}

	atomic.AddInt64(&metrics.Overruns, 1)

	event := dhm.logger.Debug().Str("device_type", deviceType)
	event.Msg("recorded audio overrun")
}
// GetHealthMetrics returns copies of the capture and playback metrics, taken
// while holding the read lock.
func (dhm *DeviceHealthMonitor) GetHealthMetrics() (capture, playback DeviceHealthMetrics) {
	dhm.mutex.RLock()
	defer dhm.mutex.RUnlock()

	capture, playback = *dhm.captureMetrics, *dhm.playbackMetrics
	return capture, playback
}
// monitoringLoop runs a health check every checkInterval until stopChan is
// closed or the context is cancelled. doneChan is closed on exit so that Stop
// can tell the loop has finished.
func (dhm *DeviceHealthMonitor) monitoringLoop() {
	defer close(dhm.doneChan)

	checkTicker := time.NewTicker(dhm.checkInterval)
	defer checkTicker.Stop()

	for {
		select {
		case <-checkTicker.C:
			dhm.performHealthCheck()
		case <-dhm.ctx.Done():
			return
		case <-dhm.stopChan:
			return
		}
	}
}
// performHealthCheck re-assesses both devices and then, for each, triggers
// recovery if needed. The whole check runs under the write lock.
func (dhm *DeviceHealthMonitor) performHealthCheck() {
	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	type monitoredDevice struct {
		name    string
		metrics *DeviceHealthMetrics
	}
	devices := [...]monitoredDevice{
		{name: "capture", metrics: dhm.captureMetrics},
		{name: "playback", metrics: dhm.playbackMetrics},
	}

	// First pass: refresh status and score for every device.
	for _, device := range devices {
		dhm.assessDeviceHealth(device.name, device.metrics)
	}

	// Second pass: act on the refreshed statuses.
	for _, device := range devices {
		dhm.checkRecoveryNeeded(device.name, device.metrics)
	}
}
// assessDeviceHealth recomputes the status and health score of one device.
// When the status differs from the stored one, the new status and the time of
// the change are saved and the transition is logged. The logged health score
// is the value from before this call's recalculation.
func (dhm *DeviceHealthMonitor) assessDeviceHealth(deviceType string, metrics *DeviceHealthMetrics) {
	oldStatus := metrics.CurrentStatus

	if status := dhm.calculateHealthStatus(metrics); status != oldStatus {
		metrics.CurrentStatus = status
		metrics.StatusLastChanged = time.Now()

		dhm.logger.Info().
			Str("device_type", deviceType).
			Str("previous_status", oldStatus.String()).
			Str("new_status", status.String()).
			Float64("health_score", metrics.HealthScore).
			Msg("device health status changed")
	}

	// The score is refreshed on every assessment, changed status or not.
	metrics.HealthScore = dhm.calculateHealthScore(metrics)
}
// calculateHealthStatus maps the metrics of one device to a status. The
// checks run from most to least severe and the first match wins.
func (dhm *DeviceHealthMonitor) calculateHealthStatus(metrics *DeviceHealthMetrics) DeviceHealthStatus {
	consecutiveErrors := atomic.LoadInt64(&metrics.ConsecutiveErrors)
	totalErrors := atomic.LoadInt64(&metrics.TotalErrors)

	switch {
	// Critical: the run of consecutive errors reached the threshold.
	case consecutiveErrors >= int64(dhm.recoveryThreshold):
		return DeviceHealthCritical

	// Critical: a success was recorded once, but not within SupervisorTimeout.
	case !metrics.LastSuccessfulOp.IsZero() && time.Since(metrics.LastSuccessfulOp) > time.Duration(dhm.config.SupervisorTimeout):
		return DeviceHealthCritical

	// Failing: error rate above the limit, or more latency spikes than MaxDroppedFrames.
	case metrics.ErrorRate > dhm.errorRateLimit || atomic.LoadInt64(&metrics.LatencySpikes) > int64(dhm.config.MaxDroppedFrames):
		return DeviceHealthFailing

	// Degraded: any current error run, an elevated error total, or a high average latency.
	case consecutiveErrors > 0 || totalErrors > int64(dhm.config.MaxDroppedFrames/2) || metrics.AverageLatency > dhm.latencyThreshold:
		return DeviceHealthDegraded

	// Healthy: none of the above applied.
	default:
		return DeviceHealthHealthy
	}
}
// calculateHealthScore turns the metrics of one device into a score that
// starts at 1.0, has penalties subtracted in turn and is floored at 0.0.
func (dhm *DeviceHealthMonitor) calculateHealthScore(metrics *DeviceHealthMetrics) float64 {
	score := 1.0

	// 0.1 per consecutive error.
	if errorRun := atomic.LoadInt64(&metrics.ConsecutiveErrors); errorRun > 0 {
		score -= float64(errorRun) * 0.1
	}

	// Up to 0.5, in proportion to how the error rate compares with the limit.
	if rate := metrics.ErrorRate; rate > 0 {
		score -= min(0.5, rate/dhm.errorRateLimit*0.5)
	}

	// Up to 0.3, in proportion to how far the average latency exceeds the threshold.
	if metrics.AverageLatency > dhm.latencyThreshold {
		overshoot := float64(metrics.AverageLatency-dhm.latencyThreshold) / float64(dhm.latencyThreshold)
		score -= min(0.3, overshoot*0.3)
	}

	// 0.01 per underrun or overrun, up to 0.2.
	underruns := atomic.LoadInt64(&metrics.Underruns)
	overruns := atomic.LoadInt64(&metrics.Overruns)
	if bufferEvents := underruns + overruns; bufferEvents > 0 {
		score -= min(0.2, float64(bufferEvents)*0.01)
	}

	return max(0.0, score)
}
// checkRecoveryNeeded triggers recovery for the device when, and only when,
// its current status is DeviceHealthCritical.
func (dhm *DeviceHealthMonitor) checkRecoveryNeeded(deviceType string, metrics *DeviceHealthMetrics) {
	if metrics.CurrentStatus != DeviceHealthCritical {
		return
	}
	dhm.triggerRecovery(deviceType, metrics)
}
// triggerRecovery counts a recovery attempt for the device, logs it, and runs
// every registered non-nil recovery callback in its own goroutine. Each
// callback that returns nil is counted as a successful recovery; each failure
// is logged. The function does not wait for the callbacks to finish.
func (dhm *DeviceHealthMonitor) triggerRecovery(deviceType string, metrics *DeviceHealthMetrics) {
	atomic.AddInt64(&metrics.RecoveryAttempts, 1)

	dhm.logger.Warn().
		Str("device_type", deviceType).
		Str("status", metrics.CurrentStatus.String()).
		Int64("consecutive_errors", atomic.LoadInt64(&metrics.ConsecutiveErrors)).
		Float64("error_rate", metrics.ErrorRate).
		Msg("triggering device recovery")

	// The read lock is held only while the callbacks are launched, not while they run.
	dhm.callbackMutex.RLock()
	defer dhm.callbackMutex.RUnlock()

	for name, recoverFn := range dhm.recoveryCallbacks {
		if recoverFn == nil {
			continue
		}
		go func(comp string, cb func() error) {
			err := cb()
			if err == nil {
				atomic.AddInt64(&metrics.SuccessfulRecoveries, 1)
				dhm.logger.Info().
					Str("component", comp).
					Str("device_type", deviceType).
					Msg("recovery callback succeeded")
				return
			}
			dhm.logger.Error().
				Str("component", comp).
				Str("device_type", deviceType).
				Err(err).
				Msg("recovery callback failed")
		}(name, recoverFn)
	}
}
// Global device health monitor instance.
// globalDeviceHealthMonitor is assigned inside deviceHealthOnce.Do by
// GetDeviceHealthMonitor; use that accessor rather than reading the variable
// directly so the instance is created before it is used.
var (
globalDeviceHealthMonitor *DeviceHealthMonitor
deviceHealthOnce sync.Once
)
// GetDeviceHealthMonitor returns the global device health monitor.
// The monitor is built with NewDeviceHealthMonitor on the first call;
// deviceHealthOnce ensures the construction runs only once.
func GetDeviceHealthMonitor() *DeviceHealthMonitor {
	createMonitor := func() {
		globalDeviceHealthMonitor = NewDeviceHealthMonitor()
	}
	deviceHealthOnce.Do(createMonitor)
	return globalDeviceHealthMonitor
}
// min returns a when a < b and b otherwise.
// Because the only comparison is a < b, a NaN in either position makes the
// comparison false and b is returned.
func min(a, b float64) float64 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
// max returns a when a > b and b otherwise.
// Because the only comparison is a > b, a NaN in either position makes the
// comparison false and b is returned.
func max(a, b float64) float64 {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}

View File

@ -93,18 +93,18 @@ type BufferPoolEfficiencyTracker struct {
// NewLatencyHistogram creates a new latency histogram with predefined buckets // NewLatencyHistogram creates a new latency histogram with predefined buckets
func NewLatencyHistogram(maxSamples int, logger zerolog.Logger) *LatencyHistogram { func NewLatencyHistogram(maxSamples int, logger zerolog.Logger) *LatencyHistogram {
// Define latency buckets: 1ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 2s+ // Define latency buckets using configuration constants
buckets := []int64{ buckets := []int64{
int64(1 * time.Millisecond), int64(1 * time.Millisecond),
int64(5 * time.Millisecond), int64(5 * time.Millisecond),
int64(10 * time.Millisecond), int64(GetConfig().LatencyBucket10ms),
int64(25 * time.Millisecond), int64(GetConfig().LatencyBucket25ms),
int64(50 * time.Millisecond), int64(GetConfig().LatencyBucket50ms),
int64(100 * time.Millisecond), int64(GetConfig().LatencyBucket100ms),
int64(250 * time.Millisecond), int64(GetConfig().LatencyBucket250ms),
int64(500 * time.Millisecond), int64(GetConfig().LatencyBucket500ms),
int64(1 * time.Second), int64(GetConfig().LatencyBucket1s),
int64(2 * time.Second), int64(GetConfig().LatencyBucket2s),
} }
return &LatencyHistogram{ return &LatencyHistogram{

View File

@ -10,10 +10,10 @@ import (
// AudioInputMetrics holds metrics for microphone input // AudioInputMetrics holds metrics for microphone input
type AudioInputMetrics struct { type AudioInputMetrics struct {
FramesSent int64 FramesSent int64 // Total frames sent
FramesDropped int64 FramesDropped int64 // Total frames dropped
BytesProcessed int64 BytesProcessed int64 // Total bytes processed
ConnectionDrops int64 ConnectionDrops int64 // Connection drops
AverageLatency time.Duration // time.Duration is int64 AverageLatency time.Duration // time.Duration is int64
LastFrameTime time.Time LastFrameTime time.Time
} }

View File

@ -13,6 +13,7 @@ import (
"time" "time"
"github.com/jetkvm/kvm/internal/logging" "github.com/jetkvm/kvm/internal/logging"
"github.com/rs/zerolog"
) )
var ( var (
@ -99,16 +100,15 @@ var globalMessagePool = &MessagePool{
var messagePoolInitOnce sync.Once var messagePoolInitOnce sync.Once
// initializeMessagePool initializes the message pool with pre-allocated messages // initializeMessagePool initializes the global message pool with pre-allocated messages
func initializeMessagePool() { func initializeMessagePool() {
messagePoolInitOnce.Do(func() { messagePoolInitOnce.Do(func() {
// Pre-allocate 30% of pool size for immediate availability preallocSize := messagePoolSize / 4 // 25% pre-allocated for immediate use
preallocSize := messagePoolSize * GetConfig().InputPreallocPercentage / 100
globalMessagePool.preallocSize = preallocSize globalMessagePool.preallocSize = preallocSize
globalMessagePool.maxPoolSize = messagePoolSize * GetConfig().PoolGrowthMultiplier // Allow growth up to 2x globalMessagePool.maxPoolSize = messagePoolSize * GetConfig().PoolGrowthMultiplier // Allow growth up to 2x
globalMessagePool.preallocated = make([]*OptimizedIPCMessage, 0, preallocSize) globalMessagePool.preallocated = make([]*OptimizedIPCMessage, 0, preallocSize)
// Pre-allocate messages to reduce initial allocation overhead // Pre-allocate messages for immediate use
for i := 0; i < preallocSize; i++ { for i := 0; i < preallocSize; i++ {
msg := &OptimizedIPCMessage{ msg := &OptimizedIPCMessage{
data: make([]byte, 0, maxFrameSize), data: make([]byte, 0, maxFrameSize),
@ -116,7 +116,7 @@ func initializeMessagePool() {
globalMessagePool.preallocated = append(globalMessagePool.preallocated, msg) globalMessagePool.preallocated = append(globalMessagePool.preallocated, msg)
} }
// Fill the channel pool with remaining messages // Fill the channel with remaining messages
for i := preallocSize; i < messagePoolSize; i++ { for i := preallocSize; i < messagePoolSize; i++ {
globalMessagePool.pool <- &OptimizedIPCMessage{ globalMessagePool.pool <- &OptimizedIPCMessage{
data: make([]byte, 0, maxFrameSize), data: make([]byte, 0, maxFrameSize),
@ -488,33 +488,13 @@ func (ais *AudioInputServer) sendAck() error {
return ais.writeMessage(ais.conn, msg) return ais.writeMessage(ais.conn, msg)
} }
// writeMessage writes a message to the connection using optimized buffers // Global shared message pool for input IPC server
var globalInputServerMessagePool = NewGenericMessagePool(messagePoolSize)
// writeMessage writes a message to the connection using shared common utilities
func (ais *AudioInputServer) writeMessage(conn net.Conn, msg *InputIPCMessage) error { func (ais *AudioInputServer) writeMessage(conn net.Conn, msg *InputIPCMessage) error {
// Get optimized message from pool for header preparation // Use shared WriteIPCMessage function with global message pool
optMsg := globalMessagePool.Get() return WriteIPCMessage(conn, msg, globalInputServerMessagePool, &ais.droppedFrames)
defer globalMessagePool.Put(optMsg)
// Prepare header in pre-allocated buffer
binary.LittleEndian.PutUint32(optMsg.header[0:4], msg.Magic)
optMsg.header[4] = byte(msg.Type)
binary.LittleEndian.PutUint32(optMsg.header[5:9], msg.Length)
binary.LittleEndian.PutUint64(optMsg.header[9:17], uint64(msg.Timestamp))
// Write header
_, err := conn.Write(optMsg.header[:])
if err != nil {
return err
}
// Write data if present
if msg.Length > 0 && msg.Data != nil {
_, err = conn.Write(msg.Data)
if err != nil {
return err
}
}
return nil
} }
// AudioInputClient handles IPC communication from the main process // AudioInputClient handles IPC communication from the main process
@ -706,21 +686,15 @@ func (aic *AudioInputClient) SendHeartbeat() error {
} }
// writeMessage writes a message to the server // writeMessage writes a message to the server
// Global shared message pool for input IPC clients
var globalInputMessagePool = NewGenericMessagePool(messagePoolSize)
func (aic *AudioInputClient) writeMessage(msg *InputIPCMessage) error { func (aic *AudioInputClient) writeMessage(msg *InputIPCMessage) error {
// Increment total frames counter // Increment total frames counter
atomic.AddInt64(&aic.totalFrames, 1) atomic.AddInt64(&aic.totalFrames, 1)
// Use common write function with shared message pool // Use shared WriteIPCMessage function with global message pool
sharedPool := &GenericMessagePool{ return WriteIPCMessage(aic.conn, msg, globalInputMessagePool, &aic.droppedFrames)
pool: make(chan *OptimizedMessage, messagePoolSize),
hitCount: globalMessagePool.hitCount,
missCount: globalMessagePool.missCount,
preallocated: make([]*OptimizedMessage, 0),
preallocSize: messagePoolSize / 4,
maxPoolSize: messagePoolSize,
}
return WriteIPCMessage(aic.conn, msg, sharedPool, &aic.droppedFrames)
} }
// IsConnected returns whether the client is connected // IsConnected returns whether the client is connected
@ -752,6 +726,17 @@ func (ais *AudioInputServer) startReaderGoroutine() {
ais.wg.Add(1) ais.wg.Add(1)
go func() { go func() {
defer ais.wg.Done() defer ais.wg.Done()
// Enhanced error tracking and recovery
var consecutiveErrors int
var lastErrorTime time.Time
maxConsecutiveErrors := GetConfig().MaxConsecutiveErrors
errorResetWindow := GetConfig().RestartWindow // Use existing restart window
baseBackoffDelay := GetConfig().RetryDelay
maxBackoffDelay := GetConfig().MaxRetryDelay
logger := logging.GetDefaultLogger().With().Str("component", "audio-input-reader").Logger()
for { for {
select { select {
case <-ais.stopChan: case <-ais.stopChan:
@ -760,8 +745,55 @@ func (ais *AudioInputServer) startReaderGoroutine() {
if ais.conn != nil { if ais.conn != nil {
msg, err := ais.readMessage(ais.conn) msg, err := ais.readMessage(ais.conn)
if err != nil { if err != nil {
continue // Connection error, retry // Enhanced error handling with progressive backoff
now := time.Now()
// Reset error counter if enough time has passed
if now.Sub(lastErrorTime) > errorResetWindow {
consecutiveErrors = 0
} }
consecutiveErrors++
lastErrorTime = now
// Log error with context
logger.Warn().Err(err).
Int("consecutive_errors", consecutiveErrors).
Msg("Failed to read message from input connection")
// Progressive backoff based on error count
if consecutiveErrors > 1 {
backoffDelay := time.Duration(consecutiveErrors-1) * baseBackoffDelay
if backoffDelay > maxBackoffDelay {
backoffDelay = maxBackoffDelay
}
time.Sleep(backoffDelay)
}
// If too many consecutive errors, close connection to force reconnect
if consecutiveErrors >= maxConsecutiveErrors {
logger.Error().
Int("consecutive_errors", consecutiveErrors).
Msg("Too many consecutive read errors, closing connection")
ais.mtx.Lock()
if ais.conn != nil {
ais.conn.Close()
ais.conn = nil
}
ais.mtx.Unlock()
consecutiveErrors = 0 // Reset for next connection
}
continue
}
// Reset error counter on successful read
if consecutiveErrors > 0 {
consecutiveErrors = 0
logger.Info().Msg("Input connection recovered")
}
// Send to message channel with non-blocking write // Send to message channel with non-blocking write
select { select {
case ais.messageChan <- msg: case ais.messageChan <- msg:
@ -769,7 +801,11 @@ func (ais *AudioInputServer) startReaderGoroutine() {
default: default:
// Channel full, drop message // Channel full, drop message
atomic.AddInt64(&ais.droppedFrames, 1) atomic.AddInt64(&ais.droppedFrames, 1)
logger.Warn().Msg("Message channel full, dropping frame")
} }
} else {
// No connection, wait briefly before checking again
time.Sleep(GetConfig().DefaultSleepDuration)
} }
} }
} }
@ -794,12 +830,73 @@ func (ais *AudioInputServer) startProcessorGoroutine() {
} }
}() }()
// Enhanced error tracking for processing
var processingErrors int
var lastProcessingError time.Time
maxProcessingErrors := GetConfig().MaxConsecutiveErrors
errorResetWindow := GetConfig().RestartWindow
defer ais.wg.Done() defer ais.wg.Done()
for { for {
select { select {
case <-ais.stopChan: case <-ais.stopChan:
return return
case msg := <-ais.messageChan: case msg := <-ais.messageChan:
// Process message with error handling
start := time.Now()
err := ais.processMessageWithRecovery(msg, logger)
processingTime := time.Since(start)
if err != nil {
// Track processing errors
now := time.Now()
if now.Sub(lastProcessingError) > errorResetWindow {
processingErrors = 0
}
processingErrors++
lastProcessingError = now
logger.Warn().Err(err).
Int("processing_errors", processingErrors).
Dur("processing_time", processingTime).
Msg("Failed to process input message")
// If too many processing errors, drop frames more aggressively
if processingErrors >= maxProcessingErrors {
logger.Error().
Int("processing_errors", processingErrors).
Msg("Too many processing errors, entering aggressive drop mode")
// Clear processing queue to recover
for len(ais.processChan) > 0 {
select {
case <-ais.processChan:
atomic.AddInt64(&ais.droppedFrames, 1)
default:
break
}
}
processingErrors = 0 // Reset after clearing queue
}
continue
}
// Reset error counter on successful processing
if processingErrors > 0 {
processingErrors = 0
logger.Info().Msg("Input processing recovered")
}
// Update processing time metrics
atomic.StoreInt64(&ais.processingTime, processingTime.Nanoseconds())
}
}
}()
}
// processMessageWithRecovery processes a message with enhanced error recovery
func (ais *AudioInputServer) processMessageWithRecovery(msg *InputIPCMessage, logger zerolog.Logger) error {
// Intelligent frame dropping: prioritize recent frames // Intelligent frame dropping: prioritize recent frames
if msg.Type == InputMessageTypeOpusFrame { if msg.Type == InputMessageTypeOpusFrame {
// Check if processing queue is getting full // Check if processing queue is getting full
@ -811,21 +908,25 @@ func (ais *AudioInputServer) startProcessorGoroutine() {
select { select {
case <-ais.processChan: // Remove oldest case <-ais.processChan: // Remove oldest
atomic.AddInt64(&ais.droppedFrames, 1) atomic.AddInt64(&ais.droppedFrames, 1)
logger.Debug().Msg("Dropped oldest frame to make room")
default: default:
} }
} }
} }
// Send to processing queue // Send to processing queue with timeout
select { select {
case ais.processChan <- msg: case ais.processChan <- msg:
default: return nil
// Processing queue full, drop frame case <-time.After(GetConfig().WriteTimeout):
// Processing queue full and timeout reached, drop frame
atomic.AddInt64(&ais.droppedFrames, 1) atomic.AddInt64(&ais.droppedFrames, 1)
return fmt.Errorf("processing queue timeout")
default:
// Processing queue full, drop frame immediately
atomic.AddInt64(&ais.droppedFrames, 1)
return fmt.Errorf("processing queue full")
} }
}
}
}()
} }
// startMonitorGoroutine starts the performance monitoring goroutine // startMonitorGoroutine starts the performance monitoring goroutine

View File

@ -1,7 +1,6 @@
package audio package audio
import ( import (
"context"
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"io" "io"
@ -65,59 +64,8 @@ func (msg *OutputIPCMessage) GetData() []byte {
return msg.Data return msg.Data
} }
// OutputOptimizedMessage represents a pre-allocated message for zero-allocation operations // Global shared message pool for output IPC client header reading
type OutputOptimizedMessage struct { var globalOutputClientMessagePool = NewGenericMessagePool(GetConfig().OutputMessagePoolSize)
header [17]byte // Pre-allocated header buffer (using constant value since array size must be compile-time constant)
data []byte // Reusable data buffer
}
// OutputMessagePool manages pre-allocated messages for zero-allocation IPC
type OutputMessagePool struct {
pool chan *OutputOptimizedMessage
}
// NewOutputMessagePool creates a new message pool
func NewOutputMessagePool(size int) *OutputMessagePool {
pool := &OutputMessagePool{
pool: make(chan *OutputOptimizedMessage, size),
}
// Pre-allocate messages
for i := 0; i < size; i++ {
msg := &OutputOptimizedMessage{
data: make([]byte, GetConfig().OutputMaxFrameSize),
}
pool.pool <- msg
}
return pool
}
// Get retrieves a message from the pool
func (p *OutputMessagePool) Get() *OutputOptimizedMessage {
select {
case msg := <-p.pool:
return msg
default:
// Pool exhausted, create new message
return &OutputOptimizedMessage{
data: make([]byte, GetConfig().OutputMaxFrameSize),
}
}
}
// Put returns a message to the pool
func (p *OutputMessagePool) Put(msg *OutputOptimizedMessage) {
select {
case p.pool <- msg:
// Successfully returned to pool
default:
// Pool full, let GC handle it
}
}
// Global message pool for output IPC
var globalOutputMessagePool = NewOutputMessagePool(GetConfig().OutputMessagePoolSize)
type AudioOutputServer struct { type AudioOutputServer struct {
// Atomic fields MUST be first for ARM32 alignment (int64 fields need 8-byte alignment) // Atomic fields MUST be first for ARM32 alignment (int64 fields need 8-byte alignment)
@ -341,6 +289,9 @@ func (s *AudioOutputServer) SendFrame(frame []byte) error {
} }
// sendFrameToClient sends frame data directly to the connected client // sendFrameToClient sends frame data directly to the connected client
// Global shared message pool for output IPC server
var globalOutputServerMessagePool = NewGenericMessagePool(GetConfig().OutputMessagePoolSize)
func (s *AudioOutputServer) sendFrameToClient(frame []byte) error { func (s *AudioOutputServer) sendFrameToClient(frame []byte) error {
s.mtx.Lock() s.mtx.Lock()
defer s.mtx.Unlock() defer s.mtx.Unlock()
@ -351,59 +302,28 @@ func (s *AudioOutputServer) sendFrameToClient(frame []byte) error {
start := time.Now() start := time.Now()
// Get optimized message from pool // Create output IPC message
optMsg := globalOutputMessagePool.Get() msg := &OutputIPCMessage{
defer globalOutputMessagePool.Put(optMsg) Magic: outputMagicNumber,
Type: OutputMessageTypeOpusFrame,
// Prepare header in pre-allocated buffer Length: uint32(len(frame)),
binary.LittleEndian.PutUint32(optMsg.header[0:4], outputMagicNumber) Timestamp: start.UnixNano(),
optMsg.header[4] = byte(OutputMessageTypeOpusFrame) Data: frame,
binary.LittleEndian.PutUint32(optMsg.header[5:9], uint32(len(frame)))
binary.LittleEndian.PutUint64(optMsg.header[9:17], uint64(start.UnixNano()))
// Use non-blocking write with timeout
ctx, cancel := context.WithTimeout(context.Background(), GetConfig().OutputWriteTimeout)
defer cancel()
// Create a channel to signal write completion
done := make(chan error, 1)
go func() {
// Write header using pre-allocated buffer
_, err := s.conn.Write(optMsg.header[:])
if err != nil {
done <- err
return
} }
// Write frame data // Use shared WriteIPCMessage function
if len(frame) > 0 { err := WriteIPCMessage(s.conn, msg, globalOutputServerMessagePool, &s.droppedFrames)
_, err = s.conn.Write(frame)
if err != nil { if err != nil {
done <- err
return
}
}
done <- nil
}()
// Wait for completion or timeout
select {
case err := <-done:
if err != nil {
atomic.AddInt64(&s.droppedFrames, 1)
return err return err
} }
// Record latency for monitoring // Record latency for monitoring
if s.latencyMonitor != nil { if s.latencyMonitor != nil {
writeLatency := time.Since(start) writeLatency := time.Since(start)
s.latencyMonitor.RecordLatency(writeLatency, "ipc_write") s.latencyMonitor.RecordLatency(writeLatency, "ipc_write")
} }
return nil return nil
case <-ctx.Done():
// Timeout occurred - drop frame to prevent blocking
atomic.AddInt64(&s.droppedFrames, 1)
return fmt.Errorf("write timeout after %v - frame dropped to prevent blocking", GetConfig().OutputWriteTimeout)
}
} }
// GetServerStats returns server performance statistics // GetServerStats returns server performance statistics
@ -495,8 +415,8 @@ func (c *AudioOutputClient) ReceiveFrame() ([]byte, error) {
} }
// Get optimized message from pool for header reading // Get optimized message from pool for header reading
optMsg := globalOutputMessagePool.Get() optMsg := globalOutputClientMessagePool.Get()
defer globalOutputMessagePool.Put(optMsg) defer globalOutputClientMessagePool.Put(optMsg)
// Read header // Read header
if _, err := io.ReadFull(c.conn, optMsg.header[:]); err != nil { if _, err := io.ReadFull(c.conn, optMsg.header[:]); err != nil {

View File

@ -321,17 +321,61 @@ func StartAudioOutputStreaming(send func([]byte)) error {
getOutputStreamingLogger().Info().Str("socket_path", getOutputSocketPath()).Msg("Audio output streaming started, connected to output server") getOutputStreamingLogger().Info().Str("socket_path", getOutputSocketPath()).Msg("Audio output streaming started, connected to output server")
buffer := make([]byte, GetMaxAudioFrameSize()) buffer := make([]byte, GetMaxAudioFrameSize())
consecutiveErrors := 0
maxConsecutiveErrors := GetConfig().MaxConsecutiveErrors
errorBackoffDelay := GetConfig().RetryDelay
maxErrorBackoff := GetConfig().MaxRetryDelay
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
return return
default: default:
// Capture audio frame // Capture audio frame with enhanced error handling
n, err := CGOAudioReadEncode(buffer) n, err := CGOAudioReadEncode(buffer)
if err != nil { if err != nil {
getOutputStreamingLogger().Warn().Err(err).Msg("Failed to read/encode audio") consecutiveErrors++
getOutputStreamingLogger().Warn().
Err(err).
Int("consecutive_errors", consecutiveErrors).
Msg("Failed to read/encode audio")
// Implement progressive backoff for consecutive errors
if consecutiveErrors >= maxConsecutiveErrors {
getOutputStreamingLogger().Error().
Int("consecutive_errors", consecutiveErrors).
Msg("Too many consecutive audio errors, attempting recovery")
// Try to reinitialize audio system
CGOAudioClose()
time.Sleep(errorBackoffDelay)
if initErr := CGOAudioInit(); initErr != nil {
getOutputStreamingLogger().Error().
Err(initErr).
Msg("Failed to reinitialize audio system")
// Exponential backoff for reinitialization failures
errorBackoffDelay = time.Duration(float64(errorBackoffDelay) * GetConfig().BackoffMultiplier)
if errorBackoffDelay > maxErrorBackoff {
errorBackoffDelay = maxErrorBackoff
}
} else {
getOutputStreamingLogger().Info().Msg("Audio system reinitialized successfully")
consecutiveErrors = 0
errorBackoffDelay = GetConfig().RetryDelay // Reset backoff
}
} else {
// Brief delay for transient errors
time.Sleep(GetConfig().ShortSleepDuration)
}
continue continue
} }
// Success - reset error counters
if consecutiveErrors > 0 {
consecutiveErrors = 0
errorBackoffDelay = GetConfig().RetryDelay
}
if n > 0 { if n > 0 {
// Get frame buffer from pool to reduce allocations // Get frame buffer from pool to reduce allocations
frame := GetAudioFrameBuffer() frame := GetAudioFrameBuffer()

View File

@ -0,0 +1,281 @@
package audio
import (
	"errors"
	"fmt"
	"time"
	"unsafe"

	"github.com/jetkvm/kvm/internal/logging"
	"github.com/rs/zerolog"
)
// Enhanced validation errors with more specific context.
// The validators in this file wrap these sentinels with fmt.Errorf and %w,
// so callers can match them with errors.Is.
var (
// Frame size differs from the expected size by more than the configured tolerance.
ErrInvalidFrameLength = errors.New("invalid frame length")
// Sample data is dominated by zero or full-scale values.
ErrFrameDataCorrupted = errors.New("frame data appears corrupted")
// First byte of the frame buffer is not 4-byte aligned.
ErrBufferAlignment = errors.New("buffer alignment invalid")
// Byte length is not a whole number of 16-bit samples per channel.
ErrInvalidSampleFormat = errors.New("invalid sample format")
ErrInvalidTimestamp = errors.New("invalid timestamp")
ErrConfigurationMismatch = errors.New("configuration mismatch")
// A pool statistic exceeds its multiple of the configured MaxPoolSize.
ErrResourceExhaustion = errors.New("resource exhaustion detected")
// A required frame or buffer argument is nil.
ErrInvalidPointer = errors.New("invalid pointer")
// A length or range reaches past the buffer's capacity or length.
ErrBufferOverflow = errors.New("buffer overflow detected")
// A non-positive reference count or a negative offset, length or capacity.
ErrInvalidState = errors.New("invalid state")
)
// ValidationLevel defines the level of validation to perform.
// The levels are ordered by iota: Minimal (0), Standard (1), Strict (2).
type ValidationLevel int
// GetValidationConfig currently returns ValidationStandard.
// ValidateAudioFrameComprehensive runs its data-integrity pass only when the
// level is ValidationStrict and EnableDataIntegrity is also set.
const (
ValidationMinimal ValidationLevel = iota // Only critical safety checks
ValidationStandard // Standard validation for production
ValidationStrict // Comprehensive validation for debugging
)
// ValidationConfig controls validation behavior.
// ValidateAudioFrameComprehensive reads every field of this struct.
type ValidationConfig struct {
// Level selects the validation depth; the integrity pass requires ValidationStrict.
Level ValidationLevel
// EnableRangeChecks turns on the minimum-size and expected-size checks.
EnableRangeChecks bool
// EnableAlignmentCheck turns on the 4-byte alignment check of the frame buffer.
EnableAlignmentCheck bool
// EnableDataIntegrity turns on the per-sample integrity pass (also needs Level == ValidationStrict).
EnableDataIntegrity bool
// MaxValidationTime is the duration after which a validation run is logged as
// having timed out; exceeding it does not fail the validation.
MaxValidationTime time.Duration
}
// GetValidationConfig returns the current validation configuration.
// Only MaxValidationTime comes from GetConfig (ValidationTimeoutMS, read as
// milliseconds); the level and the three feature switches are fixed here.
func GetValidationConfig() ValidationConfig {
	timeoutMS := GetConfig().ValidationTimeoutMS

	var vc ValidationConfig
	vc.Level = ValidationStandard
	vc.EnableRangeChecks = true
	vc.EnableAlignmentCheck = true
	vc.EnableDataIntegrity = false // Disabled by default for performance
	vc.MaxValidationTime = time.Duration(timeoutMS) * time.Millisecond
	return vc
}
// ValidateAudioFrameFast performs minimal validation for performance-critical paths.
// It returns ErrInvalidFrameData for an empty frame and an error wrapping
// ErrInvalidFrameSize when the frame is larger than the configured
// MaxAudioFrameSize; otherwise it returns nil.
func ValidateAudioFrameFast(data []byte) error {
	size := len(data)
	if size == 0 {
		return ErrInvalidFrameData
	}

	// The configuration is consulted only for non-empty frames.
	if limit := GetConfig().MaxAudioFrameSize; size > limit {
		return fmt.Errorf("%w: frame size %d exceeds maximum %d", ErrInvalidFrameSize, size, limit)
	}
	return nil
}
// ValidateAudioFrameComprehensive performs thorough validation.
//
// Checks run in this order and the first failure is returned:
//  1. ValidateAudioFrameFast (non-empty, not above the maximum size).
//  2. If range checks are enabled: minimum size, then the size expected for
//     expectedSampleRate and expectedChannels at 2 bytes per sample over the
//     configured medium-quality frame duration, within FrameSizeTolerance.
//  3. If the alignment check is enabled: the first byte must be 4-byte aligned.
//  4. If data integrity is enabled and the level is ValidationStrict:
//     validateAudioDataIntegrity.
//
// A run that takes longer than MaxValidationTime is logged as a warning but
// does not change the returned error.
func ValidateAudioFrameComprehensive(data []byte, expectedSampleRate int, expectedChannels int) error {
	opts := GetValidationConfig()
	began := time.Now()

	// Runs on every return path; an overrun is only reported, never returned.
	defer func() {
		if time.Since(began) <= opts.MaxValidationTime {
			return
		}
		getValidationLogger().Warn().Dur("duration", time.Since(began)).Msg("validation timeout exceeded")
	}()

	// The fast check also guarantees len(data) > 0 for the &data[0] access below.
	if err := ValidateAudioFrameFast(data); err != nil {
		return err
	}

	if opts.EnableRangeChecks {
		cfg := GetConfig()
		size := len(data)
		if size < cfg.MinAudioFrameSize {
			return fmt.Errorf("%w: frame size %d below minimum %d", ErrInvalidFrameSize, size, cfg.MinAudioFrameSize)
		}

		// Bytes per millisecond (integer division) times the frame duration in ms.
		bytesPerMilli := (expectedSampleRate * expectedChannels * 2) / 1000
		frameMillis := int(cfg.AudioQualityMediumFrameSize / time.Millisecond)
		want := bytesPerMilli * frameMillis
		if deviation := abs(size - want); deviation > cfg.FrameSizeTolerance {
			return fmt.Errorf("%w: frame size %d doesn't match expected %d (±%d)", ErrInvalidFrameLength, size, want, cfg.FrameSizeTolerance)
		}
	}

	// Alignment validation for ARM32 compatibility.
	if opts.EnableAlignmentCheck && uintptr(unsafe.Pointer(&data[0]))%4 != 0 {
		return fmt.Errorf("%w: buffer not 4-byte aligned for ARM32", ErrBufferAlignment)
	}

	// Expensive per-sample pass, reserved for strict validation.
	if opts.EnableDataIntegrity && opts.Level == ValidationStrict {
		return validateAudioDataIntegrity(data, expectedChannels)
	}

	return nil
}
// ValidateZeroCopyFrameEnhanced performs enhanced zero-copy frame validation.
// It rejects a nil frame, a non-positive reference count, negative length or
// capacity, and a length above capacity, and then runs ValidateAudioFrameFast
// on the frame's data.
func ValidateZeroCopyFrameEnhanced(frame *ZeroCopyAudioFrame) error {
	if frame == nil {
		return fmt.Errorf("%w: frame is nil", ErrInvalidPointer)
	}

	// Take one consistent snapshot of the bookkeeping fields under the read lock.
	frame.mutex.RLock()
	refs, used, total := frame.refCount, frame.length, frame.capacity
	frame.mutex.RUnlock()

	switch {
	case refs <= 0:
		return fmt.Errorf("%w: invalid reference count %d", ErrInvalidState, refs)
	case used < 0 || total < 0:
		return fmt.Errorf("%w: negative length (%d) or capacity (%d)", ErrInvalidState, used, total)
	case used > total:
		return fmt.Errorf("%w: length %d exceeds capacity %d", ErrBufferOverflow, used, total)
	}

	// Validate the underlying data.
	return ValidateAudioFrameFast(frame.Data())
}
// ValidateBufferBounds performs bounds checking with overflow protection.
// It returns nil when buffer is non-nil and [offset, offset+length) lies
// within it. A nil buffer yields an error wrapping ErrInvalidPointer, a
// negative offset or length wraps ErrInvalidState, and an out-of-range offset
// or range wraps ErrBufferOverflow.
func ValidateBufferBounds(buffer []byte, offset, length int) error {
	size := len(buffer)
	// May wrap around for huge inputs; the end < offset case below catches that.
	end := offset + length

	switch {
	case buffer == nil:
		return fmt.Errorf("%w: buffer is nil", ErrInvalidPointer)
	case offset < 0:
		return fmt.Errorf("%w: negative offset %d", ErrInvalidState, offset)
	case length < 0:
		return fmt.Errorf("%w: negative length %d", ErrInvalidState, length)
	case offset > size:
		return fmt.Errorf("%w: offset %d exceeds buffer length %d", ErrBufferOverflow, offset, size)
	case end < offset || end > size:
		return fmt.Errorf("%w: range [%d:%d] exceeds buffer length %d", ErrBufferOverflow, offset, end, size)
	}
	return nil
}
// ValidateAudioConfiguration performs comprehensive configuration validation.
// The checks run in order (quality, bitrate range, sample rate, channel
// count, frame size) and the first failure is returned; nil means every
// check passed.
func ValidateAudioConfiguration(config AudioConfig) error {
	if err := ValidateAudioQuality(config.Quality); err != nil {
		return fmt.Errorf("quality validation failed: %w", err)
	}

	limits := GetConfig()

	// Bitrate must lie within [MinBitrate, MaxBitrate].
	if bitrate := config.Bitrate; bitrate < limits.MinBitrate || bitrate > limits.MaxBitrate {
		return fmt.Errorf("%w: bitrate %d outside valid range [%d, %d]", ErrInvalidConfiguration, bitrate, limits.MinBitrate, limits.MaxBitrate)
	}

	// Sample rate must be one of the fixed supported values.
	supportedRates := []int{8000, 12000, 16000, 24000, 48000}
	rateSupported := false
	for i := 0; i < len(supportedRates) && !rateSupported; i++ {
		rateSupported = supportedRates[i] == config.SampleRate
	}
	if !rateSupported {
		return fmt.Errorf("%w: sample rate %d not in supported rates %v", ErrInvalidSampleRate, config.SampleRate, supportedRates)
	}

	// Channel count must lie within [1, MaxChannels].
	if channels := config.Channels; channels < 1 || channels > limits.MaxChannels {
		return fmt.Errorf("%w: channels %d outside valid range [1, %d]", ErrInvalidChannels, channels, limits.MaxChannels)
	}

	// Frame size must lie within [MinFrameSize, MaxFrameSize].
	if frameSize := config.FrameSize; frameSize < limits.MinFrameSize || frameSize > limits.MaxFrameSize {
		return fmt.Errorf("%w: frame size %v outside valid range [%v, %v]", ErrInvalidConfiguration, frameSize, limits.MinFrameSize, limits.MaxFrameSize)
	}

	return nil
}
// ValidateResourceLimits checks if system resources are within acceptable limits.
// It returns an error wrapping ErrResourceExhaustion when the frame pool size
// exceeds twice MaxPoolSize, or when the zero-copy allocation count exceeds
// three times MaxPoolSize; otherwise it returns nil.
func ValidateResourceLimits() error {
	cfg := GetConfig()

	// Frame buffer pool: safe limit is 2x the configured maximum pool size.
	frameLimit := cfg.MaxPoolSize * 2
	if stats := GetAudioBufferPoolStats(); stats.FramePoolSize > int64(frameLimit) {
		return fmt.Errorf("%w: frame pool size %d exceeds safe limit %d", ErrResourceExhaustion, stats.FramePoolSize, frameLimit)
	}

	// Zero-copy pool: safe limit is 3x the configured maximum pool size.
	zeroCopyLimit := cfg.MaxPoolSize * 3
	if stats := GetGlobalZeroCopyPoolStats(); stats.AllocationCount > int64(zeroCopyLimit) {
		return fmt.Errorf("%w: zero-copy allocations %d exceed safe limit %d", ErrResourceExhaustion, stats.AllocationCount, zeroCopyLimit)
	}

	return nil
}
// validateAudioDataIntegrity performs expensive data integrity checks.
//
// data is interpreted as little-endian signed 16-bit samples interleaved
// across channels. The function returns an error wrapping
// ErrInvalidSampleFormat when channels is not positive or when the byte
// length is not a whole number of 16-bit samples per channel, and an error
// wrapping ErrFrameDataCorrupted when more than 90% of the samples are zero
// or more than 10% sit at full scale (32767 or -32768). It returns nil
// otherwise, including for empty data.
func validateAudioDataIntegrity(data []byte, channels int) error {
	// A zero channel count would make the modulo below panic with an integer
	// divide by zero, and a negative one is meaningless, so reject both.
	if channels <= 0 {
		return fmt.Errorf("%w: invalid channel count %d", ErrInvalidSampleFormat, channels)
	}

	if len(data)%2 != 0 {
		return fmt.Errorf("%w: odd number of bytes for 16-bit samples", ErrInvalidSampleFormat)
	}

	if len(data)%(channels*2) != 0 {
		return fmt.Errorf("%w: data length %d not aligned to channel count %d", ErrInvalidSampleFormat, len(data), channels)
	}

	// Check for obvious corruption patterns (all zeros, all max values).
	sampleCount := len(data) / 2
	zeroCount := 0
	maxCount := 0

	// len(data) is even here, so data[i+1] is always in range.
	for i := 0; i < len(data); i += 2 {
		sample := int16(data[i]) | int16(data[i+1])<<8
		switch sample {
		case 0:
			zeroCount++
		case 32767, -32768:
			maxCount++
		}
	}

	// Flag suspicious patterns. With sampleCount == 0 neither condition can
	// hold, so the percentage divisions below never divide by zero.
	if zeroCount > sampleCount*9/10 {
		return fmt.Errorf("%w: %d%% zero samples suggests silence or corruption", ErrFrameDataCorrupted, (zeroCount*100)/sampleCount)
	}

	if maxCount > sampleCount/10 {
		return fmt.Errorf("%w: %d%% max-value samples suggests clipping or corruption", ErrFrameDataCorrupted, (maxCount*100)/sampleCount)
	}

	return nil
}
// abs returns x unchanged when it is non-negative and its negation otherwise.
func abs(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}
// getValidationLogger returns a logger for validation operations.
// Each call derives a new logger from logging.GetDefaultLogger with the
// "component" field set to "audio-validation" and returns a pointer to it.
func getValidationLogger() *zerolog.Logger {
logger := logging.GetDefaultLogger().With().Str("component", "audio-validation").Logger()
return &logger
}