package audio

import (
	"context"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/jetkvm/kvm/internal/logging"
	"github.com/rs/zerolog"
)

// DeviceHealthStatus represents the health status of an audio device.
type DeviceHealthStatus int

// Health levels, ordered from "not yet assessed" to most severe.
const (
	DeviceHealthUnknown DeviceHealthStatus = iota
	DeviceHealthHealthy
	DeviceHealthDegraded
	DeviceHealthFailing
	DeviceHealthCritical
)

// String returns the lowercase name of the status, or "unknown" for
// DeviceHealthUnknown and any value outside the declared constants.
func (s DeviceHealthStatus) String() string {
	switch s {
	case DeviceHealthHealthy:
		return "healthy"
	case DeviceHealthDegraded:
		return "degraded"
	case DeviceHealthFailing:
		return "failing"
	case DeviceHealthCritical:
		return "critical"
	default:
		return "unknown"
	}
}

// DeviceHealthMetrics tracks health-related metrics for audio devices.
//
// Inside DeviceHealthMonitor every field is read and written while holding
// the monitor's mutex; the counters are therefore plain int64 values and are
// not accessed through sync/atomic.
type DeviceHealthMetrics struct {
	// Error tracking
	ConsecutiveErrors int64     `json:"consecutive_errors"`
	TotalErrors       int64     `json:"total_errors"`
	LastErrorTime     time.Time `json:"last_error_time"`
	ErrorRate         float64   `json:"error_rate"` // errors per minute

	// Performance metrics
	AverageLatency time.Duration `json:"average_latency"`
	MaxLatency     time.Duration `json:"max_latency"`
	LatencySpikes  int64         `json:"latency_spikes"`
	Underruns      int64         `json:"underruns"`
	Overruns       int64         `json:"overruns"`

	// Device availability
	LastSuccessfulOp     time.Time `json:"last_successful_op"`
	DeviceDisconnects    int64     `json:"device_disconnects"`
	RecoveryAttempts     int64     `json:"recovery_attempts"`
	SuccessfulRecoveries int64     `json:"successful_recoveries"`

	// Health assessment
	CurrentStatus     DeviceHealthStatus `json:"current_status"`
	StatusLastChanged time.Time          `json:"status_last_changed"`
	HealthScore       float64            `json:"health_score"` // 0.0 to 1.0

	// firstErrorTime is when the first error was recorded; it anchors the
	// window over which ErrorRate is computed. Unexported, so it is not
	// part of the JSON encoding.
	firstErrorTime time.Time
}

// minErrorRateWindow is the shortest window used when converting an error
// count into an errors-per-minute rate, so that a single error recorded
// moments ago does not produce an enormous rate.
const minErrorRateWindow = time.Minute

// DeviceHealthMonitor monitors the health of audio devices and triggers recovery.
type DeviceHealthMonitor struct {
	// Atomic flags, accessed only through sync/atomic.
	running           int32
	monitoringEnabled int32

	// Configuration
	checkInterval     time.Duration
	recoveryThreshold int
	latencyThreshold  time.Duration
	errorRateLimit    float64 // max errors per minute

	// State tracking; mutex guards the contents of both metrics structs.
	captureMetrics  *DeviceHealthMetrics
	playbackMetrics *DeviceHealthMetrics
	mutex           sync.RWMutex

	// Control channels; replaced on every Start, guarded by lifecycleMu.
	ctx         context.Context
	cancel      context.CancelFunc
	stopChan    chan struct{}
	doneChan    chan struct{}
	lifecycleMu sync.Mutex // serializes Start and Stop

	// Recovery callbacks
	recoveryCallbacks map[string]func() error
	callbackMutex     sync.RWMutex

	// Logging
	logger zerolog.Logger
	config *AudioConfigConstants
}

// NewDeviceHealthMonitor creates a new device health monitor configured from
// GetConfig. The monitor is idle until Start is called.
func NewDeviceHealthMonitor() *DeviceHealthMonitor {
	ctx, cancel := context.WithCancel(context.Background())
	config := GetConfig()

	return &DeviceHealthMonitor{
		checkInterval:     time.Duration(config.HealthCheckIntervalMS) * time.Millisecond,
		recoveryThreshold: config.HealthRecoveryThreshold,
		latencyThreshold:  time.Duration(config.HealthLatencyThresholdMS) * time.Millisecond,
		errorRateLimit:    config.HealthErrorRateLimit,
		captureMetrics: &DeviceHealthMetrics{
			CurrentStatus: DeviceHealthUnknown,
			HealthScore:   1.0,
		},
		playbackMetrics: &DeviceHealthMetrics{
			CurrentStatus: DeviceHealthUnknown,
			HealthScore:   1.0,
		},
		ctx:               ctx,
		cancel:            cancel,
		stopChan:          make(chan struct{}),
		doneChan:          make(chan struct{}),
		recoveryCallbacks: make(map[string]func() error),
		logger:            logging.GetDefaultLogger().With().Str("component", "device-health-monitor").Logger(),
		config:            config,
	}
}

// Start begins health monitoring in a background goroutine.
//
// It returns an error if the monitor is already running. Start may be called
// again after Stop: each run gets a fresh context and fresh stop/done
// channels, because the previous run's channels have been closed.
func (dhm *DeviceHealthMonitor) Start() error {
	dhm.lifecycleMu.Lock()
	defer dhm.lifecycleMu.Unlock()

	if !atomic.CompareAndSwapInt32(&dhm.running, 0, 1) {
		return fmt.Errorf("device health monitor already running")
	}

	// Release the context left over from the constructor or a previous run
	// before replacing it; calling a CancelFunc more than once is harmless.
	if dhm.cancel != nil {
		dhm.cancel()
	}
	dhm.ctx, dhm.cancel = context.WithCancel(context.Background())
	dhm.stopChan = make(chan struct{})
	dhm.doneChan = make(chan struct{})

	dhm.logger.Info().Msg("starting device health monitor")

	atomic.StoreInt32(&dhm.monitoringEnabled, 1)
	go dhm.monitoringLoop()

	return nil
}

// Stop stops health monitoring and waits for the monitoring loop to exit,
// giving up after the configured SupervisorTimeout. Calling Stop on a monitor
// that is not running is a no-op.
func (dhm *DeviceHealthMonitor) Stop() {
	dhm.lifecycleMu.Lock()
	defer dhm.lifecycleMu.Unlock()

	if !atomic.CompareAndSwapInt32(&dhm.running, 1, 0) {
		return
	}

	dhm.logger.Info().Msg("stopping device health monitor")

	atomic.StoreInt32(&dhm.monitoringEnabled, 0)

	close(dhm.stopChan)
	dhm.cancel()

	// Wait for monitoring loop to finish
	select {
	case <-dhm.doneChan:
		dhm.logger.Info().Msg("device health monitor stopped")
	case <-time.After(time.Duration(dhm.config.SupervisorTimeout)):
		dhm.logger.Warn().Msg("device health monitor stop timeout")
	}
}

// RegisterRecoveryCallback registers a recovery function for a specific
// component, replacing any callback previously registered under that name.
func (dhm *DeviceHealthMonitor) RegisterRecoveryCallback(component string, callback func() error) {
	dhm.callbackMutex.Lock()
	defer dhm.callbackMutex.Unlock()

	dhm.recoveryCallbacks[component] = callback
	dhm.logger.Info().Str("component", component).Msg("registered recovery callback")
}

// metricsFor returns the metrics for deviceType ("capture" or "playback"),
// or nil for any other value.
func (dhm *DeviceHealthMonitor) metricsFor(deviceType string) *DeviceHealthMetrics {
	switch deviceType {
	case "capture":
		return dhm.captureMetrics
	case "playback":
		return dhm.playbackMetrics
	default:
		return nil
	}
}

// refreshErrorRate recomputes metrics.ErrorRate as errors per minute over the
// time elapsed between the first recorded error and now. The window is never
// shorter than minErrorRateWindow. It does nothing when no error has been
// recorded yet. The caller must hold the monitor's mutex.
func refreshErrorRate(metrics *DeviceHealthMetrics, now time.Time) {
	if metrics.firstErrorTime.IsZero() {
		return
	}
	window := now.Sub(metrics.firstErrorTime)
	if window < minErrorRateWindow {
		window = minErrorRateWindow
	}
	metrics.ErrorRate = float64(metrics.TotalErrors) / window.Minutes()
}

// RecordError records an error for health tracking and immediately
// re-assesses the device's health. deviceType must be "capture" or
// "playback"; other values are logged and ignored. It is a no-op while
// monitoring is disabled.
func (dhm *DeviceHealthMonitor) RecordError(deviceType string, err error) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	metrics := dhm.metricsFor(deviceType)
	if metrics == nil {
		dhm.logger.Warn().Str("device_type", deviceType).Msg("unknown device type for error recording")
		return
	}

	now := time.Now()
	metrics.ConsecutiveErrors++
	metrics.TotalErrors++
	metrics.LastErrorTime = now
	if metrics.firstErrorTime.IsZero() {
		metrics.firstErrorTime = now
	}

	// Update error rate (errors per minute), measured from the first error
	// rather than from the error that was just recorded.
	refreshErrorRate(metrics, now)

	dhm.logger.Debug().
		Str("device_type", deviceType).
		Err(err).
		Int64("consecutive_errors", metrics.ConsecutiveErrors).
		Float64("error_rate", metrics.ErrorRate).
		Msg("recorded device error")

	// Trigger immediate health assessment
	dhm.assessDeviceHealth(deviceType, metrics)
}

// RecordSuccess records a successful operation: it clears the consecutive
// error count, stamps LastSuccessfulOp and nudges the health score upward.
// Unknown device types are ignored; it is a no-op while monitoring is disabled.
func (dhm *DeviceHealthMonitor) RecordSuccess(deviceType string) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	metrics := dhm.metricsFor(deviceType)
	if metrics == nil {
		return
	}

	// Reset consecutive errors on success
	metrics.ConsecutiveErrors = 0
	metrics.LastSuccessfulOp = time.Now()

	// Improve health score gradually
	if metrics.HealthScore < 1.0 {
		metrics.HealthScore = min(1.0, metrics.HealthScore+0.1)
	}
}

// RecordLatency records operation latency for health assessment. It updates
// the moving average and the maximum, and counts a spike when latency exceeds
// the configured threshold. Unknown device types are ignored; it is a no-op
// while monitoring is disabled.
func (dhm *DeviceHealthMonitor) RecordLatency(deviceType string, latency time.Duration) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	metrics := dhm.metricsFor(deviceType)
	if metrics == nil {
		return
	}

	// Update latency metrics
	if metrics.AverageLatency == 0 {
		metrics.AverageLatency = latency
	} else {
		// Exponential moving average: 90% previous average, 10% new sample.
		metrics.AverageLatency = time.Duration(float64(metrics.AverageLatency)*0.9 + float64(latency)*0.1)
	}

	if latency > metrics.MaxLatency {
		metrics.MaxLatency = latency
	}

	// Track latency spikes
	if latency > dhm.latencyThreshold {
		metrics.LatencySpikes++
	}
}

// RecordUnderrun records an audio underrun event. Unknown device types are
// ignored; it is a no-op while monitoring is disabled.
func (dhm *DeviceHealthMonitor) RecordUnderrun(deviceType string) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	metrics := dhm.metricsFor(deviceType)
	if metrics == nil {
		return
	}

	metrics.Underruns++
	dhm.logger.Debug().Str("device_type", deviceType).Msg("recorded audio underrun")
}

// RecordOverrun records an audio overrun event. Unknown device types are
// ignored; it is a no-op while monitoring is disabled.
func (dhm *DeviceHealthMonitor) RecordOverrun(deviceType string) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	metrics := dhm.metricsFor(deviceType)
	if metrics == nil {
		return
	}

	metrics.Overruns++
	dhm.logger.Debug().Str("device_type", deviceType).Msg("recorded audio overrun")
}

// GetHealthMetrics returns copies of the current capture and playback metrics.
func (dhm *DeviceHealthMonitor) GetHealthMetrics() (capture, playback DeviceHealthMetrics) {
	dhm.mutex.RLock()
	defer dhm.mutex.RUnlock()
	return *dhm.captureMetrics, *dhm.playbackMetrics
}

// monitoringLoop runs the main health monitoring loop: it performs a health
// check on every tick of checkInterval until the stop channel is closed or
// the context is cancelled, then closes the done channel.
func (dhm *DeviceHealthMonitor) monitoringLoop() {
	// Bind to this run's control primitives up front so that a later Start,
	// which replaces the fields, cannot redirect an older loop.
	ctx, stop, done := dhm.ctx, dhm.stopChan, dhm.doneChan
	defer close(done)

	ticker := time.NewTicker(dhm.checkInterval)
	defer ticker.Stop()

	for {
		select {
		case <-stop:
			return
		case <-ctx.Done():
			return
		case <-ticker.C:
			dhm.performHealthCheck()
		}
	}
}

// performHealthCheck performs a comprehensive health check of both devices
// and triggers recovery for any device found in the critical state.
func (dhm *DeviceHealthMonitor) performHealthCheck() {
	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	// Refresh the error rates first so they decay while no new errors arrive.
	now := time.Now()
	refreshErrorRate(dhm.captureMetrics, now)
	refreshErrorRate(dhm.playbackMetrics, now)

	// Assess health for both devices
	dhm.assessDeviceHealth("capture", dhm.captureMetrics)
	dhm.assessDeviceHealth("playback", dhm.playbackMetrics)

	// Check if recovery is needed
	dhm.checkRecoveryNeeded("capture", dhm.captureMetrics)
	dhm.checkRecoveryNeeded("playback", dhm.playbackMetrics)
}

// assessDeviceHealth recomputes the health score and status of a device and
// logs status transitions. The caller must hold dhm.mutex.
func (dhm *DeviceHealthMonitor) assessDeviceHealth(deviceType string, metrics *DeviceHealthMetrics) {
	previousStatus := metrics.CurrentStatus
	newStatus := dhm.calculateHealthStatus(metrics)

	// Update the score before logging so the log line reports the score that
	// belongs to the new status; the status calculation does not read it.
	metrics.HealthScore = dhm.calculateHealthScore(metrics)

	if newStatus == previousStatus {
		return
	}

	metrics.CurrentStatus = newStatus
	metrics.StatusLastChanged = time.Now()

	dhm.logger.Info().
		Str("device_type", deviceType).
		Str("previous_status", previousStatus.String()).
		Str("new_status", newStatus.String()).
		Float64("health_score", metrics.HealthScore).
		Msg("device health status changed")
}

// calculateHealthStatus determines health status based on metrics, checking
// the most severe conditions first. The caller must hold dhm.mutex.
func (dhm *DeviceHealthMonitor) calculateHealthStatus(metrics *DeviceHealthMetrics) DeviceHealthStatus {
	consecutiveErrors := metrics.ConsecutiveErrors
	totalErrors := metrics.TotalErrors

	// Critical: too many consecutive errors.
	if consecutiveErrors >= int64(dhm.recoveryThreshold) {
		return DeviceHealthCritical
	}

	// Critical: there has been a successful operation, but not for longer
	// than the supervisor timeout.
	if !metrics.LastSuccessfulOp.IsZero() &&
		time.Since(metrics.LastSuccessfulOp) > time.Duration(dhm.config.SupervisorTimeout) {
		return DeviceHealthCritical
	}

	// Failing: high error rate or frequent latency spikes.
	if metrics.ErrorRate > dhm.errorRateLimit ||
		metrics.LatencySpikes > int64(dhm.config.MaxDroppedFrames) {
		return DeviceHealthFailing
	}

	// Degraded: some errors or performance issues.
	if consecutiveErrors > 0 ||
		totalErrors > int64(dhm.config.MaxDroppedFrames/2) ||
		metrics.AverageLatency > dhm.latencyThreshold {
		return DeviceHealthDegraded
	}

	// Healthy: no significant issues.
	return DeviceHealthHealthy
}

// calculateHealthScore calculates a numeric health score (0.0 to 1.0) by
// starting at 1.0 and subtracting capped penalties. The caller must hold
// dhm.mutex.
func (dhm *DeviceHealthMonitor) calculateHealthScore(metrics *DeviceHealthMetrics) float64 {
	score := 1.0

	// Penalize consecutive errors: 0.1 per error.
	if metrics.ConsecutiveErrors > 0 {
		score -= float64(metrics.ConsecutiveErrors) * 0.1
	}

	// Penalize high error rate: up to 0.5, reached at the configured limit.
	if metrics.ErrorRate > 0 {
		score -= min(0.5, metrics.ErrorRate/dhm.errorRateLimit*0.5)
	}

	// Penalize high latency: up to 0.3, scaled by the relative excess over
	// the threshold.
	if metrics.AverageLatency > dhm.latencyThreshold {
		excess := float64(metrics.AverageLatency-dhm.latencyThreshold) / float64(dhm.latencyThreshold)
		score -= min(0.3, excess*0.3)
	}

	// Penalize underruns/overruns: 0.01 per event, up to 0.2.
	if xruns := metrics.Underruns + metrics.Overruns; xruns > 0 {
		score -= min(0.2, float64(xruns)*0.01)
	}

	return max(0.0, score)
}

// checkRecoveryNeeded triggers recovery when the device is in the critical
// state. The caller must hold dhm.mutex.
func (dhm *DeviceHealthMonitor) checkRecoveryNeeded(deviceType string, metrics *DeviceHealthMetrics) {
	if metrics.CurrentStatus == DeviceHealthCritical {
		dhm.triggerRecovery(deviceType, metrics)
	}
}

// triggerRecovery triggers recovery for a device by running every registered
// recovery callback, each in its own goroutine. The caller must hold
// dhm.mutex.
//
// NOTE(review): this runs on every health check while the device stays
// critical, and all callbacks are invoked regardless of deviceType — confirm
// callbacks tolerate repeated, overlapping invocations.
func (dhm *DeviceHealthMonitor) triggerRecovery(deviceType string, metrics *DeviceHealthMetrics) {
	metrics.RecoveryAttempts++

	dhm.logger.Warn().
		Str("device_type", deviceType).
		Str("status", metrics.CurrentStatus.String()).
		Int64("consecutive_errors", metrics.ConsecutiveErrors).
		Float64("error_rate", metrics.ErrorRate).
		Msg("triggering device recovery")

	// Try registered recovery callbacks
	dhm.callbackMutex.RLock()
	defer dhm.callbackMutex.RUnlock()

	for component, callback := range dhm.recoveryCallbacks {
		if callback == nil {
			continue
		}
		go func(comp string, cb func() error) {
			if err := cb(); err != nil {
				dhm.logger.Error().
					Str("component", comp).
					Str("device_type", deviceType).
					Err(err).
					Msg("recovery callback failed")
				return
			}

			// This goroutine runs outside the caller's critical section, so
			// take the metrics lock for the update.
			dhm.mutex.Lock()
			metrics.SuccessfulRecoveries++
			dhm.mutex.Unlock()

			dhm.logger.Info().
				Str("component", comp).
				Str("device_type", deviceType).
				Msg("recovery callback succeeded")
		}(component, callback)
	}
}

// Global device health monitor instance
var (
	globalDeviceHealthMonitor *DeviceHealthMonitor
	deviceHealthOnce          sync.Once
)

// GetDeviceHealthMonitor returns the global device health monitor, creating
// it on first use.
func GetDeviceHealthMonitor() *DeviceHealthMonitor {
	deviceHealthOnce.Do(func() {
		globalDeviceHealthMonitor = NewDeviceHealthMonitor()
	})
	return globalDeviceHealthMonitor
}

// Helper functions for min/max

// min returns the smaller of a and b.
func min(a, b float64) float64 {
	if a < b {
		return a
	}
	return b
}

// max returns the larger of a and b.
func max(a, b float64) float64 {
	if a > b {
		return a
	}
	return b
}