// Source: kvm/internal/audio/device_health.go (Go, 515 lines, 14 KiB)

package audio
import (
"context"
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/jetkvm/kvm/internal/logging"
"github.com/rs/zerolog"
)
// DeviceHealthStatus is the coarse health classification assigned to an audio
// device. Values are ordered from DeviceHealthUnknown (the zero value) up to
// DeviceHealthCritical.
type DeviceHealthStatus int

const (
	// DeviceHealthUnknown is the zero value, used before any assessment.
	DeviceHealthUnknown DeviceHealthStatus = iota
	// DeviceHealthHealthy means no significant issues were found.
	DeviceHealthHealthy
	// DeviceHealthDegraded means some errors or performance issues were found.
	DeviceHealthDegraded
	// DeviceHealthFailing means a high error rate or frequent latency spikes.
	DeviceHealthFailing
	// DeviceHealthCritical is the status that makes the monitor trigger recovery.
	DeviceHealthCritical
)

// String returns the lowercase name of the status. DeviceHealthUnknown and any
// value outside the declared constants are reported as "unknown".
func (s DeviceHealthStatus) String() string {
	labels := [...]string{
		DeviceHealthHealthy:  "healthy",
		DeviceHealthDegraded: "degraded",
		DeviceHealthFailing:  "failing",
		DeviceHealthCritical: "critical",
	}
	// Index 0 (unknown) is intentionally left empty and excluded here.
	if s > DeviceHealthUnknown && int(s) < len(labels) {
		return labels[s]
	}
	return "unknown"
}
// DeviceHealthMetrics tracks health-related metrics for a single audio device.
//
// Field order matters: the int64 counters below are updated with sync/atomic
// (atomic.AddInt64 / LoadInt64 / StoreInt64). On 32-bit platforms such as ARM32
// those operations require 64-bit alignment, and only the first word of an
// allocated struct is guaranteed to have it. Keeping every counter in one
// contiguous run at the start of the struct puts each of them at a multiple of
// 8 bytes from that aligned first word. Do not insert other fields before or
// between them.
//
// The JSON keys are unchanged; only the order in which encoding/json emits them
// follows the field order.
type DeviceHealthMetrics struct {
	// Atomically updated counters (must stay first, see above).
	ConsecutiveErrors    int64 `json:"consecutive_errors"`    // errors since the last recorded success
	TotalErrors          int64 `json:"total_errors"`          // errors recorded in total
	LatencySpikes        int64 `json:"latency_spikes"`        // latencies above the configured threshold
	Underruns            int64 `json:"underruns"`             // recorded underrun events
	Overruns             int64 `json:"overruns"`              // recorded overrun events
	DeviceDisconnects    int64 `json:"device_disconnects"`    // grouped with the counters to keep the int64 run contiguous
	RecoveryAttempts     int64 `json:"recovery_attempts"`     // times recovery was triggered
	SuccessfulRecoveries int64 `json:"successful_recoveries"` // recovery callbacks that returned nil

	// Error tracking
	LastErrorTime time.Time `json:"last_error_time"`
	ErrorRate     float64   `json:"error_rate"` // errors per minute

	// Performance metrics
	AverageLatency time.Duration `json:"average_latency"`
	MaxLatency     time.Duration `json:"max_latency"`

	// Device availability
	LastSuccessfulOp time.Time `json:"last_successful_op"`

	// Health assessment
	CurrentStatus     DeviceHealthStatus `json:"current_status"`
	StatusLastChanged time.Time          `json:"status_last_changed"`
	HealthScore       float64            `json:"health_score"` // 0.0 to 1.0
}
// DeviceHealthMonitor keeps per-device health metrics for the capture and
// playback paths, re-evaluates them on a ticker, and invokes the registered
// recovery callbacks when a device is assessed as critical.
type DeviceHealthMonitor struct {
	// Flags accessed through sync/atomic; kept at the top of the struct
	// (original note: atomic fields first for ARM32 alignment).
	running, monitoringEnabled int32

	// Configuration, copied from the audio config when the monitor is built.
	checkInterval     time.Duration // period of the background health check
	recoveryThreshold int           // consecutive errors that mark a device critical
	latencyThreshold  time.Duration // latency above this counts as a spike
	errorRateLimit    float64       // max errors per minute

	// Per-device state; mutex guards the fields of both metrics structs.
	captureMetrics, playbackMetrics *DeviceHealthMetrics
	mutex                           sync.RWMutex

	// Lifecycle control for the monitoring goroutine.
	ctx                context.Context
	cancel             context.CancelFunc
	stopChan, doneChan chan struct{} // stopChan: closed by Stop; doneChan: closed when the loop exits

	// Recovery callbacks keyed by component name, guarded by callbackMutex.
	recoveryCallbacks map[string]func() error
	callbackMutex     sync.RWMutex

	// Logging and shared configuration.
	logger zerolog.Logger
	config *AudioConfigConstants
}
// NewDeviceHealthMonitor builds a monitor that is ready to be started.
//
// Intervals and thresholds are read from GetConfig(); both devices start with
// status DeviceHealthUnknown and a health score of 1.0. No goroutine is
// launched here — call Start for that.
func NewDeviceHealthMonitor() *DeviceHealthMonitor {
	ctx, cancel := context.WithCancel(context.Background())
	cfg := GetConfig()

	// Both devices begin from the same blank slate.
	freshMetrics := func() *DeviceHealthMetrics {
		return &DeviceHealthMetrics{
			CurrentStatus: DeviceHealthUnknown,
			HealthScore:   1.0,
		}
	}

	monitor := new(DeviceHealthMonitor)

	// Configuration values are stored in milliseconds in the config.
	monitor.checkInterval = time.Duration(cfg.HealthCheckIntervalMS) * time.Millisecond
	monitor.recoveryThreshold = cfg.HealthRecoveryThreshold
	monitor.latencyThreshold = time.Duration(cfg.HealthLatencyThresholdMS) * time.Millisecond
	monitor.errorRateLimit = cfg.HealthErrorRateLimit

	monitor.captureMetrics = freshMetrics()
	monitor.playbackMetrics = freshMetrics()

	monitor.ctx = ctx
	monitor.cancel = cancel
	monitor.stopChan = make(chan struct{})
	monitor.doneChan = make(chan struct{})

	monitor.recoveryCallbacks = make(map[string]func() error)
	monitor.logger = logging.GetDefaultLogger().With().Str("component", "device-health-monitor").Logger()
	monitor.config = cfg

	return monitor
}
// Start begins health monitoring.
//
// It flips the running flag from 0 to 1, enables metric recording and launches
// monitoringLoop in its own goroutine. It returns an error if the monitor is
// already running.
//
// Stop closes stopChan and cancels ctx, and the loop closes doneChan when it
// exits. Reusing that state on a later Start would make the new loop return
// immediately and panic closing doneChan a second time, so the context and
// both channels are recreated when stopChan is found closed. The first Start
// after construction keeps the state created by NewDeviceHealthMonitor.
//
// NOTE(review): if a previous Stop gave up waiting (timeout), the old loop may
// still be running when the state is swapped here — confirm whether callers
// can hit that case.
func (dhm *DeviceHealthMonitor) Start() error {
	if !atomic.CompareAndSwapInt32(&dhm.running, 0, 1) {
		return fmt.Errorf("device health monitor already running")
	}

	select {
	case <-dhm.stopChan:
		// stopChan is only ever closed, never sent on, so being able to
		// receive means a previous Stop has consumed the control state.
		dhm.ctx, dhm.cancel = context.WithCancel(context.Background())
		dhm.stopChan = make(chan struct{})
		dhm.doneChan = make(chan struct{})
	default:
	}

	dhm.logger.Info().Msg("starting device health monitor")
	atomic.StoreInt32(&dhm.monitoringEnabled, 1)
	go dhm.monitoringLoop()
	return nil
}
// Stop halts health monitoring. It is a no-op when the monitor is not running.
//
// Recording is disabled, the loop is signalled through both stopChan and the
// context, and Stop then waits for the loop to close doneChan. The wait is
// bounded by config.SupervisorTimeout; on timeout a warning is logged and Stop
// returns without the loop having confirmed its exit.
func (dhm *DeviceHealthMonitor) Stop() {
	wasRunning := atomic.CompareAndSwapInt32(&dhm.running, 1, 0)
	if !wasRunning {
		return
	}

	dhm.logger.Info().Msg("stopping device health monitor")
	atomic.StoreInt32(&dhm.monitoringEnabled, 0)

	// The loop selects on both of these, so either one wakes it up.
	close(dhm.stopChan)
	dhm.cancel()

	deadline := time.After(time.Duration(dhm.config.SupervisorTimeout))
	select {
	case <-deadline:
		dhm.logger.Warn().Msg("device health monitor stop timeout")
	case <-dhm.doneChan:
		dhm.logger.Info().Msg("device health monitor stopped")
	}
}
// RegisterRecoveryCallback stores callback under the given component name,
// replacing any callback previously registered for that name. The map is
// updated while holding callbackMutex for writing.
func (dhm *DeviceHealthMonitor) RegisterRecoveryCallback(component string, callback func() error) {
	guard := &dhm.callbackMutex
	guard.Lock()
	defer guard.Unlock()

	dhm.recoveryCallbacks[component] = callback

	event := dhm.logger.Info().Str("component", component)
	event.Msg("registered recovery callback")
}
// RecordError records an error for health tracking.
//
// deviceType must be "capture" or "playback"; any other value is logged as a
// warning and ignored. err is only used for the debug log entry. The call is a
// no-op while monitoring is disabled.
//
// The consecutive and total error counters are incremented, LastErrorTime is
// set to now, ErrorRate is refreshed, and the device's health is reassessed
// immediately while dhm.mutex is still held.
//
// ErrorRate (errors per minute) is derived from the spacing between this error
// and the previous one: one error per gap, expressed per minute. The previous
// timestamp must be captured before LastErrorTime is overwritten; measuring
// the elapsed time from the timestamp that was just written yields a window of
// a few nanoseconds and therefore a meaningless, enormous rate. The first
// error for a device has no previous timestamp and leaves ErrorRate unchanged.
func (dhm *DeviceHealthMonitor) RecordError(deviceType string, err error) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	var metrics *DeviceHealthMetrics
	switch deviceType {
	case "capture":
		metrics = dhm.captureMetrics
	case "playback":
		metrics = dhm.playbackMetrics
	default:
		dhm.logger.Warn().Str("device_type", deviceType).Msg("unknown device type for error recording")
		return
	}

	consecutive := atomic.AddInt64(&metrics.ConsecutiveErrors, 1)
	atomic.AddInt64(&metrics.TotalErrors, 1)

	now := time.Now()
	previousError := metrics.LastErrorTime
	metrics.LastErrorTime = now

	// Update error rate (errors per minute) from the inter-error gap.
	if !previousError.IsZero() {
		// A non-positive gap (identical timestamps) carries no rate
		// information and would divide by zero, so it is skipped.
		if gap := now.Sub(previousError); gap > 0 {
			metrics.ErrorRate = 1 / gap.Minutes()
		}
	}

	dhm.logger.Debug().
		Str("device_type", deviceType).
		Err(err).
		Int64("consecutive_errors", consecutive).
		Float64("error_rate", metrics.ErrorRate).
		Msg("recorded device error")

	// Trigger immediate health assessment
	dhm.assessDeviceHealth(deviceType, metrics)
}
// RecordSuccess notes a successful operation on the "capture" or "playback"
// device; other device types are silently ignored, as is every call made while
// monitoring is disabled.
//
// A success clears the consecutive-error counter, stamps LastSuccessfulOp and
// nudges a sub-1.0 health score upwards by 0.1, capped at 1.0.
func (dhm *DeviceHealthMonitor) RecordSuccess(deviceType string) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	var target *DeviceHealthMetrics
	if deviceType == "capture" {
		target = dhm.captureMetrics
	} else if deviceType == "playback" {
		target = dhm.playbackMetrics
	} else {
		return
	}

	// One success ends any running streak of errors.
	atomic.StoreInt64(&target.ConsecutiveErrors, 0)
	target.LastSuccessfulOp = time.Now()

	// Recover the score gradually rather than jumping straight back to 1.0.
	if current := target.HealthScore; current < 1.0 {
		target.HealthScore = min(1.0, current+0.1)
	}
}
// RecordLatency feeds one latency sample into the metrics of the "capture" or
// "playback" device; other device types are ignored, as is every call made
// while monitoring is disabled.
//
// The first sample seeds AverageLatency; later samples are blended in with an
// exponential moving average (90% old, 10% new). MaxLatency tracks the largest
// sample seen, and samples above latencyThreshold bump LatencySpikes.
func (dhm *DeviceHealthMonitor) RecordLatency(deviceType string, latency time.Duration) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	var m *DeviceHealthMetrics
	if deviceType == "capture" {
		m = dhm.captureMetrics
	} else if deviceType == "playback" {
		m = dhm.playbackMetrics
	} else {
		return
	}

	switch {
	case m.AverageLatency == 0:
		// A zero average is treated as "no sample yet".
		m.AverageLatency = latency
	default:
		m.AverageLatency = time.Duration(float64(m.AverageLatency)*0.9 + float64(latency)*0.1)
	}

	if m.MaxLatency < latency {
		m.MaxLatency = latency
	}

	if dhm.latencyThreshold < latency {
		atomic.AddInt64(&m.LatencySpikes, 1)
	}
}
// RecordUnderrun counts one underrun event against the "capture" or "playback"
// device and writes a debug log entry. Other device types are ignored, as is
// every call made while monitoring is disabled.
func (dhm *DeviceHealthMonitor) RecordUnderrun(deviceType string) {
	if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	// Start from capture and switch away from it only when asked to.
	target := dhm.captureMetrics
	switch deviceType {
	case "capture":
	case "playback":
		target = dhm.playbackMetrics
	default:
		return
	}

	atomic.AddInt64(&target.Underruns, 1)

	event := dhm.logger.Debug().Str("device_type", deviceType)
	event.Msg("recorded audio underrun")
}
// RecordOverrun counts one overrun event against the "capture" or "playback"
// device and writes a debug log entry. Other device types are ignored, as is
// every call made while monitoring is disabled.
func (dhm *DeviceHealthMonitor) RecordOverrun(deviceType string) {
	enabled := atomic.LoadInt32(&dhm.monitoringEnabled) != 0
	if !enabled {
		return
	}

	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	var target *DeviceHealthMetrics
	if deviceType == "capture" {
		target = dhm.captureMetrics
	} else if deviceType == "playback" {
		target = dhm.playbackMetrics
	}
	if target == nil && deviceType != "capture" && deviceType != "playback" {
		return
	}

	atomic.AddInt64(&target.Overruns, 1)
	dhm.logger.Debug().Str("device_type", deviceType).Msg("recorded audio overrun")
}
// GetHealthMetrics returns value copies of the capture and playback metrics,
// taken while holding dhm.mutex for reading.
func (dhm *DeviceHealthMonitor) GetHealthMetrics() (capture, playback DeviceHealthMetrics) {
	dhm.mutex.RLock()
	defer dhm.mutex.RUnlock()

	capture = *dhm.captureMetrics
	playback = *dhm.playbackMetrics
	return capture, playback
}
// monitoringLoop is the body of the background goroutine. It runs
// performHealthCheck on every tick of checkInterval and exits as soon as
// stopChan is closed or the context is cancelled, closing doneChan on the way
// out so that Stop can observe the exit.
func (dhm *DeviceHealthMonitor) monitoringLoop() {
	defer close(dhm.doneChan)

	tick := time.NewTicker(dhm.checkInterval)
	defer tick.Stop()

	for {
		select {
		case <-tick.C:
			dhm.performHealthCheck()
			continue
		case <-dhm.ctx.Done():
		case <-dhm.stopChan:
		}
		// Only the two shutdown cases fall through to here.
		return
	}
}
// performHealthCheck runs one full pass over both devices while holding
// dhm.mutex for writing: first every device is reassessed, then every device
// is checked for whether recovery has to be triggered.
func (dhm *DeviceHealthMonitor) performHealthCheck() {
	dhm.mutex.Lock()
	defer dhm.mutex.Unlock()

	devices := [...]struct {
		name    string
		metrics *DeviceHealthMetrics
	}{
		{"capture", dhm.captureMetrics},
		{"playback", dhm.playbackMetrics},
	}

	// Two separate passes: all assessments complete before any recovery check.
	for _, dev := range devices {
		dhm.assessDeviceHealth(dev.name, dev.metrics)
	}
	for _, dev := range devices {
		dhm.checkRecoveryNeeded(dev.name, dev.metrics)
	}
}
// assessDeviceHealth recomputes the health score and status of one device.
//
// deviceType is only used to label the log entry; metrics is updated in place.
// Both callers in this file (RecordError and performHealthCheck) invoke it with
// dhm.mutex already held.
//
// The score is refreshed before the status comparison so that the
// "device health status changed" log entry reports the score belonging to the
// same metrics that produced the new status, not the score left over from the
// previous assessment. The score and status calculations do not read each
// other's outputs, so the order does not affect either result.
func (dhm *DeviceHealthMonitor) assessDeviceHealth(deviceType string, metrics *DeviceHealthMetrics) {
	// Update health score
	metrics.HealthScore = dhm.calculateHealthScore(metrics)

	previousStatus := metrics.CurrentStatus
	newStatus := dhm.calculateHealthStatus(metrics)
	if newStatus == previousStatus {
		return
	}

	metrics.CurrentStatus = newStatus
	metrics.StatusLastChanged = time.Now()
	dhm.logger.Info().
		Str("device_type", deviceType).
		Str("previous_status", previousStatus.String()).
		Str("new_status", newStatus.String()).
		Float64("health_score", metrics.HealthScore).
		Msg("device health status changed")
}
// calculateHealthStatus maps the current metrics onto a DeviceHealthStatus.
// The rules are evaluated from most to least severe and the first match wins:
//
//   - critical: consecutive errors reached recoveryThreshold, or a success has
//     been recorded before but the last one is older than config.SupervisorTimeout
//   - failing: error rate above errorRateLimit, or more latency spikes than
//     config.MaxDroppedFrames
//   - degraded: any consecutive error, total errors above half of
//     config.MaxDroppedFrames, or average latency above latencyThreshold
//   - healthy: none of the above
func (dhm *DeviceHealthMonitor) calculateHealthStatus(metrics *DeviceHealthMetrics) DeviceHealthStatus {
	streak := atomic.LoadInt64(&metrics.ConsecutiveErrors)
	total := atomic.LoadInt64(&metrics.TotalErrors)

	switch {
	case streak >= int64(dhm.recoveryThreshold):
		return DeviceHealthCritical

	// A zero LastSuccessfulOp means no success was ever recorded; that alone
	// is not treated as critical.
	case !metrics.LastSuccessfulOp.IsZero() &&
		time.Since(metrics.LastSuccessfulOp) > time.Duration(dhm.config.SupervisorTimeout):
		return DeviceHealthCritical

	case metrics.ErrorRate > dhm.errorRateLimit ||
		atomic.LoadInt64(&metrics.LatencySpikes) > int64(dhm.config.MaxDroppedFrames):
		return DeviceHealthFailing

	case streak > 0 ||
		total > int64(dhm.config.MaxDroppedFrames/2) ||
		metrics.AverageLatency > dhm.latencyThreshold:
		return DeviceHealthDegraded

	default:
		return DeviceHealthHealthy
	}
}
// calculateHealthScore condenses the metrics into a single number in
// [0.0, 1.0]. It starts from 1.0 and subtracts, in this order:
//
//   - 0.1 per consecutive error
//   - up to 0.5 for the error rate, proportional to errorRateLimit
//   - up to 0.3 for average latency in excess of latencyThreshold
//   - up to 0.2 for underruns plus overruns, 0.01 each
//
// The result is clamped at 0.0 from below.
func (dhm *DeviceHealthMonitor) calculateHealthScore(metrics *DeviceHealthMetrics) float64 {
	result := 1.0

	if streak := atomic.LoadInt64(&metrics.ConsecutiveErrors); streak > 0 {
		result -= float64(streak) * 0.1
	}

	if rate := metrics.ErrorRate; rate > 0 {
		result -= min(0.5, rate/dhm.errorRateLimit*0.5)
	}

	if limit := dhm.latencyThreshold; metrics.AverageLatency > limit {
		// Fraction by which the average exceeds the threshold.
		over := float64(metrics.AverageLatency-limit) / float64(limit)
		result -= min(0.3, over*0.3)
	}

	xruns := atomic.LoadInt64(&metrics.Underruns) + atomic.LoadInt64(&metrics.Overruns)
	if xruns > 0 {
		result -= min(0.2, float64(xruns)*0.01)
	}

	return max(0.0, result)
}
// checkRecoveryNeeded triggers recovery for the device when, and only when,
// its current status is DeviceHealthCritical.
func (dhm *DeviceHealthMonitor) checkRecoveryNeeded(deviceType string, metrics *DeviceHealthMetrics) {
	if metrics.CurrentStatus != DeviceHealthCritical {
		return
	}
	dhm.triggerRecovery(deviceType, metrics)
}
// triggerRecovery counts a recovery attempt for the device, logs it, and then
// runs every registered, non-nil recovery callback in its own goroutine.
//
// All registered callbacks are invoked regardless of which device triggered
// the recovery. Each callback that returns nil increments
// SuccessfulRecoveries; each failure is logged. callbackMutex is held for
// reading only while the goroutines are being launched, not while they run.
func (dhm *DeviceHealthMonitor) triggerRecovery(deviceType string, metrics *DeviceHealthMetrics) {
	atomic.AddInt64(&metrics.RecoveryAttempts, 1)

	dhm.logger.Warn().
		Str("device_type", deviceType).
		Str("status", metrics.CurrentStatus.String()).
		Int64("consecutive_errors", atomic.LoadInt64(&metrics.ConsecutiveErrors)).
		Float64("error_rate", metrics.ErrorRate).
		Msg("triggering device recovery")

	// runCallback executes one callback and records its outcome.
	runCallback := func(name string, recoverFn func() error) {
		err := recoverFn()
		if err == nil {
			atomic.AddInt64(&metrics.SuccessfulRecoveries, 1)
			dhm.logger.Info().
				Str("component", name).
				Str("device_type", deviceType).
				Msg("recovery callback succeeded")
			return
		}
		dhm.logger.Error().
			Str("component", name).
			Str("device_type", deviceType).
			Err(err).
			Msg("recovery callback failed")
	}

	dhm.callbackMutex.RLock()
	defer dhm.callbackMutex.RUnlock()

	for name, recoverFn := range dhm.recoveryCallbacks {
		if recoverFn == nil {
			continue
		}
		go runCallback(name, recoverFn)
	}
}
// globalDeviceHealthMonitor is the process-wide monitor instance; it is
// created on first use by GetDeviceHealthMonitor.
var globalDeviceHealthMonitor *DeviceHealthMonitor

// deviceHealthOnce makes sure the global monitor is constructed exactly once.
var deviceHealthOnce sync.Once

// GetDeviceHealthMonitor returns the global device health monitor, building it
// with NewDeviceHealthMonitor on the first call. The returned monitor is not
// started by this function.
func GetDeviceHealthMonitor() *DeviceHealthMonitor {
	construct := func() {
		globalDeviceHealthMonitor = NewDeviceHealthMonitor()
	}
	deviceHealthOnce.Do(construct)
	return globalDeviceHealthMonitor
}
// Helper functions for min/max
// min returns a when a is strictly less than b, and b otherwise.
func min(a, b float64) float64 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
// max returns a when a is strictly greater than b, and b otherwise.
func max(a, b float64) float64 {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}