mirror of https://github.com/jetkvm/kvm.git
515 lines
14 KiB
Go
515 lines
14 KiB
Go
package audio
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/jetkvm/kvm/internal/logging"
|
|
"github.com/rs/zerolog"
|
|
)
|
|
|
|
// DeviceHealthStatus is the coarse health classification assigned to an audio
// device. The zero value is DeviceHealthUnknown.
type DeviceHealthStatus int

// The health states a device can be in.
const (
	// DeviceHealthUnknown is the state before any assessment has been made.
	DeviceHealthUnknown DeviceHealthStatus = iota
	DeviceHealthHealthy
	DeviceHealthDegraded
	DeviceHealthFailing
	DeviceHealthCritical
)

// String returns the lowercase name of the status. DeviceHealthUnknown and any
// value outside the declared constants are both rendered as "unknown".
func (s DeviceHealthStatus) String() string {
	names := [...]string{
		DeviceHealthUnknown:  "unknown",
		DeviceHealthHealthy:  "healthy",
		DeviceHealthDegraded: "degraded",
		DeviceHealthFailing:  "failing",
		DeviceHealthCritical: "critical",
	}

	// Values that do not correspond to a declared constant have no table entry.
	if s < 0 || int(s) >= len(names) {
		return "unknown"
	}
	return names[s]
}
|
|
|
|
// DeviceHealthMetrics tracks health-related metrics for audio devices
|
|
type DeviceHealthMetrics struct {
|
|
// Error tracking
|
|
ConsecutiveErrors int64 `json:"consecutive_errors"`
|
|
TotalErrors int64 `json:"total_errors"`
|
|
LastErrorTime time.Time `json:"last_error_time"`
|
|
ErrorRate float64 `json:"error_rate"` // errors per minute
|
|
|
|
// Performance metrics
|
|
AverageLatency time.Duration `json:"average_latency"`
|
|
MaxLatency time.Duration `json:"max_latency"`
|
|
LatencySpikes int64 `json:"latency_spikes"`
|
|
Underruns int64 `json:"underruns"`
|
|
Overruns int64 `json:"overruns"`
|
|
|
|
// Device availability
|
|
LastSuccessfulOp time.Time `json:"last_successful_op"`
|
|
DeviceDisconnects int64 `json:"device_disconnects"`
|
|
RecoveryAttempts int64 `json:"recovery_attempts"`
|
|
SuccessfulRecoveries int64 `json:"successful_recoveries"`
|
|
|
|
// Health assessment
|
|
CurrentStatus DeviceHealthStatus `json:"current_status"`
|
|
StatusLastChanged time.Time `json:"status_last_changed"`
|
|
HealthScore float64 `json:"health_score"` // 0.0 to 1.0
|
|
}
|
|
|
|
// DeviceHealthMonitor monitors the health of audio devices and triggers recovery
|
|
type DeviceHealthMonitor struct {
|
|
// Atomic fields first for ARM32 alignment
|
|
running int32
|
|
monitoringEnabled int32
|
|
|
|
// Configuration
|
|
checkInterval time.Duration
|
|
recoveryThreshold int
|
|
latencyThreshold time.Duration
|
|
errorRateLimit float64 // max errors per minute
|
|
|
|
// State tracking
|
|
captureMetrics *DeviceHealthMetrics
|
|
playbackMetrics *DeviceHealthMetrics
|
|
mutex sync.RWMutex
|
|
|
|
// Control channels
|
|
ctx context.Context
|
|
cancel context.CancelFunc
|
|
stopChan chan struct{}
|
|
doneChan chan struct{}
|
|
|
|
// Recovery callbacks
|
|
recoveryCallbacks map[string]func() error
|
|
callbackMutex sync.RWMutex
|
|
|
|
// Logging
|
|
logger zerolog.Logger
|
|
config *AudioConfigConstants
|
|
}
|
|
|
|
// NewDeviceHealthMonitor creates a new device health monitor
|
|
func NewDeviceHealthMonitor() *DeviceHealthMonitor {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
config := GetConfig()
|
|
|
|
return &DeviceHealthMonitor{
|
|
checkInterval: time.Duration(config.HealthCheckIntervalMS) * time.Millisecond,
|
|
recoveryThreshold: config.HealthRecoveryThreshold,
|
|
latencyThreshold: time.Duration(config.HealthLatencyThresholdMS) * time.Millisecond,
|
|
errorRateLimit: config.HealthErrorRateLimit,
|
|
captureMetrics: &DeviceHealthMetrics{
|
|
CurrentStatus: DeviceHealthUnknown,
|
|
HealthScore: 1.0,
|
|
},
|
|
playbackMetrics: &DeviceHealthMetrics{
|
|
CurrentStatus: DeviceHealthUnknown,
|
|
HealthScore: 1.0,
|
|
},
|
|
ctx: ctx,
|
|
cancel: cancel,
|
|
stopChan: make(chan struct{}),
|
|
doneChan: make(chan struct{}),
|
|
recoveryCallbacks: make(map[string]func() error),
|
|
logger: logging.GetDefaultLogger().With().Str("component", "device-health-monitor").Logger(),
|
|
config: config,
|
|
}
|
|
}
|
|
|
|
// Start begins health monitoring
|
|
func (dhm *DeviceHealthMonitor) Start() error {
|
|
if !atomic.CompareAndSwapInt32(&dhm.running, 0, 1) {
|
|
return fmt.Errorf("device health monitor already running")
|
|
}
|
|
|
|
dhm.logger.Info().Msg("starting device health monitor")
|
|
atomic.StoreInt32(&dhm.monitoringEnabled, 1)
|
|
|
|
go dhm.monitoringLoop()
|
|
return nil
|
|
}
|
|
|
|
// Stop stops health monitoring
|
|
func (dhm *DeviceHealthMonitor) Stop() {
|
|
if !atomic.CompareAndSwapInt32(&dhm.running, 1, 0) {
|
|
return
|
|
}
|
|
|
|
dhm.logger.Info().Msg("stopping device health monitor")
|
|
atomic.StoreInt32(&dhm.monitoringEnabled, 0)
|
|
|
|
close(dhm.stopChan)
|
|
dhm.cancel()
|
|
|
|
// Wait for monitoring loop to finish
|
|
select {
|
|
case <-dhm.doneChan:
|
|
dhm.logger.Info().Msg("device health monitor stopped")
|
|
case <-time.After(time.Duration(dhm.config.SupervisorTimeout)):
|
|
dhm.logger.Warn().Msg("device health monitor stop timeout")
|
|
}
|
|
}
|
|
|
|
// RegisterRecoveryCallback registers a recovery function for a specific component
|
|
func (dhm *DeviceHealthMonitor) RegisterRecoveryCallback(component string, callback func() error) {
|
|
dhm.callbackMutex.Lock()
|
|
defer dhm.callbackMutex.Unlock()
|
|
dhm.recoveryCallbacks[component] = callback
|
|
dhm.logger.Info().Str("component", component).Msg("registered recovery callback")
|
|
}
|
|
|
|
// RecordError records an error for health tracking
|
|
func (dhm *DeviceHealthMonitor) RecordError(deviceType string, err error) {
|
|
if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
|
|
return
|
|
}
|
|
|
|
dhm.mutex.Lock()
|
|
defer dhm.mutex.Unlock()
|
|
|
|
var metrics *DeviceHealthMetrics
|
|
switch deviceType {
|
|
case "capture":
|
|
metrics = dhm.captureMetrics
|
|
case "playback":
|
|
metrics = dhm.playbackMetrics
|
|
default:
|
|
dhm.logger.Warn().Str("device_type", deviceType).Msg("unknown device type for error recording")
|
|
return
|
|
}
|
|
|
|
atomic.AddInt64(&metrics.ConsecutiveErrors, 1)
|
|
atomic.AddInt64(&metrics.TotalErrors, 1)
|
|
metrics.LastErrorTime = time.Now()
|
|
|
|
// Update error rate (errors per minute)
|
|
if !metrics.LastErrorTime.IsZero() {
|
|
timeSinceFirst := time.Since(metrics.LastErrorTime)
|
|
if timeSinceFirst > 0 {
|
|
metrics.ErrorRate = float64(metrics.TotalErrors) / timeSinceFirst.Minutes()
|
|
}
|
|
}
|
|
|
|
dhm.logger.Debug().
|
|
Str("device_type", deviceType).
|
|
Err(err).
|
|
Int64("consecutive_errors", metrics.ConsecutiveErrors).
|
|
Float64("error_rate", metrics.ErrorRate).
|
|
Msg("recorded device error")
|
|
|
|
// Trigger immediate health assessment
|
|
dhm.assessDeviceHealth(deviceType, metrics)
|
|
}
|
|
|
|
// RecordSuccess records a successful operation
|
|
func (dhm *DeviceHealthMonitor) RecordSuccess(deviceType string) {
|
|
if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
|
|
return
|
|
}
|
|
|
|
dhm.mutex.Lock()
|
|
defer dhm.mutex.Unlock()
|
|
|
|
var metrics *DeviceHealthMetrics
|
|
switch deviceType {
|
|
case "capture":
|
|
metrics = dhm.captureMetrics
|
|
case "playback":
|
|
metrics = dhm.playbackMetrics
|
|
default:
|
|
return
|
|
}
|
|
|
|
// Reset consecutive errors on success
|
|
atomic.StoreInt64(&metrics.ConsecutiveErrors, 0)
|
|
metrics.LastSuccessfulOp = time.Now()
|
|
|
|
// Improve health score gradually
|
|
if metrics.HealthScore < 1.0 {
|
|
metrics.HealthScore = min(1.0, metrics.HealthScore+0.1)
|
|
}
|
|
}
|
|
|
|
// RecordLatency records operation latency for health assessment
|
|
func (dhm *DeviceHealthMonitor) RecordLatency(deviceType string, latency time.Duration) {
|
|
if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
|
|
return
|
|
}
|
|
|
|
dhm.mutex.Lock()
|
|
defer dhm.mutex.Unlock()
|
|
|
|
var metrics *DeviceHealthMetrics
|
|
switch deviceType {
|
|
case "capture":
|
|
metrics = dhm.captureMetrics
|
|
case "playback":
|
|
metrics = dhm.playbackMetrics
|
|
default:
|
|
return
|
|
}
|
|
|
|
// Update latency metrics
|
|
if metrics.AverageLatency == 0 {
|
|
metrics.AverageLatency = latency
|
|
} else {
|
|
// Exponential moving average
|
|
metrics.AverageLatency = time.Duration(float64(metrics.AverageLatency)*0.9 + float64(latency)*0.1)
|
|
}
|
|
|
|
if latency > metrics.MaxLatency {
|
|
metrics.MaxLatency = latency
|
|
}
|
|
|
|
// Track latency spikes
|
|
if latency > dhm.latencyThreshold {
|
|
atomic.AddInt64(&metrics.LatencySpikes, 1)
|
|
}
|
|
}
|
|
|
|
// RecordUnderrun records an audio underrun event
|
|
func (dhm *DeviceHealthMonitor) RecordUnderrun(deviceType string) {
|
|
if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
|
|
return
|
|
}
|
|
|
|
dhm.mutex.Lock()
|
|
defer dhm.mutex.Unlock()
|
|
|
|
var metrics *DeviceHealthMetrics
|
|
switch deviceType {
|
|
case "capture":
|
|
metrics = dhm.captureMetrics
|
|
case "playback":
|
|
metrics = dhm.playbackMetrics
|
|
default:
|
|
return
|
|
}
|
|
|
|
atomic.AddInt64(&metrics.Underruns, 1)
|
|
dhm.logger.Debug().Str("device_type", deviceType).Msg("recorded audio underrun")
|
|
}
|
|
|
|
// RecordOverrun records an audio overrun event
|
|
func (dhm *DeviceHealthMonitor) RecordOverrun(deviceType string) {
|
|
if atomic.LoadInt32(&dhm.monitoringEnabled) == 0 {
|
|
return
|
|
}
|
|
|
|
dhm.mutex.Lock()
|
|
defer dhm.mutex.Unlock()
|
|
|
|
var metrics *DeviceHealthMetrics
|
|
switch deviceType {
|
|
case "capture":
|
|
metrics = dhm.captureMetrics
|
|
case "playback":
|
|
metrics = dhm.playbackMetrics
|
|
default:
|
|
return
|
|
}
|
|
|
|
atomic.AddInt64(&metrics.Overruns, 1)
|
|
dhm.logger.Debug().Str("device_type", deviceType).Msg("recorded audio overrun")
|
|
}
|
|
|
|
// GetHealthMetrics returns current health metrics
|
|
func (dhm *DeviceHealthMonitor) GetHealthMetrics() (capture, playback DeviceHealthMetrics) {
|
|
dhm.mutex.RLock()
|
|
defer dhm.mutex.RUnlock()
|
|
return *dhm.captureMetrics, *dhm.playbackMetrics
|
|
}
|
|
|
|
// monitoringLoop runs the main health monitoring loop
|
|
func (dhm *DeviceHealthMonitor) monitoringLoop() {
|
|
defer close(dhm.doneChan)
|
|
|
|
ticker := time.NewTicker(dhm.checkInterval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-dhm.stopChan:
|
|
return
|
|
case <-dhm.ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
dhm.performHealthCheck()
|
|
}
|
|
}
|
|
}
|
|
|
|
// performHealthCheck performs a comprehensive health check
|
|
func (dhm *DeviceHealthMonitor) performHealthCheck() {
|
|
dhm.mutex.Lock()
|
|
defer dhm.mutex.Unlock()
|
|
|
|
// Assess health for both devices
|
|
dhm.assessDeviceHealth("capture", dhm.captureMetrics)
|
|
dhm.assessDeviceHealth("playback", dhm.playbackMetrics)
|
|
|
|
// Check if recovery is needed
|
|
dhm.checkRecoveryNeeded("capture", dhm.captureMetrics)
|
|
dhm.checkRecoveryNeeded("playback", dhm.playbackMetrics)
|
|
}
|
|
|
|
// assessDeviceHealth assesses the health status of a device
|
|
func (dhm *DeviceHealthMonitor) assessDeviceHealth(deviceType string, metrics *DeviceHealthMetrics) {
|
|
previousStatus := metrics.CurrentStatus
|
|
newStatus := dhm.calculateHealthStatus(metrics)
|
|
|
|
if newStatus != previousStatus {
|
|
metrics.CurrentStatus = newStatus
|
|
metrics.StatusLastChanged = time.Now()
|
|
dhm.logger.Info().
|
|
Str("device_type", deviceType).
|
|
Str("previous_status", previousStatus.String()).
|
|
Str("new_status", newStatus.String()).
|
|
Float64("health_score", metrics.HealthScore).
|
|
Msg("device health status changed")
|
|
}
|
|
|
|
// Update health score
|
|
metrics.HealthScore = dhm.calculateHealthScore(metrics)
|
|
}
|
|
|
|
// calculateHealthStatus determines health status based on metrics
|
|
func (dhm *DeviceHealthMonitor) calculateHealthStatus(metrics *DeviceHealthMetrics) DeviceHealthStatus {
|
|
consecutiveErrors := atomic.LoadInt64(&metrics.ConsecutiveErrors)
|
|
totalErrors := atomic.LoadInt64(&metrics.TotalErrors)
|
|
|
|
// Critical: Too many consecutive errors or device disconnected recently
|
|
if consecutiveErrors >= int64(dhm.recoveryThreshold) {
|
|
return DeviceHealthCritical
|
|
}
|
|
|
|
// Critical: No successful operations in a long time
|
|
if !metrics.LastSuccessfulOp.IsZero() && time.Since(metrics.LastSuccessfulOp) > time.Duration(dhm.config.SupervisorTimeout) {
|
|
return DeviceHealthCritical
|
|
}
|
|
|
|
// Failing: High error rate or frequent latency spikes
|
|
if metrics.ErrorRate > dhm.errorRateLimit || atomic.LoadInt64(&metrics.LatencySpikes) > int64(dhm.config.MaxDroppedFrames) {
|
|
return DeviceHealthFailing
|
|
}
|
|
|
|
// Degraded: Some errors or performance issues
|
|
if consecutiveErrors > 0 || totalErrors > int64(dhm.config.MaxDroppedFrames/2) || metrics.AverageLatency > dhm.latencyThreshold {
|
|
return DeviceHealthDegraded
|
|
}
|
|
|
|
// Healthy: No significant issues
|
|
return DeviceHealthHealthy
|
|
}
|
|
|
|
// calculateHealthScore calculates a numeric health score (0.0 to 1.0)
|
|
func (dhm *DeviceHealthMonitor) calculateHealthScore(metrics *DeviceHealthMetrics) float64 {
|
|
score := 1.0
|
|
|
|
// Penalize consecutive errors
|
|
consecutiveErrors := atomic.LoadInt64(&metrics.ConsecutiveErrors)
|
|
if consecutiveErrors > 0 {
|
|
score -= float64(consecutiveErrors) * 0.1
|
|
}
|
|
|
|
// Penalize high error rate
|
|
if metrics.ErrorRate > 0 {
|
|
score -= min(0.5, metrics.ErrorRate/dhm.errorRateLimit*0.5)
|
|
}
|
|
|
|
// Penalize high latency
|
|
if metrics.AverageLatency > dhm.latencyThreshold {
|
|
excess := float64(metrics.AverageLatency-dhm.latencyThreshold) / float64(dhm.latencyThreshold)
|
|
score -= min(0.3, excess*0.3)
|
|
}
|
|
|
|
// Penalize underruns/overruns
|
|
underruns := atomic.LoadInt64(&metrics.Underruns)
|
|
overruns := atomic.LoadInt64(&metrics.Overruns)
|
|
if underruns+overruns > 0 {
|
|
score -= min(0.2, float64(underruns+overruns)*0.01)
|
|
}
|
|
|
|
return max(0.0, score)
|
|
}
|
|
|
|
// checkRecoveryNeeded checks if recovery is needed and triggers it
|
|
func (dhm *DeviceHealthMonitor) checkRecoveryNeeded(deviceType string, metrics *DeviceHealthMetrics) {
|
|
if metrics.CurrentStatus == DeviceHealthCritical {
|
|
dhm.triggerRecovery(deviceType, metrics)
|
|
}
|
|
}
|
|
|
|
// triggerRecovery triggers recovery for a device
|
|
func (dhm *DeviceHealthMonitor) triggerRecovery(deviceType string, metrics *DeviceHealthMetrics) {
|
|
atomic.AddInt64(&metrics.RecoveryAttempts, 1)
|
|
|
|
dhm.logger.Warn().
|
|
Str("device_type", deviceType).
|
|
Str("status", metrics.CurrentStatus.String()).
|
|
Int64("consecutive_errors", atomic.LoadInt64(&metrics.ConsecutiveErrors)).
|
|
Float64("error_rate", metrics.ErrorRate).
|
|
Msg("triggering device recovery")
|
|
|
|
// Try registered recovery callbacks
|
|
dhm.callbackMutex.RLock()
|
|
defer dhm.callbackMutex.RUnlock()
|
|
|
|
for component, callback := range dhm.recoveryCallbacks {
|
|
if callback != nil {
|
|
go func(comp string, cb func() error) {
|
|
if err := cb(); err != nil {
|
|
dhm.logger.Error().
|
|
Str("component", comp).
|
|
Str("device_type", deviceType).
|
|
Err(err).
|
|
Msg("recovery callback failed")
|
|
} else {
|
|
atomic.AddInt64(&metrics.SuccessfulRecoveries, 1)
|
|
dhm.logger.Info().
|
|
Str("component", comp).
|
|
Str("device_type", deviceType).
|
|
Msg("recovery callback succeeded")
|
|
}
|
|
}(component, callback)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Global device health monitor instance
|
|
var (
|
|
globalDeviceHealthMonitor *DeviceHealthMonitor
|
|
deviceHealthOnce sync.Once
|
|
)
|
|
|
|
// GetDeviceHealthMonitor returns the global device health monitor
|
|
func GetDeviceHealthMonitor() *DeviceHealthMonitor {
|
|
deviceHealthOnce.Do(func() {
|
|
globalDeviceHealthMonitor = NewDeviceHealthMonitor()
|
|
})
|
|
return globalDeviceHealthMonitor
|
|
}
|
|
|
|
// min returns a when a is strictly less than b, and b in every other case
// (including equal arguments and comparisons involving NaN).
func min(a, b float64) float64 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|
|
|
|
// max returns a when a is strictly greater than b, and b in every other case
// (including equal arguments and comparisons involving NaN).
func max(a, b float64) float64 {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}
|