diff --git a/internal/timesync/http.go b/internal/timesync/http.go index 14b4633..908fd18 100644 --- a/internal/timesync/http.go +++ b/internal/timesync/http.go @@ -5,9 +5,20 @@ import ( "errors" "math/rand" "net/http" + "strconv" "time" ) +var defaultHTTPUrls = []string{ + "http://www.gstatic.com/generate_204", + "http://cp.cloudflare.com/", + "http://edge-http.microsoft.com/captiveportal/generate_204", + // Firefox, Apple, and Microsoft have inconsistent results, so we don't use it + // "http://detectportal.firefox.com/", + // "http://www.apple.com/library/test/success.html", + // "http://www.msftconnecttest.com/connecttest.txt", +} + func (t *TimeSync) queryAllHttpTime() (now *time.Time) { chunkSize := 4 httpUrls := t.httpUrls @@ -38,6 +49,9 @@ func (t *TimeSync) queryMultipleHttp(urls []string, timeout time.Duration) (now Str("http_url", url). Logger() + metricHttpRequestCount.WithLabelValues(url).Inc() + metricHttpTotalRequestCount.Inc() + startTime := time.Now() now, err, response := queryHttpTime( ctx, @@ -46,12 +60,22 @@ func (t *TimeSync) queryMultipleHttp(urls []string, timeout time.Duration) (now ) duration := time.Since(startTime) - var status int + metricHttpServerLastRTT.WithLabelValues(url).Set(float64(duration.Milliseconds())) + metricHttpServerRttHistogram.WithLabelValues(url).Observe(float64(duration.Milliseconds())) + + status := 0 if response != nil { status = response.StatusCode } + metricHttpServerInfo.WithLabelValues( + url, + strconv.Itoa(status), + ).Set(1) if err == nil { + metricHttpTotalSuccessCount.Inc() + metricHttpSuccessCount.WithLabelValues(url).Inc() + requestId := response.Header.Get("X-Request-Id") if requestId != "" { requestId = response.Header.Get("X-Msedge-Ref") @@ -68,7 +92,10 @@ func (t *TimeSync) queryMultipleHttp(urls []string, timeout time.Duration) (now cancel() results <- now - } else if !errors.Is(err, context.Canceled) { + } else if errors.Is(err, context.Canceled) { + metricHttpCancelCount.WithLabelValues(url).Inc() + metricHttpTotalCancelCount.Inc() + } else { scopedLogger.Warn(). Str("error", err.Error()). Int("status", status). diff --git a/internal/timesync/metrics.go b/internal/timesync/metrics.go new file mode 100644 index 0000000..0401f3a --- /dev/null +++ b/internal/timesync/metrics.go @@ -0,0 +1,147 @@ +package timesync + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + metricTimeSyncStatus = promauto.NewGauge( + prometheus.GaugeOpts{ + Name: "jetkvm_timesync_status", + Help: "The status of the timesync, 1 if successful, 0 if not", + }, + ) + metricTimeSyncCount = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_count", + Help: "The number of times the timesync has been run", + }, + ) + metricTimeSyncSuccessCount = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_success_count", + Help: "The number of times the timesync has been successful", + }, + ) + metricRTCUpdateCount = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_rtc_update_count", + Help: "The number of times the RTC has been updated", + }, + ) + metricNtpTotalSuccessCount = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_ntp_total_success_count", + Help: "The total number of successful NTP requests", + }, + ) + metricNtpTotalRequestCount = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_ntp_total_request_count", + Help: "The total number of NTP requests sent", + }, + ) + metricNtpSuccessCount = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_ntp_success_count", + Help: "The number of successful NTP requests", + }, + []string{"url"}, + ) + metricNtpRequestCount = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_ntp_request_count", + Help: "The number of NTP requests sent to the server", + }, + []string{"url"}, + ) + metricNtpServerLastRTT = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "jetkvm_timesync_ntp_server_last_rtt", + Help: "The last RTT of the NTP server in milliseconds", + }, + []string{"url"}, + ) + metricNtpServerRttHistogram = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "jetkvm_timesync_ntp_server_rtt", + Help: "The histogram of the RTT of the NTP server in milliseconds", + Buckets: []float64{ + 10, 25, 50, 100, 200, 300, 500, 1000, + }, + }, + []string{"url"}, + ) + metricNtpServerInfo = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "jetkvm_timesync_ntp_server_info", + Help: "The info of the NTP server", + }, + []string{"url", "reference", "stratum", "precision"}, + ) + + metricHttpTotalSuccessCount = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_http_total_success_count", + Help: "The total number of successful HTTP requests", + }, + ) + metricHttpTotalRequestCount = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_http_total_request_count", + Help: "The total number of HTTP requests sent", + }, + ) + metricHttpTotalCancelCount = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_http_total_cancel_count", + Help: "The total number of HTTP requests cancelled", + }, + ) + metricHttpSuccessCount = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_http_success_count", + Help: "The number of successful HTTP requests", + }, + []string{"url"}, + ) + metricHttpRequestCount = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_http_request_count", + Help: "The number of HTTP requests sent to the server", + }, + []string{"url"}, + ) + metricHttpCancelCount = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "jetkvm_timesync_http_cancel_count", + Help: "The number of HTTP requests cancelled", + }, + []string{"url"}, + ) + metricHttpServerLastRTT = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "jetkvm_timesync_http_server_last_rtt", + Help: "The last RTT of the HTTP server in milliseconds", + }, + []string{"url"}, + ) + metricHttpServerRttHistogram = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "jetkvm_timesync_http_server_rtt", + Help: "The histogram of the RTT of the HTTP server in milliseconds", + Buckets: []float64{ + 10, 25, 50, 100, 200, 300, 500, 1000, + }, + }, + []string{"url"}, + ) + metricHttpServerInfo = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "jetkvm_timesync_http_server_info", + Help: "The info of the HTTP server", + }, + []string{"url", "http_code"}, + ) +) diff --git a/internal/timesync/ntp.go b/internal/timesync/ntp.go index eb15ff9..34e87c9 100644 --- a/internal/timesync/ntp.go +++ b/internal/timesync/ntp.go @@ -2,12 +2,25 @@ package timesync import ( "math/rand/v2" + "strconv" "time" "github.com/beevik/ntp" ) -func (t *TimeSync) queryNetworkTime() (now *time.Time) { +var defaultNTPServers = []string{ + "time.apple.com", + "time.aws.com", + "time.windows.com", + "time.google.com", + "162.159.200.123", // time.cloudflare.com + "0.pool.ntp.org", + "1.pool.ntp.org", + "2.pool.ntp.org", + "3.pool.ntp.org", +} + +func (t *TimeSync) queryNetworkTime() (now *time.Time, offset *time.Duration) { chunkSize := 4 ntpServers := t.ntpServers @@ -16,27 +29,58 @@ func (t *TimeSync) queryNetworkTime() (now *time.Time) { for i := 0; i < len(ntpServers); i += chunkSize { chunk := ntpServers[i:min(i+chunkSize, len(ntpServers))] - results := t.queryMultipleNTP(chunk, timeSyncTimeout) - if results != nil { - return results + now, offset := t.queryMultipleNTP(chunk, timeSyncTimeout) + if now != nil { + return now, offset } } - return nil + return nil, nil } -func (t *TimeSync) queryMultipleNTP(servers []string, timeout time.Duration) (now *time.Time) { - results := make(chan *time.Time, len(servers)) +type ntpResult struct { + now *time.Time + offset *time.Duration +} +func (t *TimeSync) queryMultipleNTP(servers []string, timeout time.Duration) (now *time.Time, offset *time.Duration) { + results := make(chan *ntpResult, len(servers)) for _, server := range servers { go func(server string) { scopedLogger := t.l.With(). Str("server", server). Logger() + // increase request count + metricNtpTotalRequestCount.Inc() + metricNtpRequestCount.WithLabelValues(server).Inc() + + // query the server now, err, response := queryNtpServer(server, timeout) + // set the last RTT + metricNtpServerLastRTT.WithLabelValues( + server, + ).Set(float64(response.RTT.Milliseconds())) + + // set the RTT histogram + metricNtpServerRttHistogram.WithLabelValues( + server, + ).Observe(float64(response.RTT.Milliseconds())) + + // set the server info + metricNtpServerInfo.WithLabelValues( + server, + response.ReferenceString(), + strconv.Itoa(int(response.Stratum)), + strconv.Itoa(int(response.Precision)), + ).Set(1) + if err == nil { + // increase success count + metricNtpTotalSuccessCount.Inc() + metricNtpSuccessCount.WithLabelValues(server).Inc() + scopedLogger.Info(). Str("time", now.Format(time.RFC3339)). Str("reference", response.ReferenceString()). @@ -44,7 +88,10 @@ func (t *TimeSync) queryMultipleNTP(servers []string, timeout time.Duration) (no Str("clockOffset", response.ClockOffset.String()). Uint8("stratum", response.Stratum). Msg("NTP server returned time") - results <- now + results <- &ntpResult{ + now: now, + offset: &response.ClockOffset, + } } else { scopedLogger.Warn(). Str("error", err.Error()). @@ -53,7 +100,8 @@ func (t *TimeSync) queryMultipleNTP(servers []string, timeout time.Duration) (no }(server) } - return <-results + result := <-results + return result.now, result.offset } func queryNtpServer(server string, timeout time.Duration) (now *time.Time, err error, response *ntp.Response) { diff --git a/internal/timesync/rtc_linux.go b/internal/timesync/rtc_linux.go index dccfab2..27e4ec7 100644 --- a/internal/timesync/rtc_linux.go +++ b/internal/timesync/rtc_linux.go @@ -99,5 +99,7 @@ func (t *TimeSync) setRtcTime(tu time.Time) error { return fmt.Errorf("failed to set RTC time: %w", err) } + metricRTCUpdateCount.Inc() + return nil } diff --git a/internal/timesync/timesync.go b/internal/timesync/timesync.go index 1447201..88d6e9d 100644 --- a/internal/timesync/timesync.go +++ b/internal/timesync/timesync.go @@ -7,6 +7,7 @@ import ( "sync" "time" + "github.com/jetkvm/kvm/internal/network" "github.com/rs/zerolog" ) @@ -27,8 +28,9 @@ type TimeSync struct { syncLock *sync.Mutex l *zerolog.Logger - ntpServers []string - httpUrls []string + ntpServers []string + httpUrls []string + networkConfig *network.NetworkConfig rtcDevicePath string rtcDevice *os.File @@ -39,27 +41,37 @@ type TimeSync struct { preCheckFunc func() (bool, error) } -func NewTimeSync( - precheckFunc func() (bool, error), - ntpServers []string, - httpUrls []string, - logger *zerolog.Logger, -) *TimeSync { +type TimeSyncOptions struct { + PreCheckFunc func() (bool, error) + Logger *zerolog.Logger + NetworkConfig *network.NetworkConfig +} + +type SyncMode struct { + Ntp bool + Http bool + Ordering []string + NtpUseFallback bool + HttpUseFallback bool +} + +func NewTimeSync(opts *TimeSyncOptions) *TimeSync { rtcDevice, err := getRtcDevicePath() if err != nil { - logger.Error().Err(err).Msg("failed to get RTC device path") + opts.Logger.Error().Err(err).Msg("failed to get RTC device path") } else { - logger.Info().Str("path", rtcDevice).Msg("RTC device found") + opts.Logger.Info().Str("path", rtcDevice).Msg("RTC device found") } t := &TimeSync{ syncLock: &sync.Mutex{}, - l: logger, + l: opts.Logger, rtcDevicePath: rtcDevice, rtcLock: &sync.Mutex{}, - preCheckFunc: precheckFunc, - ntpServers: ntpServers, - httpUrls: httpUrls, + preCheckFunc: opts.PreCheckFunc, + ntpServers: defaultNTPServers, + httpUrls: defaultHTTPUrls, + networkConfig: opts.NetworkConfig, } if t.rtcDevicePath != "" { @@ -70,7 +82,36 @@ func NewTimeSync( return t } +func (t *TimeSync) getSyncMode() SyncMode { + syncMode := SyncMode{ + NtpUseFallback: true, + HttpUseFallback: true, + } + var syncModeString string + + if t.networkConfig != nil { + syncModeString = t.networkConfig.TimeSyncMode + if t.networkConfig.TimeSyncDisableFallback { + syncMode.NtpUseFallback = false + syncMode.HttpUseFallback = false + } + } + + switch syncModeString { + case "ntp_only": + syncMode.Ntp = true + case "http_only": + syncMode.Http = true + default: + syncMode.Ntp = true + syncMode.Http = true + } + + return syncMode +} + func (t *TimeSync) doTimeSync() { + metricTimeSyncStatus.Set(0) for { if ok, err := t.preCheckFunc(); !ok { if err != nil { @@ -101,14 +142,27 @@ func (t *TimeSync) doTimeSync() { Str("time_taken", time.Since(start).String()). Msg("time sync successful") + metricTimeSyncStatus.Set(1) + time.Sleep(timeSyncInterval) // after the first sync is done } } func (t *TimeSync) Sync() error { - var now *time.Time - now = t.queryNetworkTime() - if now == nil { + var ( + now *time.Time + offset *time.Duration + ) + + syncMode := t.getSyncMode() + + metricTimeSyncCount.Inc() + + if syncMode.Ntp { + now, offset = t.queryNetworkTime() + } + + if syncMode.Http && now == nil { now = t.queryAllHttpTime() } @@ -116,11 +170,18 @@ func (t *TimeSync) Sync() error { return fmt.Errorf("failed to get time from any source") } + if offset != nil { + newNow := time.Now().Add(*offset) + now = &newNow + } + err := t.setSystemTime(*now) if err != nil { return fmt.Errorf("failed to set system time: %w", err) } + metricTimeSyncSuccessCount.Inc() + return nil } diff --git a/timesync.go b/timesync.go index ed95ffd..7b25fe2 100644 --- a/timesync.go +++ b/timesync.go @@ -8,27 +8,7 @@ import ( ) var ( - timeSync *timesync.TimeSync - defaultNTPServers = []string{ - "time.apple.com", - "time.aws.com", - "time.windows.com", - "time.google.com", - "162.159.200.123", // time.cloudflare.com - "0.pool.ntp.org", - "1.pool.ntp.org", - "2.pool.ntp.org", - "3.pool.ntp.org", - } - defaultHTTPUrls = []string{ - "http://www.gstatic.com/generate_204", - "http://cp.cloudflare.com/", - "http://edge-http.microsoft.com/captiveportal/generate_204", - // Firefox, Apple, and Microsoft have inconsistent results, so we don't use it - // "http://detectportal.firefox.com/", - // "http://www.apple.com/library/test/success.html", - // "http://www.msftconnecttest.com/connecttest.txt", - } + timeSync *timesync.TimeSync builtTimestamp string ) @@ -60,15 +40,14 @@ func isTimeSyncNeeded() bool { } func initTimeSync() { - timeSync = timesync.NewTimeSync( - func() (bool, error) { + timeSync = timesync.NewTimeSync(×ync.TimeSyncOptions{ + Logger: timesyncLogger, + NetworkConfig: config.NetworkConfig, + PreCheckFunc: func() (bool, error) { if !networkState.IsOnline() { return false, nil } return true, nil }, - defaultNTPServers, - defaultHTTPUrls, - timesyncLogger, - ) + }) }