feat(timesync): add metrics

This commit is contained in:
Siyuan Miao 2025-04-15 00:46:01 +02:00
parent 4fe8cb4789
commit 5614b26a38
6 changed files with 319 additions and 55 deletions

View File

@ -5,9 +5,20 @@ import (
"errors" "errors"
"math/rand" "math/rand"
"net/http" "net/http"
"strconv"
"time" "time"
) )
var defaultHTTPUrls = []string{
"http://www.gstatic.com/generate_204",
"http://cp.cloudflare.com/",
"http://edge-http.microsoft.com/captiveportal/generate_204",
// Firefox, Apple, and Microsoft have inconsistent results, so we don't use it
// "http://detectportal.firefox.com/",
// "http://www.apple.com/library/test/success.html",
// "http://www.msftconnecttest.com/connecttest.txt",
}
func (t *TimeSync) queryAllHttpTime() (now *time.Time) { func (t *TimeSync) queryAllHttpTime() (now *time.Time) {
chunkSize := 4 chunkSize := 4
httpUrls := t.httpUrls httpUrls := t.httpUrls
@ -38,6 +49,9 @@ func (t *TimeSync) queryMultipleHttp(urls []string, timeout time.Duration) (now
Str("http_url", url). Str("http_url", url).
Logger() Logger()
metricHttpRequestCount.WithLabelValues(url).Inc()
metricHttpTotalRequestCount.Inc()
startTime := time.Now() startTime := time.Now()
now, err, response := queryHttpTime( now, err, response := queryHttpTime(
ctx, ctx,
@ -46,12 +60,22 @@ func (t *TimeSync) queryMultipleHttp(urls []string, timeout time.Duration) (now
) )
duration := time.Since(startTime) duration := time.Since(startTime)
var status int metricHttpServerLastRTT.WithLabelValues(url).Set(float64(duration.Milliseconds()))
metricHttpServerRttHistogram.WithLabelValues(url).Observe(float64(duration.Milliseconds()))
status := 0
if response != nil { if response != nil {
status = response.StatusCode status = response.StatusCode
} }
metricHttpServerInfo.WithLabelValues(
url,
strconv.Itoa(status),
).Set(1)
if err == nil { if err == nil {
metricHttpTotalSuccessCount.Inc()
metricHttpSuccessCount.WithLabelValues(url).Inc()
requestId := response.Header.Get("X-Request-Id") requestId := response.Header.Get("X-Request-Id")
if requestId != "" { if requestId != "" {
requestId = response.Header.Get("X-Msedge-Ref") requestId = response.Header.Get("X-Msedge-Ref")
@ -68,7 +92,10 @@ func (t *TimeSync) queryMultipleHttp(urls []string, timeout time.Duration) (now
cancel() cancel()
results <- now results <- now
} else if !errors.Is(err, context.Canceled) { } else if errors.Is(err, context.Canceled) {
metricHttpCancelCount.WithLabelValues(url).Inc()
metricHttpTotalCancelCount.Inc()
} else {
scopedLogger.Warn(). scopedLogger.Warn().
Str("error", err.Error()). Str("error", err.Error()).
Int("status", status). Int("status", status).

View File

@ -0,0 +1,147 @@
package timesync
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
var (
metricTimeSyncStatus = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "jetkvm_timesync_status",
Help: "The status of the timesync, 1 if successful, 0 if not",
},
)
metricTimeSyncCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "jetkvm_timesync_count",
Help: "The number of times the timesync has been run",
},
)
metricTimeSyncSuccessCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "jetkvm_timesync_success_count",
Help: "The number of times the timesync has been successful",
},
)
metricRTCUpdateCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "jetkvm_timesync_rtc_update_count",
Help: "The number of times the RTC has been updated",
},
)
metricNtpTotalSuccessCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "jetkvm_timesync_ntp_total_success_count",
Help: "The total number of successful NTP requests",
},
)
metricNtpTotalRequestCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "jetkvm_timesync_ntp_total_request_count",
Help: "The total number of NTP requests sent",
},
)
metricNtpSuccessCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "jetkvm_timesync_ntp_success_count",
Help: "The number of successful NTP requests",
},
[]string{"url"},
)
metricNtpRequestCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "jetkvm_timesync_ntp_request_count",
Help: "The number of NTP requests sent to the server",
},
[]string{"url"},
)
metricNtpServerLastRTT = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "jetkvm_timesync_ntp_server_last_rtt",
Help: "The last RTT of the NTP server in milliseconds",
},
[]string{"url"},
)
metricNtpServerRttHistogram = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "jetkvm_timesync_ntp_server_rtt",
Help: "The histogram of the RTT of the NTP server in milliseconds",
Buckets: []float64{
10, 25, 50, 100, 200, 300, 500, 1000,
},
},
[]string{"url"},
)
metricNtpServerInfo = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "jetkvm_timesync_ntp_server_info",
Help: "The info of the NTP server",
},
[]string{"url", "reference", "stratum", "precision"},
)
metricHttpTotalSuccessCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "jetkvm_timesync_http_total_success_count",
Help: "The total number of successful HTTP requests",
},
)
metricHttpTotalRequestCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "jetkvm_timesync_http_total_request_count",
Help: "The total number of HTTP requests sent",
},
)
metricHttpTotalCancelCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "jetkvm_timesync_http_total_cancel_count",
Help: "The total number of HTTP requests cancelled",
},
)
metricHttpSuccessCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "jetkvm_timesync_http_success_count",
Help: "The number of successful HTTP requests",
},
[]string{"url"},
)
metricHttpRequestCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "jetkvm_timesync_http_request_count",
Help: "The number of HTTP requests sent to the server",
},
[]string{"url"},
)
metricHttpCancelCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "jetkvm_timesync_http_cancel_count",
Help: "The number of HTTP requests cancelled",
},
[]string{"url"},
)
metricHttpServerLastRTT = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "jetkvm_timesync_http_server_last_rtt",
Help: "The last RTT of the HTTP server in milliseconds",
},
[]string{"url"},
)
metricHttpServerRttHistogram = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "jetkvm_timesync_http_server_rtt",
Help: "The histogram of the RTT of the HTTP server in milliseconds",
Buckets: []float64{
10, 25, 50, 100, 200, 300, 500, 1000,
},
},
[]string{"url"},
)
metricHttpServerInfo = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "jetkvm_timesync_http_server_info",
Help: "The info of the HTTP server",
},
[]string{"url", "http_code"},
)
)

View File

@ -2,12 +2,25 @@ package timesync
import ( import (
"math/rand/v2" "math/rand/v2"
"strconv"
"time" "time"
"github.com/beevik/ntp" "github.com/beevik/ntp"
) )
func (t *TimeSync) queryNetworkTime() (now *time.Time) { var defaultNTPServers = []string{
"time.apple.com",
"time.aws.com",
"time.windows.com",
"time.google.com",
"162.159.200.123", // time.cloudflare.com
"0.pool.ntp.org",
"1.pool.ntp.org",
"2.pool.ntp.org",
"3.pool.ntp.org",
}
func (t *TimeSync) queryNetworkTime() (now *time.Time, offset *time.Duration) {
chunkSize := 4 chunkSize := 4
ntpServers := t.ntpServers ntpServers := t.ntpServers
@ -16,27 +29,58 @@ func (t *TimeSync) queryNetworkTime() (now *time.Time) {
for i := 0; i < len(ntpServers); i += chunkSize { for i := 0; i < len(ntpServers); i += chunkSize {
chunk := ntpServers[i:min(i+chunkSize, len(ntpServers))] chunk := ntpServers[i:min(i+chunkSize, len(ntpServers))]
results := t.queryMultipleNTP(chunk, timeSyncTimeout) now, offset := t.queryMultipleNTP(chunk, timeSyncTimeout)
if results != nil { if now != nil {
return results return now, offset
} }
} }
return nil return nil, nil
} }
func (t *TimeSync) queryMultipleNTP(servers []string, timeout time.Duration) (now *time.Time) { type ntpResult struct {
results := make(chan *time.Time, len(servers)) now *time.Time
offset *time.Duration
}
func (t *TimeSync) queryMultipleNTP(servers []string, timeout time.Duration) (now *time.Time, offset *time.Duration) {
results := make(chan *ntpResult, len(servers))
for _, server := range servers { for _, server := range servers {
go func(server string) { go func(server string) {
scopedLogger := t.l.With(). scopedLogger := t.l.With().
Str("server", server). Str("server", server).
Logger() Logger()
// increase request count
metricNtpTotalRequestCount.Inc()
metricNtpRequestCount.WithLabelValues(server).Inc()
// query the server
now, err, response := queryNtpServer(server, timeout) now, err, response := queryNtpServer(server, timeout)
// set the last RTT
metricNtpServerLastRTT.WithLabelValues(
server,
).Set(float64(response.RTT.Milliseconds()))
// set the RTT histogram
metricNtpServerRttHistogram.WithLabelValues(
server,
).Observe(float64(response.RTT.Milliseconds()))
// set the server info
metricNtpServerInfo.WithLabelValues(
server,
response.ReferenceString(),
strconv.Itoa(int(response.Stratum)),
strconv.Itoa(int(response.Precision)),
).Set(1)
if err == nil { if err == nil {
// increase success count
metricNtpTotalSuccessCount.Inc()
metricNtpSuccessCount.WithLabelValues(server).Inc()
scopedLogger.Info(). scopedLogger.Info().
Str("time", now.Format(time.RFC3339)). Str("time", now.Format(time.RFC3339)).
Str("reference", response.ReferenceString()). Str("reference", response.ReferenceString()).
@ -44,7 +88,10 @@ func (t *TimeSync) queryMultipleNTP(servers []string, timeout time.Duration) (no
Str("clockOffset", response.ClockOffset.String()). Str("clockOffset", response.ClockOffset.String()).
Uint8("stratum", response.Stratum). Uint8("stratum", response.Stratum).
Msg("NTP server returned time") Msg("NTP server returned time")
results <- now results <- &ntpResult{
now: now,
offset: &response.ClockOffset,
}
} else { } else {
scopedLogger.Warn(). scopedLogger.Warn().
Str("error", err.Error()). Str("error", err.Error()).
@ -53,7 +100,8 @@ func (t *TimeSync) queryMultipleNTP(servers []string, timeout time.Duration) (no
}(server) }(server)
} }
return <-results result := <-results
return result.now, result.offset
} }
func queryNtpServer(server string, timeout time.Duration) (now *time.Time, err error, response *ntp.Response) { func queryNtpServer(server string, timeout time.Duration) (now *time.Time, err error, response *ntp.Response) {

View File

@ -99,5 +99,7 @@ func (t *TimeSync) setRtcTime(tu time.Time) error {
return fmt.Errorf("failed to set RTC time: %w", err) return fmt.Errorf("failed to set RTC time: %w", err)
} }
metricRTCUpdateCount.Inc()
return nil return nil
} }

View File

@ -7,6 +7,7 @@ import (
"sync" "sync"
"time" "time"
"github.com/jetkvm/kvm/internal/network"
"github.com/rs/zerolog" "github.com/rs/zerolog"
) )
@ -27,8 +28,9 @@ type TimeSync struct {
syncLock *sync.Mutex syncLock *sync.Mutex
l *zerolog.Logger l *zerolog.Logger
ntpServers []string ntpServers []string
httpUrls []string httpUrls []string
networkConfig *network.NetworkConfig
rtcDevicePath string rtcDevicePath string
rtcDevice *os.File rtcDevice *os.File
@ -39,27 +41,37 @@ type TimeSync struct {
preCheckFunc func() (bool, error) preCheckFunc func() (bool, error)
} }
func NewTimeSync( type TimeSyncOptions struct {
precheckFunc func() (bool, error), PreCheckFunc func() (bool, error)
ntpServers []string, Logger *zerolog.Logger
httpUrls []string, NetworkConfig *network.NetworkConfig
logger *zerolog.Logger, }
) *TimeSync {
type SyncMode struct {
Ntp bool
Http bool
Ordering []string
NtpUseFallback bool
HttpUseFallback bool
}
func NewTimeSync(opts *TimeSyncOptions) *TimeSync {
rtcDevice, err := getRtcDevicePath() rtcDevice, err := getRtcDevicePath()
if err != nil { if err != nil {
logger.Error().Err(err).Msg("failed to get RTC device path") opts.Logger.Error().Err(err).Msg("failed to get RTC device path")
} else { } else {
logger.Info().Str("path", rtcDevice).Msg("RTC device found") opts.Logger.Info().Str("path", rtcDevice).Msg("RTC device found")
} }
t := &TimeSync{ t := &TimeSync{
syncLock: &sync.Mutex{}, syncLock: &sync.Mutex{},
l: logger, l: opts.Logger,
rtcDevicePath: rtcDevice, rtcDevicePath: rtcDevice,
rtcLock: &sync.Mutex{}, rtcLock: &sync.Mutex{},
preCheckFunc: precheckFunc, preCheckFunc: opts.PreCheckFunc,
ntpServers: ntpServers, ntpServers: defaultNTPServers,
httpUrls: httpUrls, httpUrls: defaultHTTPUrls,
networkConfig: opts.NetworkConfig,
} }
if t.rtcDevicePath != "" { if t.rtcDevicePath != "" {
@ -70,7 +82,36 @@ func NewTimeSync(
return t return t
} }
func (t *TimeSync) getSyncMode() SyncMode {
syncMode := SyncMode{
NtpUseFallback: true,
HttpUseFallback: true,
}
var syncModeString string
if t.networkConfig != nil {
syncModeString = t.networkConfig.TimeSyncMode
if t.networkConfig.TimeSyncDisableFallback {
syncMode.NtpUseFallback = false
syncMode.HttpUseFallback = false
}
}
switch syncModeString {
case "ntp_only":
syncMode.Ntp = true
case "http_only":
syncMode.Http = true
default:
syncMode.Ntp = true
syncMode.Http = true
}
return syncMode
}
func (t *TimeSync) doTimeSync() { func (t *TimeSync) doTimeSync() {
metricTimeSyncStatus.Set(0)
for { for {
if ok, err := t.preCheckFunc(); !ok { if ok, err := t.preCheckFunc(); !ok {
if err != nil { if err != nil {
@ -101,14 +142,27 @@ func (t *TimeSync) doTimeSync() {
Str("time_taken", time.Since(start).String()). Str("time_taken", time.Since(start).String()).
Msg("time sync successful") Msg("time sync successful")
metricTimeSyncStatus.Set(1)
time.Sleep(timeSyncInterval) // after the first sync is done time.Sleep(timeSyncInterval) // after the first sync is done
} }
} }
func (t *TimeSync) Sync() error { func (t *TimeSync) Sync() error {
var now *time.Time var (
now = t.queryNetworkTime() now *time.Time
if now == nil { offset *time.Duration
)
syncMode := t.getSyncMode()
metricTimeSyncCount.Inc()
if syncMode.Ntp {
now, offset = t.queryNetworkTime()
}
if syncMode.Http && now == nil {
now = t.queryAllHttpTime() now = t.queryAllHttpTime()
} }
@ -116,11 +170,18 @@ func (t *TimeSync) Sync() error {
return fmt.Errorf("failed to get time from any source") return fmt.Errorf("failed to get time from any source")
} }
if offset != nil {
newNow := time.Now().Add(*offset)
now = &newNow
}
err := t.setSystemTime(*now) err := t.setSystemTime(*now)
if err != nil { if err != nil {
return fmt.Errorf("failed to set system time: %w", err) return fmt.Errorf("failed to set system time: %w", err)
} }
metricTimeSyncSuccessCount.Inc()
return nil return nil
} }

View File

@ -8,27 +8,7 @@ import (
) )
var ( var (
timeSync *timesync.TimeSync timeSync *timesync.TimeSync
defaultNTPServers = []string{
"time.apple.com",
"time.aws.com",
"time.windows.com",
"time.google.com",
"162.159.200.123", // time.cloudflare.com
"0.pool.ntp.org",
"1.pool.ntp.org",
"2.pool.ntp.org",
"3.pool.ntp.org",
}
defaultHTTPUrls = []string{
"http://www.gstatic.com/generate_204",
"http://cp.cloudflare.com/",
"http://edge-http.microsoft.com/captiveportal/generate_204",
// Firefox, Apple, and Microsoft have inconsistent results, so we don't use it
// "http://detectportal.firefox.com/",
// "http://www.apple.com/library/test/success.html",
// "http://www.msftconnecttest.com/connecttest.txt",
}
builtTimestamp string builtTimestamp string
) )
@ -60,15 +40,14 @@ func isTimeSyncNeeded() bool {
} }
func initTimeSync() { func initTimeSync() {
timeSync = timesync.NewTimeSync( timeSync = timesync.NewTimeSync(&timesync.TimeSyncOptions{
func() (bool, error) { Logger: timesyncLogger,
NetworkConfig: config.NetworkConfig,
PreCheckFunc: func() (bool, error) {
if !networkState.IsOnline() { if !networkState.IsOnline() {
return false, nil return false, nil
} }
return true, nil return true, nil
}, },
defaultNTPServers, })
defaultHTTPUrls,
timesyncLogger,
)
} }