package monitors import ( "sync" "time" "github.com/sirupsen/logrus" ) const DefaultInterval = time.Second * 60 const DefaultTimeout = time.Second const HistorySize = 10 type MonitorStatus string const ( MonitorStatusUp = "up" MonitorStatusDown = "down" MonitorStatusNotSaturated = "unsaturated" ) type backendValidateFunc = func(monitor *AbstractMonitor) []string type MonitorTestFunc func() (up bool, errs []error) type MonitorTickFunc func(monitor MonitorInterface, status MonitorStatus, errs []error, lag int64) type MonitorInterface interface { Start(MonitorTestFunc, *sync.WaitGroup, MonitorTickFunc, bool) Stop() tick(MonitorTestFunc) (status MonitorStatus, errors []error, lag int64) test() (bool, []error) Validate(validate backendValidateFunc) []string Describe() []string GetMonitor() *AbstractMonitor GetTestFunc() MonitorTestFunc GetLastStatus() MonitorStatus UpdateLastStatus(status MonitorStatus) (old MonitorStatus) } // AbstractMonitor data model type AbstractMonitor struct { Name string Target string // (default)http / dns Type string Strict bool Interval time.Duration Timeout time.Duration Params map[string]interface{} // Templating stuff Template MonitorTemplates // Threshold = percentage / number of down incidents Threshold float32 ThresholdCount bool `mapstructure:"threshold_count"` // lag / average(lagHistory) * 100 = percentage above average lag // PerformanceThreshold sets the % limit above which this monitor will trigger degraded-performance // PerformanceThreshold float32 history []bool lastStatus MonitorStatus // Closed when mon.Stop() is called stopC chan bool } func (mon *AbstractMonitor) Validate(validate backendValidateFunc) []string { errs := []string{} if len(mon.Name) == 0 { errs = append(errs, "Name is required") } if mon.Interval < 1 { mon.Interval = DefaultInterval } if mon.Timeout < 1 { mon.Timeout = DefaultTimeout } if mon.Timeout > mon.Interval { errs = append(errs, "Timeout greater than interval") } // get the backend to validate the monitor errs = append(errs, validate(mon)...) if mon.Threshold <= 0 { mon.Threshold = 100 } // if len(mon.Template.Fixed.Message) == 0 || len(mon.Template.Fixed.Subject) == 0 { // errs = append(errs, "\"fixed\" template empty/missing") // } // if len(mon.Template.Investigating.Message) == 0 || len(mon.Template.Investigating.Subject) == 0 { // errs = append(errs, "\"investigating\" template empty/missing") // } if err := mon.Template.Fixed.Compile(); err != nil { errs = append(errs, "Could not compile \"fixed\" template: "+err.Error()) } if err := mon.Template.Investigating.Compile(); err != nil { errs = append(errs, "Could not compile \"investigating\" template: "+err.Error()) } return errs } func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor { return mon } func (mon *AbstractMonitor) Describe() []string { features := []string{"Type: " + mon.Type} if len(mon.Name) > 0 { features = append(features, "Name: "+mon.Name) } return features } func (mon *AbstractMonitor) Start(testFunc MonitorTestFunc, wg *sync.WaitGroup, tickFunc MonitorTickFunc, immediate bool) { wg.Add(1) mon.stopC = make(chan bool) if immediate { status, errs, lag := mon.tick(testFunc) tickFunc(mon, status, errs, lag) } ticker := time.NewTicker(mon.Interval * time.Second) for { select { case <-ticker.C: status, errs, lag := mon.tick(testFunc) tickFunc(mon, status, errs, lag) case <-mon.stopC: wg.Done() return } } } func (mon *AbstractMonitor) Stop() { select { case <-mon.stopC: return default: close(mon.stopC) } } func (mon *AbstractMonitor) tick(testFunc MonitorTestFunc) (status MonitorStatus, errors []error, lag int64) { reqStart := getMs() up, errs := testFunc() lag = getMs() - reqStart histSize := HistorySize if mon.ThresholdCount { histSize = int(mon.Threshold) } if len(mon.history) == histSize-1 { logrus.WithFields(logrus.Fields{ "monitor": mon.Name, }).Warn("monitor saturated") } if len(mon.history) >= histSize { mon.history = mon.history[len(mon.history)-(histSize-1):] } mon.history = append(mon.history, up) status = mon.GetStatus() errors = errs return } // TODO: test // AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident func (mon *AbstractMonitor) GetStatus() MonitorStatus { numDown := 0 for _, wasUp := range mon.history { if wasUp == false { numDown++ } } t := (float32(numDown) / float32(len(mon.history))) * 100 logFields := logrus.Fields{"monitor": mon.Name} // stop reporting time for jsonformatter, it's there by default if _, ok := logrus.StandardLogger().Formatter.(*logrus.JSONFormatter); !ok { logFields["t"] = time.Now() } l := logrus.WithFields(logFields) symbol := "⚠️" if t == 100 { symbol = "❌" } if numDown == 0 { l.Printf("👍 up") } else if mon.ThresholdCount { l.Printf("%v down (%d/%d)", symbol, numDown, int(mon.Threshold)) } else { l.Printf("%v down %.0f%%/%.0f%%", symbol, t, mon.Threshold) } histSize := HistorySize if mon.ThresholdCount { histSize = int(mon.Threshold) } if len(mon.history) != histSize { // not saturated return MonitorStatusNotSaturated } var down bool if mon.ThresholdCount { down = numDown >= int(mon.Threshold) } else { down = t >= mon.Threshold } if !down { return MonitorStatusUp } return MonitorStatusDown } func (mon *AbstractMonitor) GetTestFunc() MonitorTestFunc { return mon.test } func (mon *AbstractMonitor) GetLastStatus() MonitorStatus { return mon.lastStatus } func (mon *AbstractMonitor) UpdateLastStatus(status MonitorStatus) (old MonitorStatus) { old = mon.lastStatus mon.lastStatus = status return } func (mon *AbstractMonitor) test() (bool, []error) { return false, nil } func getMs() int64 { return time.Now().UnixNano() / int64(time.Millisecond) }