huge refactor
- extendable backends - better project structure - better cli interface
This commit is contained in:
257
monitors/monitor.go
Normal file
257
monitors/monitor.go
Normal file
@@ -0,0 +1,257 @@
|
||||
package monitors
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Defaults applied by Validate, and the size of the rolling result window.
const (
	// DefaultInterval replaces a monitor's Interval when it is below 1.
	DefaultInterval = 60 * time.Second

	// DefaultTimeout replaces a monitor's Timeout when it is below 1.
	DefaultTimeout = time.Second

	// HistorySize is the number of test results kept per monitor, unless the
	// monitor uses ThresholdCount (then Threshold sets the window size).
	HistorySize = 10
)
|
||||
|
||||
// MonitorStatus is the state computed for a monitor from its result history.
type MonitorStatus string

// Status values returned by GetStatus. They are untyped string constants, so
// they convert implicitly to MonitorStatus as well as to plain strings.
const (
	// MonitorStatusUp means the down threshold has not been reached.
	MonitorStatusUp = "up"

	// MonitorStatusDown means the down threshold has been reached.
	MonitorStatusDown = "down"

	// MonitorStatusNotSaturated means the history window is not full yet.
	MonitorStatusNotSaturated = "unsaturated"
)
|
||||
|
||||
type backendValidateFunc = func(monitor *AbstractMonitor) []string
|
||||
type MonitorTestFunc func() (up bool, errs []error)
|
||||
type MonitorTickFunc func(monitor MonitorInterface, status MonitorStatus, errs []error, lag int64)
|
||||
|
||||
type MonitorInterface interface {
|
||||
Start(MonitorTestFunc, *sync.WaitGroup, MonitorTickFunc, bool)
|
||||
Stop()
|
||||
|
||||
tick(MonitorTestFunc) (status MonitorStatus, errors []error, lag int64)
|
||||
test() (bool, []error)
|
||||
|
||||
Validate(validate backendValidateFunc) []string
|
||||
Describe() []string
|
||||
|
||||
GetMonitor() *AbstractMonitor
|
||||
GetTestFunc() MonitorTestFunc
|
||||
GetLastStatus() MonitorStatus
|
||||
UpdateLastStatus(status MonitorStatus) (old MonitorStatus)
|
||||
}
|
||||
|
||||
// AbstractMonitor data model
|
||||
type AbstractMonitor struct {
|
||||
Name string
|
||||
Target string
|
||||
|
||||
// (default)http / dns
|
||||
Type string
|
||||
Strict bool
|
||||
|
||||
Interval time.Duration
|
||||
Timeout time.Duration
|
||||
Params map[string]interface{}
|
||||
|
||||
// Templating stuff
|
||||
Template MonitorTemplates
|
||||
|
||||
// Threshold = percentage / number of down incidents
|
||||
Threshold float32
|
||||
ThresholdCount bool `mapstructure:"threshold_count"`
|
||||
|
||||
// lag / average(lagHistory) * 100 = percentage above average lag
|
||||
// PerformanceThreshold sets the % limit above which this monitor will trigger degraded-performance
|
||||
// PerformanceThreshold float32
|
||||
|
||||
history []bool
|
||||
lastStatus MonitorStatus
|
||||
|
||||
// Closed when mon.Stop() is called
|
||||
stopC chan bool
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) Validate(validate backendValidateFunc) []string {
|
||||
errs := []string{}
|
||||
|
||||
if len(mon.Name) == 0 {
|
||||
errs = append(errs, "Name is required")
|
||||
}
|
||||
|
||||
if mon.Interval < 1 {
|
||||
mon.Interval = DefaultInterval
|
||||
}
|
||||
if mon.Timeout < 1 {
|
||||
mon.Timeout = DefaultTimeout
|
||||
}
|
||||
|
||||
if mon.Timeout > mon.Interval {
|
||||
errs = append(errs, "Timeout greater than interval")
|
||||
}
|
||||
|
||||
// get the backend to validate the monitor
|
||||
errs = append(errs, validate(mon)...)
|
||||
|
||||
if mon.Threshold <= 0 {
|
||||
mon.Threshold = 100
|
||||
}
|
||||
|
||||
// if len(mon.Template.Fixed.Message) == 0 || len(mon.Template.Fixed.Subject) == 0 {
|
||||
// errs = append(errs, "\"fixed\" template empty/missing")
|
||||
// }
|
||||
// if len(mon.Template.Investigating.Message) == 0 || len(mon.Template.Investigating.Subject) == 0 {
|
||||
// errs = append(errs, "\"investigating\" template empty/missing")
|
||||
// }
|
||||
if err := mon.Template.Fixed.Compile(); err != nil {
|
||||
errs = append(errs, "Could not compile \"fixed\" template: "+err.Error())
|
||||
}
|
||||
if err := mon.Template.Investigating.Compile(); err != nil {
|
||||
errs = append(errs, "Could not compile \"investigating\" template: "+err.Error())
|
||||
}
|
||||
|
||||
return errs
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor {
|
||||
return mon
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) Describe() []string {
|
||||
features := []string{"Type: " + mon.Type}
|
||||
|
||||
if len(mon.Name) > 0 {
|
||||
features = append(features, "Name: "+mon.Name)
|
||||
}
|
||||
|
||||
return features
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) Start(testFunc MonitorTestFunc, wg *sync.WaitGroup, tickFunc MonitorTickFunc, immediate bool) {
|
||||
wg.Add(1)
|
||||
|
||||
mon.stopC = make(chan bool)
|
||||
if immediate {
|
||||
status, errs, lag := mon.tick(testFunc)
|
||||
tickFunc(mon, status, errs, lag)
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(mon.Interval * time.Second)
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
status, errs, lag := mon.tick(testFunc)
|
||||
tickFunc(mon, status, errs, lag)
|
||||
case <-mon.stopC:
|
||||
wg.Done()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) Stop() {
|
||||
select {
|
||||
case <-mon.stopC:
|
||||
return
|
||||
default:
|
||||
close(mon.stopC)
|
||||
}
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) tick(testFunc MonitorTestFunc) (status MonitorStatus, errors []error, lag int64) {
|
||||
reqStart := getMs()
|
||||
up, errs := testFunc()
|
||||
lag = getMs() - reqStart
|
||||
|
||||
histSize := HistorySize
|
||||
if mon.ThresholdCount {
|
||||
histSize = int(mon.Threshold)
|
||||
}
|
||||
|
||||
if len(mon.history) == histSize-1 {
|
||||
logrus.WithFields(logrus.Fields{
|
||||
"monitor": mon.Name,
|
||||
}).Warn("monitor saturated")
|
||||
}
|
||||
if len(mon.history) >= histSize {
|
||||
mon.history = mon.history[len(mon.history)-(histSize-1):]
|
||||
}
|
||||
mon.history = append(mon.history, up)
|
||||
status = mon.GetStatus()
|
||||
errors = errs
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// TODO: test
|
||||
// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident
|
||||
func (mon *AbstractMonitor) GetStatus() MonitorStatus {
|
||||
numDown := 0
|
||||
for _, wasUp := range mon.history {
|
||||
if wasUp == false {
|
||||
numDown++
|
||||
}
|
||||
}
|
||||
|
||||
t := (float32(numDown) / float32(len(mon.history))) * 100
|
||||
logFields := logrus.Fields{"monitor": mon.Name}
|
||||
// stop reporting time for jsonformatter, it's there by default
|
||||
if _, ok := logrus.StandardLogger().Formatter.(*logrus.JSONFormatter); !ok {
|
||||
logFields["t"] = time.Now()
|
||||
}
|
||||
l := logrus.WithFields(logFields)
|
||||
|
||||
symbol := "⚠️"
|
||||
if t == 100 {
|
||||
symbol = "❌"
|
||||
}
|
||||
if numDown == 0 {
|
||||
l.Printf("👍 up")
|
||||
} else if mon.ThresholdCount {
|
||||
l.Printf("%v down (%d/%d)", symbol, numDown, int(mon.Threshold))
|
||||
} else {
|
||||
l.Printf("%v down %.0f%%/%.0f%%", symbol, t, mon.Threshold)
|
||||
}
|
||||
|
||||
histSize := HistorySize
|
||||
if mon.ThresholdCount {
|
||||
histSize = int(mon.Threshold)
|
||||
}
|
||||
|
||||
if len(mon.history) != histSize {
|
||||
// not saturated
|
||||
return MonitorStatusNotSaturated
|
||||
}
|
||||
|
||||
var down bool
|
||||
if mon.ThresholdCount {
|
||||
down = numDown >= int(mon.Threshold)
|
||||
} else {
|
||||
down = t >= mon.Threshold
|
||||
}
|
||||
|
||||
if !down {
|
||||
return MonitorStatusUp
|
||||
}
|
||||
|
||||
return MonitorStatusDown
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) GetTestFunc() MonitorTestFunc {
|
||||
return mon.test
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) GetLastStatus() MonitorStatus {
|
||||
return mon.lastStatus
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) UpdateLastStatus(status MonitorStatus) (old MonitorStatus) {
|
||||
old = mon.lastStatus
|
||||
mon.lastStatus = status
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// test is the placeholder check of the abstract monitor: it always reports
// "not up" and no errors.
func (mon *AbstractMonitor) test() (bool, []error) {
	var errs []error
	return false, errs
}
|
||||
|
||||
// getMs returns the current wall-clock time as milliseconds since the Unix
// epoch.
func getMs() int64 {
	nanos := time.Now().UnixNano()
	return nanos / time.Millisecond.Nanoseconds()
}
|
||||
Reference in New Issue
Block a user