huge refactor

- extendable backends - better project structure - better cli interface
2019-02-20 11:14:45 +08:00
parent df31238a1f
commit 162d55b3f3
17 changed files with 946 additions and 705 deletions
--- a/monitors/monitor.go
+++ b/monitors/monitor.go
@@ -0,0 +1,257 @@
+package monitors
+
+import (
+	"sync"
+	"time"
+
+	"github.com/sirupsen/logrus"
+)
+
+// Defaults and sizing shared by all monitors.
+const (
+	// DefaultInterval replaces a monitor's Interval when Validate finds it
+	// below 1.
+	DefaultInterval = 60 * time.Second
+
+	// DefaultTimeout replaces a monitor's Timeout when Validate finds it
+	// below 1.
+	DefaultTimeout = time.Second
+
+	// HistorySize is the number of check results kept per monitor, unless the
+	// monitor sets ThresholdCount, in which case Threshold decides the size.
+	HistorySize = 10
+)
+
+// MonitorStatus is the verdict a monitor reports after evaluating its history.
+type MonitorStatus string
+
+// Status values returned by GetStatus. They are declared as untyped string
+// constants, so they convert implicitly to MonitorStatus as well as to string.
+const (
+	// MonitorStatusUp: the history is full and the failure threshold is not reached.
+	MonitorStatusUp           = "up"
+	// MonitorStatusDown: the history is full and the failure threshold is reached.
+	MonitorStatusDown         = "down"
+	// MonitorStatusNotSaturated: the history does not yet hold the expected
+	// number of results, so no verdict is given.
+	MonitorStatusNotSaturated = "unsaturated"
+)
+
+// backendValidateFunc is the backend-specific validation hook handed to
+// Validate; the strings it returns are appended to the validation errors.
+// It is a type alias, not a distinct type.
+type backendValidateFunc = func(monitor *AbstractMonitor) []string
+
+// MonitorTestFunc performs a single check and reports whether the target was
+// up, together with any errors encountered.
+type MonitorTestFunc func() (up bool, errs []error)
+
+// MonitorTickFunc is invoked by Start after every check with the monitor, the
+// status derived from its history, the errors of the check and the measured
+// lag in milliseconds.
+type MonitorTickFunc func(monitor MonitorInterface, status MonitorStatus, errs []error, lag int64)
+
+// MonitorInterface is the behaviour shared by all monitor types.
+// AbstractMonitor provides a base implementation of every method.
+type MonitorInterface interface {
+	// Start runs the check loop; Stop ends it.
+	Start(MonitorTestFunc, *sync.WaitGroup, MonitorTickFunc, bool)
+	Stop()
+
+	// tick runs one check and returns the resulting status, the check's
+	// errors and the lag in milliseconds. test is the monitor's own check.
+	// Both are unexported, so only types in this package (or types embedding
+	// one) can satisfy the interface.
+	tick(MonitorTestFunc) (status MonitorStatus, errors []error, lag int64)
+	test() (bool, []error)
+
+	// Validate returns configuration problems; Describe returns
+	// human-readable facts about the monitor.
+	Validate(validate backendValidateFunc) []string
+	Describe() []string
+
+	// Accessors for the underlying AbstractMonitor, its check function and
+	// the last stored status. UpdateLastStatus returns the previous value.
+	GetMonitor() *AbstractMonitor
+	GetTestFunc() MonitorTestFunc
+	GetLastStatus() MonitorStatus
+	UpdateLastStatus(status MonitorStatus) (old MonitorStatus)
+}
+
+// AbstractMonitor is the data model and base behaviour shared by monitors:
+// configuration fields plus the check history and last reported status.
+type AbstractMonitor struct {
+	// Name identifies the monitor in logs; Validate rejects an empty name.
+	Name   string
+	// Target is not interpreted in this file.
+	// NOTE(review): presumably the address the backend checks — confirm.
+	Target string
+
+	// Type of monitor: "http" (noted as the default) or "dns".
+	Type   string
+	// Strict is not used in this file.
+	Strict bool
+
+	// Interval between checks and Timeout of a check. Start multiplies
+	// Interval by time.Second, so configured values look like plain second
+	// counts, while Validate's defaults (DefaultInterval, DefaultTimeout) are
+	// real durations.
+	// NOTE(review): the units are inconsistent — confirm the intended one.
+	Interval time.Duration
+	Timeout  time.Duration
+	// Params is not interpreted in this file.
+	// NOTE(review): presumably type-specific settings — confirm.
+	Params   map[string]interface{}
+
+	// Templating stuff: Validate compiles the Fixed and Investigating templates.
+	Template MonitorTemplates
+
+	// Threshold is the percentage of failed checks in the history that marks
+	// the monitor as down. When ThresholdCount is set it is instead a number
+	// of checks: it sizes the history and is the failure count that marks the
+	// monitor as down. Validate replaces values of 0 or less with 100.
+	Threshold      float32
+	ThresholdCount bool `mapstructure:"threshold_count"`
+
+	// lag / average(lagHistory) * 100 = percentage above average lag
+	// PerformanceThreshold sets the % limit above which this monitor will trigger degraded-performance
+	// PerformanceThreshold float32
+
+	// history holds the most recent check results (true = up), oldest first.
+	history    []bool
+	// lastStatus is whatever was last stored through UpdateLastStatus.
+	lastStatus MonitorStatus
+
+	// Closed when mon.Stop() is called; created by Start.
+	stopC chan bool
+}
+
+// Validate applies defaults to the monitor and returns the list of
+// configuration problems found (empty when the monitor is valid).
+//
+// Side effects: an Interval or Timeout below 1 is replaced with
+// DefaultInterval / DefaultTimeout, a Threshold of 0 or less becomes 100, and
+// the "fixed" and "investigating" templates are compiled.
+//
+// validate is the backend-specific check; the messages it returns are
+// appended to the result. A nil validate is skipped.
+func (mon *AbstractMonitor) Validate(validate backendValidateFunc) []string {
+	errs := []string{}
+
+	if len(mon.Name) == 0 {
+		errs = append(errs, "Name is required")
+	}
+
+	if mon.Interval < 1 {
+		mon.Interval = DefaultInterval
+	}
+	if mon.Timeout < 1 {
+		mon.Timeout = DefaultTimeout
+	}
+
+	// Start multiplies Interval by time.Second, so configured values are bare
+	// second counts, whereas the defaults applied above are real durations.
+	// Comparing the raw numbers would reject "interval: 30" with a defaulted
+	// timeout, so both sides are brought to real durations first.
+	// NOTE(review): assumes any value below one second is a second count —
+	// confirm against the config loader.
+	normalize := func(d time.Duration) time.Duration {
+		if d < time.Second {
+			return d * time.Second
+		}
+		return d
+	}
+	if normalize(mon.Timeout) > normalize(mon.Interval) {
+		errs = append(errs, "Timeout greater than interval")
+	}
+
+	// get the backend to validate the monitor
+	if validate != nil {
+		errs = append(errs, validate(mon)...)
+	}
+
+	if mon.Threshold <= 0 {
+		mon.Threshold = 100
+	}
+	// In count mode the threshold sizes the history window; a window below
+	// one entry cannot hold any result and makes tick slice out of range.
+	if mon.ThresholdCount && mon.Threshold < 1 {
+		errs = append(errs, "Threshold must be at least 1 when threshold_count is set")
+	}
+
+	// if len(mon.Template.Fixed.Message) == 0 || len(mon.Template.Fixed.Subject) == 0 {
+	// 	errs = append(errs, "\"fixed\" template empty/missing")
+	// }
+	// if len(mon.Template.Investigating.Message) == 0 || len(mon.Template.Investigating.Subject) == 0 {
+	// 	errs = append(errs, "\"investigating\" template empty/missing")
+	// }
+	if err := mon.Template.Fixed.Compile(); err != nil {
+		errs = append(errs, "Could not compile \"fixed\" template: "+err.Error())
+	}
+	if err := mon.Template.Investigating.Compile(); err != nil {
+		errs = append(errs, "Could not compile \"investigating\" template: "+err.Error())
+	}
+
+	return errs
+}
+
+// GetMonitor returns the receiver itself, giving code that only holds a
+// MonitorInterface access to the underlying AbstractMonitor.
+func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor {
+	return mon
+}
+
+// Describe returns human-readable facts about the monitor: always its type,
+// followed by its name when one is set.
+func (mon *AbstractMonitor) Describe() []string {
+	description := make([]string, 0, 2)
+	description = append(description, "Type: "+mon.Type)
+
+	if mon.Name != "" {
+		description = append(description, "Name: "+mon.Name)
+	}
+
+	return description
+}
+
+// Start runs the monitor's check loop and blocks until Stop is called.
+//
+// testFunc performs one check and tickFunc receives the outcome of every
+// check. When immediate is true one check runs right away instead of waiting
+// for the first interval. wg is incremented on entry and released when the
+// loop exits.
+//
+// NOTE(review): wg.Add happens inside Start, so a caller that launches Start
+// in a goroutine and calls wg.Wait straight away may not see the increment —
+// confirm against the callers.
+func (mon *AbstractMonitor) Start(testFunc MonitorTestFunc, wg *sync.WaitGroup, tickFunc MonitorTickFunc, immediate bool) {
+	wg.Add(1)
+	defer wg.Done()
+
+	mon.stopC = make(chan bool)
+	if immediate {
+		status, errs, lag := mon.tick(testFunc)
+		tickFunc(mon, status, errs, lag)
+	}
+
+	// NewTicker panics on a non-positive interval, so fall back to the default
+	// when Validate has not filled one in.
+	interval := mon.Interval
+	if interval < 1 {
+		interval = DefaultInterval
+	}
+	// Configured intervals are bare second counts and need scaling, but
+	// DefaultInterval is already a real duration: scaling it again overflows
+	// int64 and yields a ticker that effectively never fires.
+	// NOTE(review): assumes any value below one second is a second count —
+	// confirm against the config loader.
+	if interval < time.Second {
+		interval *= time.Second
+	}
+
+	ticker := time.NewTicker(interval)
+	// Release the ticker's resources once the monitor is stopped.
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ticker.C:
+			status, errs, lag := mon.tick(testFunc)
+			tickFunc(mon, status, errs, lag)
+		case <-mon.stopC:
+			return
+		}
+	}
+}
+
+// Stop ends the loop started by Start by closing the stop channel. It is a
+// no-op when Start has not run yet or when the monitor was already stopped.
+func (mon *AbstractMonitor) Stop() {
+	// Start creates the channel; before that it is nil and closing a nil
+	// channel panics.
+	if mon.stopC == nil {
+		return
+	}
+
+	select {
+	case <-mon.stopC:
+		// Already closed: a receive on a closed channel returns immediately.
+		return
+	default:
+		close(mon.stopC)
+	}
+}
+
+// tick runs a single check through testFunc, records its result in the
+// history and returns the status derived from that history, the errors
+// reported by the check and the time the check took in milliseconds.
+func (mon *AbstractMonitor) tick(testFunc MonitorTestFunc) (status MonitorStatus, errors []error, lag int64) {
+	reqStart := getMs()
+	up, errs := testFunc()
+	lag = getMs() - reqStart
+
+	histSize := HistorySize
+	if mon.ThresholdCount {
+		histSize = int(mon.Threshold)
+	}
+	// A count threshold below 1 truncates to a window of 0, which would make
+	// the trim below slice past the end of the history and panic. Keep at
+	// least the latest result.
+	if histSize < 1 {
+		histSize = 1
+	}
+
+	// The history is about to reach its full size with this result.
+	if len(mon.history) == histSize-1 {
+		logrus.WithFields(logrus.Fields{
+			"monitor": mon.Name,
+		}).Warn("monitor saturated")
+	}
+	// Drop the oldest entries so that, after the append, exactly histSize remain.
+	if len(mon.history) >= histSize {
+		mon.history = mon.history[len(mon.history)-(histSize-1):]
+	}
+	mon.history = append(mon.history, up)
+
+	return mon.GetStatus(), errs, lag
+}
+
+// GetStatus derives the monitor's status from its recorded history and logs a
+// one-line summary on every call.
+//
+// It returns MonitorStatusNotSaturated until the history holds exactly the
+// expected number of results (HistorySize, or Threshold when ThresholdCount
+// is set). After that the monitor is down when the number of failed checks
+// (count mode) or their percentage reaches Threshold, and up otherwise.
+//
+// TODO: test
+func (mon *AbstractMonitor) GetStatus() MonitorStatus {
+	failures := 0
+	for _, ok := range mon.history {
+		if !ok {
+			failures++
+		}
+	}
+	downPct := float32(failures) / float32(len(mon.history)) * 100
+
+	fields := logrus.Fields{"monitor": mon.Name}
+	// The JSON formatter already emits a timestamp, so only add one for the
+	// other formatters.
+	if _, isJSON := logrus.StandardLogger().Formatter.(*logrus.JSONFormatter); !isJSON {
+		fields["t"] = time.Now()
+	}
+	entry := logrus.WithFields(fields)
+
+	// Every recorded check failed: use the stronger marker.
+	icon := "⚠️"
+	if downPct == 100 {
+		icon = "❌"
+	}
+
+	switch {
+	case failures == 0:
+		entry.Printf("👍 up")
+	case mon.ThresholdCount:
+		entry.Printf("%v down (%d/%d)", icon, failures, int(mon.Threshold))
+	default:
+		entry.Printf("%v down %.0f%%/%.0f%%", icon, downPct, mon.Threshold)
+	}
+
+	window := HistorySize
+	if mon.ThresholdCount {
+		window = int(mon.Threshold)
+	}
+	if len(mon.history) != window {
+		// not saturated
+		return MonitorStatusNotSaturated
+	}
+
+	var isDown bool
+	if mon.ThresholdCount {
+		// In count mode the window equals int(Threshold).
+		isDown = failures >= window
+	} else {
+		isDown = downPct >= mon.Threshold
+	}
+
+	if isDown {
+		return MonitorStatusDown
+	}
+	return MonitorStatusUp
+}
+
+// GetTestFunc returns the monitor's own test method as a MonitorTestFunc.
+// For a bare AbstractMonitor that is the placeholder test, which always
+// reports down.
+func (mon *AbstractMonitor) GetTestFunc() MonitorTestFunc {
+	return mon.test
+}
+
+// GetLastStatus returns the status most recently stored with
+// UpdateLastStatus (the zero value "" if none was stored).
+func (mon *AbstractMonitor) GetLastStatus() MonitorStatus {
+	return mon.lastStatus
+}
+
+// UpdateLastStatus stores status as the monitor's last status and returns the
+// value it replaces.
+func (mon *AbstractMonitor) UpdateLastStatus(status MonitorStatus) (old MonitorStatus) {
+	// The right-hand side is evaluated first, so old receives the previous value.
+	old, mon.lastStatus = mon.lastStatus, status
+	return old
+}
+
+// test is the placeholder check of the base monitor: it always reports down
+// with no errors.
+// NOTE(review): presumably concrete monitor types supply their own check — confirm.
+func (mon *AbstractMonitor) test() (bool, []error) { return false, nil }
+
+func getMs() int64 {
+	return time.Now().UnixNano() / int64(time.Millisecond)
+}