271 lines
5.9 KiB
Go
271 lines
5.9 KiB
Go
package cachet
|
|
|
|
import (
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/sirupsen/logrus"
|
|
)
|
|
|
|
const DefaultInterval = time.Second * 60
|
|
const DefaultTimeout = time.Second
|
|
const DefaultTimeFormat = "15:04:05 Jan 2 MST"
|
|
const HistorySize = 10
|
|
|
|
type MonitorInterface interface {
|
|
ClockStart(*CachetMonitor, MonitorInterface, *sync.WaitGroup)
|
|
ClockStop()
|
|
tick(MonitorInterface)
|
|
test() bool
|
|
|
|
Validate() []string
|
|
GetMonitor() *AbstractMonitor
|
|
Describe() []string
|
|
}
|
|
|
|
// AbstractMonitor data model
|
|
type AbstractMonitor struct {
|
|
Name string
|
|
Target string
|
|
Active bool
|
|
|
|
// (default)http / dns
|
|
Type string
|
|
Strict bool
|
|
|
|
Interval time.Duration
|
|
Timeout time.Duration
|
|
|
|
MetricID int `mapstructure:"metric_id"`
|
|
ComponentID int `mapstructure:"component_id"`
|
|
|
|
// Templating stuff
|
|
Template struct {
|
|
Investigating MessageTemplate
|
|
Fixed MessageTemplate
|
|
}
|
|
|
|
// Threshold = percentage / number of down incidents
|
|
Threshold float32
|
|
ThresholdCount bool `mapstructure:"threshold_count"`
|
|
|
|
// lag / average(lagHistory) * 100 = percentage above average lag
|
|
// PerformanceThreshold sets the % limit above which this monitor will trigger degraded-performance
|
|
// PerformanceThreshold float32
|
|
|
|
history []bool
|
|
// lagHistory []float32
|
|
lastFailReason string
|
|
incident *Incident
|
|
config *CachetMonitor
|
|
|
|
// Closed when mon.Stop() is called
|
|
stopC chan bool
|
|
}
|
|
|
|
func (mon *AbstractMonitor) Validate() []string {
|
|
errs := []string{}
|
|
|
|
if len(mon.Name) == 0 {
|
|
errs = append(errs, "Name is required")
|
|
}
|
|
|
|
if mon.Interval < 1 {
|
|
mon.Interval = DefaultInterval
|
|
}
|
|
if mon.Timeout < 1 {
|
|
mon.Timeout = DefaultTimeout
|
|
}
|
|
|
|
if mon.Timeout > mon.Interval {
|
|
errs = append(errs, "Timeout greater than interval")
|
|
}
|
|
|
|
if mon.ComponentID == 0 && mon.MetricID == 0 {
|
|
errs = append(errs, "component_id & metric_id are unset")
|
|
}
|
|
|
|
if mon.Threshold <= 0 {
|
|
mon.Threshold = 100
|
|
}
|
|
|
|
if err := mon.Template.Fixed.Compile(); err != nil {
|
|
errs = append(errs, "Could not compile \"fixed\" template: "+err.Error())
|
|
}
|
|
if err := mon.Template.Investigating.Compile(); err != nil {
|
|
errs = append(errs, "Could not compile \"investigating\" template: "+err.Error())
|
|
}
|
|
|
|
return errs
|
|
}
|
|
func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor {
|
|
return mon
|
|
}
|
|
func (mon *AbstractMonitor) Describe() []string {
|
|
features := []string{"Type: " + mon.Type}
|
|
|
|
if len(mon.Name) > 0 {
|
|
features = append(features, "Name: "+mon.Name)
|
|
}
|
|
|
|
if !mon.Active {
|
|
features = append(features, "Active: No")
|
|
}
|
|
|
|
return features
|
|
}
|
|
|
|
func (mon *AbstractMonitor) ClockStart(cfg *CachetMonitor, iface MonitorInterface, wg *sync.WaitGroup) {
|
|
if !mon.Active {
|
|
return
|
|
};
|
|
wg.Add(1)
|
|
mon.config = cfg
|
|
mon.stopC = make(chan bool)
|
|
if cfg.Immediate {
|
|
mon.tick(iface)
|
|
}
|
|
|
|
if cfg.Restarted {
|
|
initialIncident, err := mon.Get(cfg)
|
|
if err != nil {
|
|
logrus.Warn("could not fetch initial incident: %v", err)
|
|
}
|
|
if initialIncident != nil {
|
|
mon.incident = initialIncident
|
|
}
|
|
}
|
|
|
|
ticker := time.NewTicker(mon.Interval * time.Second)
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
mon.tick(iface)
|
|
case <-mon.stopC:
|
|
wg.Done()
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (mon *AbstractMonitor) ClockStop() {
|
|
select {
|
|
case <-mon.stopC:
|
|
return
|
|
default:
|
|
close(mon.stopC)
|
|
}
|
|
}
|
|
|
|
func (mon *AbstractMonitor) test() bool { return false }
|
|
|
|
func (mon *AbstractMonitor) tick(iface MonitorInterface) {
|
|
reqStart := getMs()
|
|
up := iface.test()
|
|
lag := getMs() - reqStart
|
|
|
|
histSize := HistorySize
|
|
if mon.ThresholdCount {
|
|
histSize = int(mon.Threshold)
|
|
}
|
|
|
|
if len(mon.history) == histSize-1 {
|
|
logrus.Warnf("%v is now saturated", mon.Name)
|
|
}
|
|
if len(mon.history) >= histSize {
|
|
mon.history = mon.history[len(mon.history)-(histSize-1):]
|
|
}
|
|
mon.history = append(mon.history, up)
|
|
mon.AnalyseData()
|
|
|
|
// report lag
|
|
if mon.MetricID > 0 {
|
|
go mon.config.API.SendMetric(mon.MetricID, lag)
|
|
}
|
|
}
|
|
|
|
// TODO: test
|
|
// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident
|
|
func (mon *AbstractMonitor) AnalyseData() {
|
|
// look at the past few incidents
|
|
numDown := 0
|
|
for _, wasUp := range mon.history {
|
|
if wasUp == false {
|
|
numDown++
|
|
}
|
|
}
|
|
|
|
t := (float32(numDown) / float32(len(mon.history))) * 100
|
|
l := logrus.WithFields(logrus.Fields{
|
|
"monitor": mon.Name,
|
|
"time": time.Now().Format(mon.config.DateFormat),
|
|
})
|
|
if numDown == 0 {
|
|
l.Printf("monitor is up")
|
|
} else if mon.ThresholdCount {
|
|
l.Printf("monitor down %d/%d", numDown, int(mon.Threshold))
|
|
} else {
|
|
l.Printf("monitor down %.2f%%/%.2f%%", t, mon.Threshold)
|
|
}
|
|
|
|
histSize := HistorySize
|
|
if mon.ThresholdCount {
|
|
histSize = int(mon.Threshold)
|
|
}
|
|
|
|
if len(mon.history) != histSize {
|
|
// not saturated
|
|
return
|
|
}
|
|
|
|
triggered := (mon.ThresholdCount && numDown == int(mon.Threshold)) || (!mon.ThresholdCount && t > mon.Threshold)
|
|
|
|
if triggered && mon.incident == nil {
|
|
// create incident
|
|
tplData := getTemplateData(mon)
|
|
tplData["FailReason"] = mon.lastFailReason
|
|
|
|
subject, message := mon.Template.Investigating.Exec(tplData)
|
|
mon.incident = &Incident{
|
|
Name: subject,
|
|
ComponentID: mon.ComponentID,
|
|
Message: message,
|
|
Notify: true,
|
|
}
|
|
|
|
// is down, create an incident
|
|
l.Warnf("creating incident. Monitor is down: %v", mon.lastFailReason)
|
|
// set investigating status
|
|
mon.incident.SetInvestigating()
|
|
// create/update incident
|
|
if err := mon.incident.Send(mon.config); err != nil {
|
|
l.Printf("Error sending incident: %v", err)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// still triggered or no incident
|
|
if triggered || mon.incident == nil {
|
|
return
|
|
}
|
|
|
|
// was down, created an incident, its now ok, make it resolved.
|
|
l.Warn("Resolving incident")
|
|
|
|
// resolve incident
|
|
tplData := getTemplateData(mon)
|
|
tplData["incident"] = mon.incident
|
|
|
|
subject, message := mon.Template.Fixed.Exec(tplData)
|
|
mon.incident.Name = subject
|
|
mon.incident.Message = message
|
|
mon.incident.SetFixed()
|
|
if err := mon.incident.Send(mon.config); err != nil {
|
|
l.Printf("Error sending incident: %v", err)
|
|
}
|
|
|
|
mon.lastFailReason = ""
|
|
mon.incident = nil
|
|
}
|