diff --git a/api.go b/api.go index a8da867..c6787f8 100644 --- a/api.go +++ b/api.go @@ -37,7 +37,7 @@ func (api CachetAPI) Ping() error { // SendMetric adds a data point to a cachet monitor func (api CachetAPI) SendMetric(id int, lag int64) { - logrus.Debugf("Sending lag metric ID:%d %vms", id, lag) + logrus.Debugf("Sending lag metric ID:%d RTT %vms", id, lag) jsonBytes, _ := json.Marshal(map[string]interface{}{ "value": lag, diff --git a/config.go b/config.go index 382c621..5d0e27e 100644 --- a/config.go +++ b/config.go @@ -10,13 +10,13 @@ import ( ) type CachetMonitor struct { - SystemName string `json:"system_name"` + SystemName string `json:"system_name" yaml:"system_name"` + DateFormat string `json:"date_format" yaml:"date_format"` API CachetAPI `json:"api"` RawMonitors []map[string]interface{} `json:"monitors" yaml:"monitors"` - Monitors []MonitorInterface `json:"-" yaml:"-"` - - Immediate bool `json:"-" yaml:"-"` + Monitors []MonitorInterface `json:"-" yaml:"-"` + Immediate bool `json:"-" yaml:"-"` } // Validate configuration @@ -28,6 +28,10 @@ func (cfg *CachetMonitor) Validate() bool { cfg.SystemName = getHostname() } + if len(cfg.DateFormat) == 0 { + cfg.DateFormat = DefaultTimeFormat + } + if len(cfg.API.Token) == 0 || len(cfg.API.URL) == 0 { logrus.Warnf("API URL or API Token missing.\nGet help at https://github.com/castawaylabs/cachet-monitor") valid = false @@ -74,3 +78,11 @@ func GetMonitorType(t string) string { return t } + +func getTemplateData(monitor *AbstractMonitor) map[string]interface{} { + return map[string]interface{}{ + "SystemName": monitor.config.SystemName, + "API": monitor.config.API, + "Monitor": monitor, + } +} diff --git a/http.go b/http.go index 746693b..eaa0ad3 100644 --- a/http.go +++ b/http.go @@ -8,11 +8,13 @@ import ( "strconv" "strings" "time" + + "github.com/Sirupsen/logrus" ) // Investigating template var defaultHTTPInvestigatingTpl = MessageTemplate{ - Subject: `{{ .Name }} - {{ .config.SystemName }}`, + Subject: `{{ .Monitor.Name }} - {{ .SystemName }}`, Message: `{{ .Name }} check **failed** - {{ .now }} {{ .lastFailReason }}`, @@ -90,6 +92,7 @@ func (mon *HTTPMonitor) Validate() []string { mon.Template.Fixed.SetDefault(defaultHTTPFixedTpl) errs := mon.AbstractMonitor.Validate() + logrus.Warnf("%#v", mon.Template.Investigating) if len(mon.ExpectedBody) > 0 { exp, err := regexp.Compile(mon.ExpectedBody) diff --git a/monitor.go b/monitor.go index f969302..ed7d724 100644 --- a/monitor.go +++ b/monitor.go @@ -44,8 +44,9 @@ type AbstractMonitor struct { Fixed MessageTemplate } - // Threshold = percentage - Threshold float32 + // Threshold = percentage / number of down incidents + Threshold float32 + ThresholdCount bool `mapstructure:"threshold_count"` history []bool lastFailReason string @@ -83,7 +84,10 @@ func (mon *AbstractMonitor) Validate() []string { } if err := mon.Template.Fixed.Compile(); err != nil { - errs = append(errs, "Could not compile template: "+err.Error()) + errs = append(errs, "Could not compile \"fixed\" template: "+err.Error()) + } + if err := mon.Template.Investigating.Compile(); err != nil { + errs = append(errs, "Could not compile \"investigating\" template: "+err.Error()) } return errs @@ -137,11 +141,16 @@ func (mon *AbstractMonitor) tick(iface MonitorInterface) { up := iface.test() lag := getMs() - reqStart - if len(mon.history) == HistorySize-1 { + histSize := HistorySize + if mon.ThresholdCount { + histSize = int(mon.Threshold) + } + + if len(mon.history) == histSize-1 { logrus.Warnf("%v is now saturated\n", mon.Name) } - if len(mon.history) >= HistorySize { - mon.history = mon.history[len(mon.history)-(HistorySize-1):] + if len(mon.history) >= histSize { + mon.history = mon.history[len(mon.history)-(histSize-1):] } mon.history = append(mon.history, up) mon.AnalyseData() @@ -163,43 +172,61 @@ func (monitor *AbstractMonitor) AnalyseData() { } t := (float32(numDown) / float32(len(monitor.history))) * 100 - logrus.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().UnixNano()/int64(time.Second)) + if monitor.ThresholdCount { + logrus.Printf("%s %d/%d down at %v", monitor.Name, numDown, int(monitor.Threshold), time.Now().Format(DefaultTimeFormat)) + } else { + logrus.Printf("%s %.2f%%/%.2f%% down at %v", monitor.Name, t, monitor.Threshold, time.Now().Format(DefaultTimeFormat)) + } - if len(monitor.history) != HistorySize { + histSize := HistorySize + if monitor.ThresholdCount { + histSize = int(monitor.Threshold) + } + + if len(monitor.history) != histSize { // not saturated return } - if t > monitor.Threshold && monitor.incident == nil { + triggered := (monitor.ThresholdCount && numDown == int(monitor.Threshold)) || (!monitor.ThresholdCount && t > monitor.Threshold) + + if triggered && monitor.incident == nil { + // create incident + subject, message := monitor.Template.Investigating.Exec(getTemplateData(monitor)) monitor.incident = &Incident{ - Name: monitor.Name + " - " + monitor.config.SystemName, + Name: subject, ComponentID: monitor.ComponentID, - Message: monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat), + Message: message, Notify: true, } - if len(monitor.lastFailReason) > 0 { - monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`" - } - // is down, create an incident - logrus.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason) + logrus.Warnf("%v: creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason) // set investigating status monitor.incident.SetInvestigating() // create/update incident if err := monitor.incident.Send(monitor.config); err != nil { logrus.Printf("Error sending incident: %v\n", err) } - } else if t < monitor.Threshold && monitor.incident != nil { - // was down, created an incident, its now ok, make it resolved. - logrus.Printf("%v resolved downtime incident", monitor.Name) - // resolve incident - monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message - monitor.incident.SetFixed() - monitor.incident.Send(monitor.config) - - monitor.lastFailReason = "" - monitor.incident = nil + return } + + // still triggered or no incident + if triggered || monitor.incident == nil { + return + } + + logrus.Warnf("Resolving incident") + + // was down, created an incident, its now ok, make it resolved. + logrus.Printf("%v resolved downtime incident", monitor.Name) + + // resolve incident + monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message + monitor.incident.SetFixed() + monitor.incident.Send(monitor.config) + + monitor.lastFailReason = "" + monitor.incident = nil } diff --git a/template.go b/template.go index 25f5131..48f7666 100644 --- a/template.go +++ b/template.go @@ -1,6 +1,11 @@ package cachet -import "text/template" +import ( + "bytes" + "text/template" + + "github.com/Sirupsen/logrus" +) type MessageTemplate struct { Subject string `json:"subject"` @@ -33,6 +38,16 @@ func (t *MessageTemplate) Compile() error { return err } +func (t *MessageTemplate) Exec(data interface{}) (string, string) { + buf := new(bytes.Buffer) + + logrus.Warnf("%#v", t.subjectTpl) + t.subjectTpl.Execute(buf, data) + subject := buf.String() + + return subject, "" +} + func compileTemplate(text string) (*template.Template, error) { return template.New("").Parse(text) }