- customisable time format
- custom messages - configure threshold count instead of uptime %
This commit is contained in:
2
api.go
2
api.go
@@ -37,7 +37,7 @@ func (api CachetAPI) Ping() error {
|
|||||||
|
|
||||||
// SendMetric adds a data point to a cachet monitor
|
// SendMetric adds a data point to a cachet monitor
|
||||||
func (api CachetAPI) SendMetric(id int, lag int64) {
|
func (api CachetAPI) SendMetric(id int, lag int64) {
|
||||||
logrus.Debugf("Sending lag metric ID:%d %vms", id, lag)
|
logrus.Debugf("Sending lag metric ID:%d RTT %vms", id, lag)
|
||||||
|
|
||||||
jsonBytes, _ := json.Marshal(map[string]interface{}{
|
jsonBytes, _ := json.Marshal(map[string]interface{}{
|
||||||
"value": lag,
|
"value": lag,
|
||||||
|
|||||||
20
config.go
20
config.go
@@ -10,13 +10,13 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type CachetMonitor struct {
|
type CachetMonitor struct {
|
||||||
SystemName string `json:"system_name"`
|
SystemName string `json:"system_name" yaml:"system_name"`
|
||||||
|
DateFormat string `json:"date_format" yaml:"date_format"`
|
||||||
API CachetAPI `json:"api"`
|
API CachetAPI `json:"api"`
|
||||||
RawMonitors []map[string]interface{} `json:"monitors" yaml:"monitors"`
|
RawMonitors []map[string]interface{} `json:"monitors" yaml:"monitors"`
|
||||||
|
|
||||||
Monitors []MonitorInterface `json:"-" yaml:"-"`
|
Monitors []MonitorInterface `json:"-" yaml:"-"`
|
||||||
|
Immediate bool `json:"-" yaml:"-"`
|
||||||
Immediate bool `json:"-" yaml:"-"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate configuration
|
// Validate configuration
|
||||||
@@ -28,6 +28,10 @@ func (cfg *CachetMonitor) Validate() bool {
|
|||||||
cfg.SystemName = getHostname()
|
cfg.SystemName = getHostname()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(cfg.DateFormat) == 0 {
|
||||||
|
cfg.DateFormat = DefaultTimeFormat
|
||||||
|
}
|
||||||
|
|
||||||
if len(cfg.API.Token) == 0 || len(cfg.API.URL) == 0 {
|
if len(cfg.API.Token) == 0 || len(cfg.API.URL) == 0 {
|
||||||
logrus.Warnf("API URL or API Token missing.\nGet help at https://github.com/castawaylabs/cachet-monitor")
|
logrus.Warnf("API URL or API Token missing.\nGet help at https://github.com/castawaylabs/cachet-monitor")
|
||||||
valid = false
|
valid = false
|
||||||
@@ -74,3 +78,11 @@ func GetMonitorType(t string) string {
|
|||||||
|
|
||||||
return t
|
return t
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getTemplateData(monitor *AbstractMonitor) map[string]interface{} {
|
||||||
|
return map[string]interface{}{
|
||||||
|
"SystemName": monitor.config.SystemName,
|
||||||
|
"API": monitor.config.API,
|
||||||
|
"Monitor": monitor,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
5
http.go
5
http.go
@@ -8,11 +8,13 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/Sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Investigating template
|
// Investigating template
|
||||||
var defaultHTTPInvestigatingTpl = MessageTemplate{
|
var defaultHTTPInvestigatingTpl = MessageTemplate{
|
||||||
Subject: `{{ .Name }} - {{ .config.SystemName }}`,
|
Subject: `{{ .Monitor.Name }} - {{ .SystemName }}`,
|
||||||
Message: `{{ .Name }} check **failed** - {{ .now }}
|
Message: `{{ .Name }} check **failed** - {{ .now }}
|
||||||
|
|
||||||
{{ .lastFailReason }}`,
|
{{ .lastFailReason }}`,
|
||||||
@@ -90,6 +92,7 @@ func (mon *HTTPMonitor) Validate() []string {
|
|||||||
mon.Template.Fixed.SetDefault(defaultHTTPFixedTpl)
|
mon.Template.Fixed.SetDefault(defaultHTTPFixedTpl)
|
||||||
|
|
||||||
errs := mon.AbstractMonitor.Validate()
|
errs := mon.AbstractMonitor.Validate()
|
||||||
|
logrus.Warnf("%#v", mon.Template.Investigating)
|
||||||
|
|
||||||
if len(mon.ExpectedBody) > 0 {
|
if len(mon.ExpectedBody) > 0 {
|
||||||
exp, err := regexp.Compile(mon.ExpectedBody)
|
exp, err := regexp.Compile(mon.ExpectedBody)
|
||||||
|
|||||||
79
monitor.go
79
monitor.go
@@ -44,8 +44,9 @@ type AbstractMonitor struct {
|
|||||||
Fixed MessageTemplate
|
Fixed MessageTemplate
|
||||||
}
|
}
|
||||||
|
|
||||||
// Threshold = percentage
|
// Threshold is a downtime percentage, or (when ThresholdCount is set) the number of failed checks
|
||||||
Threshold float32
|
Threshold float32
|
||||||
|
ThresholdCount bool `mapstructure:"threshold_count"`
|
||||||
|
|
||||||
history []bool
|
history []bool
|
||||||
lastFailReason string
|
lastFailReason string
|
||||||
@@ -83,7 +84,10 @@ func (mon *AbstractMonitor) Validate() []string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if err := mon.Template.Fixed.Compile(); err != nil {
|
if err := mon.Template.Fixed.Compile(); err != nil {
|
||||||
errs = append(errs, "Could not compile template: "+err.Error())
|
errs = append(errs, "Could not compile \"fixed\" template: "+err.Error())
|
||||||
|
}
|
||||||
|
if err := mon.Template.Investigating.Compile(); err != nil {
|
||||||
|
errs = append(errs, "Could not compile \"investigating\" template: "+err.Error())
|
||||||
}
|
}
|
||||||
|
|
||||||
return errs
|
return errs
|
||||||
@@ -137,11 +141,16 @@ func (mon *AbstractMonitor) tick(iface MonitorInterface) {
|
|||||||
up := iface.test()
|
up := iface.test()
|
||||||
lag := getMs() - reqStart
|
lag := getMs() - reqStart
|
||||||
|
|
||||||
if len(mon.history) == HistorySize-1 {
|
histSize := HistorySize
|
||||||
|
if mon.ThresholdCount {
|
||||||
|
histSize = int(mon.Threshold)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(mon.history) == histSize-1 {
|
||||||
logrus.Warnf("%v is now saturated\n", mon.Name)
|
logrus.Warnf("%v is now saturated\n", mon.Name)
|
||||||
}
|
}
|
||||||
if len(mon.history) >= HistorySize {
|
if len(mon.history) >= histSize {
|
||||||
mon.history = mon.history[len(mon.history)-(HistorySize-1):]
|
mon.history = mon.history[len(mon.history)-(histSize-1):]
|
||||||
}
|
}
|
||||||
mon.history = append(mon.history, up)
|
mon.history = append(mon.history, up)
|
||||||
mon.AnalyseData()
|
mon.AnalyseData()
|
||||||
@@ -163,43 +172,61 @@ func (monitor *AbstractMonitor) AnalyseData() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
t := (float32(numDown) / float32(len(monitor.history))) * 100
|
t := (float32(numDown) / float32(len(monitor.history))) * 100
|
||||||
logrus.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().UnixNano()/int64(time.Second))
|
if monitor.ThresholdCount {
|
||||||
|
logrus.Printf("%s %d/%d down at %v", monitor.Name, numDown, int(monitor.Threshold), time.Now().Format(DefaultTimeFormat))
|
||||||
|
} else {
|
||||||
|
logrus.Printf("%s %.2f%%/%.2f%% down at %v", monitor.Name, t, monitor.Threshold, time.Now().Format(DefaultTimeFormat))
|
||||||
|
}
|
||||||
|
|
||||||
if len(monitor.history) != HistorySize {
|
histSize := HistorySize
|
||||||
|
if monitor.ThresholdCount {
|
||||||
|
histSize = int(monitor.Threshold)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(monitor.history) != histSize {
|
||||||
// not saturated
|
// not saturated
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if t > monitor.Threshold && monitor.incident == nil {
|
triggered := (monitor.ThresholdCount && numDown == int(monitor.Threshold)) || (!monitor.ThresholdCount && t > monitor.Threshold)
|
||||||
|
|
||||||
|
if triggered && monitor.incident == nil {
|
||||||
|
// create incident
|
||||||
|
subject, message := monitor.Template.Investigating.Exec(getTemplateData(monitor))
|
||||||
monitor.incident = &Incident{
|
monitor.incident = &Incident{
|
||||||
Name: monitor.Name + " - " + monitor.config.SystemName,
|
Name: subject,
|
||||||
ComponentID: monitor.ComponentID,
|
ComponentID: monitor.ComponentID,
|
||||||
Message: monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat),
|
Message: message,
|
||||||
Notify: true,
|
Notify: true,
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(monitor.lastFailReason) > 0 {
|
|
||||||
monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`"
|
|
||||||
}
|
|
||||||
|
|
||||||
// is down, create an incident
|
// is down, create an incident
|
||||||
logrus.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason)
|
logrus.Warnf("%v: creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason)
|
||||||
// set investigating status
|
// set investigating status
|
||||||
monitor.incident.SetInvestigating()
|
monitor.incident.SetInvestigating()
|
||||||
// create/update incident
|
// create/update incident
|
||||||
if err := monitor.incident.Send(monitor.config); err != nil {
|
if err := monitor.incident.Send(monitor.config); err != nil {
|
||||||
logrus.Printf("Error sending incident: %v\n", err)
|
logrus.Printf("Error sending incident: %v\n", err)
|
||||||
}
|
}
|
||||||
} else if t < monitor.Threshold && monitor.incident != nil {
|
|
||||||
// was down, created an incident, its now ok, make it resolved.
|
|
||||||
logrus.Printf("%v resolved downtime incident", monitor.Name)
|
|
||||||
|
|
||||||
// resolve incident
|
return
|
||||||
monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message
|
|
||||||
monitor.incident.SetFixed()
|
|
||||||
monitor.incident.Send(monitor.config)
|
|
||||||
|
|
||||||
monitor.lastFailReason = ""
|
|
||||||
monitor.incident = nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// still triggered or no incident
|
||||||
|
if triggered || monitor.incident == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
logrus.Warnf("Resolving incident")
|
||||||
|
|
||||||
|
// was down, created an incident, its now ok, make it resolved.
|
||||||
|
logrus.Printf("%v resolved downtime incident", monitor.Name)
|
||||||
|
|
||||||
|
// resolve incident
|
||||||
|
monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message
|
||||||
|
monitor.incident.SetFixed()
|
||||||
|
monitor.incident.Send(monitor.config)
|
||||||
|
|
||||||
|
monitor.lastFailReason = ""
|
||||||
|
monitor.incident = nil
|
||||||
}
|
}
|
||||||
|
|||||||
17
template.go
17
template.go
@@ -1,6 +1,11 @@
|
|||||||
package cachet
|
package cachet
|
||||||
|
|
||||||
import "text/template"
|
import (
|
||||||
|
"bytes"
|
||||||
|
"text/template"
|
||||||
|
|
||||||
|
"github.com/Sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
type MessageTemplate struct {
|
type MessageTemplate struct {
|
||||||
Subject string `json:"subject"`
|
Subject string `json:"subject"`
|
||||||
@@ -33,6 +38,16 @@ func (t *MessageTemplate) Compile() error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *MessageTemplate) Exec(data interface{}) (string, string) {
|
||||||
|
buf := new(bytes.Buffer)
|
||||||
|
|
||||||
|
logrus.Warnf("%#v", t.subjectTpl)
|
||||||
|
t.subjectTpl.Execute(buf, data)
|
||||||
|
subject := buf.String()
|
||||||
|
|
||||||
|
return subject, ""
|
||||||
|
}
|
||||||
|
|
||||||
func compileTemplate(text string) (*template.Template, error) {
|
func compileTemplate(text string) (*template.Template, error) {
|
||||||
return template.New("").Parse(text)
|
return template.New("").Parse(text)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user