// Listing metadata carried over from the source browser:
//   file:     cachet-monitor/monitor.go
//   modified: 2017-02-04 22:15:24 -08:00
//   size:     183 lines, 4.1 KiB
//   language: Go

package cachet
import (
"sync"
"time"
"github.com/Sirupsen/logrus"
)
// Package-wide defaults and limits for monitors.
const (
	// DefaultInterval is a 60 second duration.
	// NOTE(review): presumably the fallback polling interval; its use is not visible here.
	DefaultInterval = 60 * time.Second

	// DefaultTimeout is a 1 second duration.
	// NOTE(review): presumably the fallback check timeout; its use is not visible here.
	DefaultTimeout = time.Second

	// DefaultTimeFormat is the time layout used when stamping incident messages.
	DefaultTimeFormat = "15:04:05 Jan 2 MST"

	// HistorySize is the number of most recent check results a monitor keeps.
	HistorySize = 10
)
// MonitorInterface is the method set shared by all monitor kinds.
// *AbstractMonitor implements it with neutral defaults.
type MonitorInterface interface {
// do performs one check and reports the result as a bool.
// AbstractMonitor.Tick records that bool as "up" (true) or "down" (false).
do() bool
// Validate returns a list of strings.
// NOTE(review): presumably one human-readable message per configuration problem — confirm against callers.
Validate() []string
// GetMonitor returns the underlying *AbstractMonitor.
GetMonitor() *AbstractMonitor
// Describe returns short "Key: value" lines describing the monitor.
Describe() []string
}
// AbstractMonitor is the data model and shared runtime state of a monitor:
// its settings, the recent check history, and the currently open incident.
type AbstractMonitor struct {
// Name identifies the monitor in log lines and incident titles/messages.
Name string
// Target is not read by the code in this section.
// NOTE(review): presumably the address being checked — confirm.
Target string
// Type is the kind of check. Existing note: http (default), tcp, dns, icmp.
Type string
// Strict: existing note says it defaults to true; the default is not applied in this section.
Strict bool
// Interval is multiplied by time.Second before being given to the ticker in Start.
// NOTE(review): that implies the stored value is a bare count of seconds — confirm.
Interval time.Duration
// Timeout is not read by the code in this section.
Timeout time.Duration
// MetricID: Tick only reports lag when this is greater than zero.
MetricID int `mapstructure:"metric_id"`
// ComponentID is copied into each Incident created by AnalyseData.
ComponentID int `mapstructure:"component_id"`
// Template holds message templates for the investigating and fixed states.
// They are not read by the code in this section.
Template struct {
Investigating MessageTemplate
Fixed MessageTemplate
}
// Threshold is a percentage: AnalyseData compares the share of failed
// checks in history (0-100) against it.
Threshold float32
// history holds the most recent check results (true = up); Tick caps it at HistorySize.
history []bool
// lastFailReason is appended to the incident message when non-empty and
// cleared when an incident is resolved.
lastFailReason string
// incident is the incident created by AnalyseData and not yet resolved, or nil.
incident *Incident
// config is set by Start and passed to Incident.Send.
config *CachetMonitor
// stopC is created by Start and closed when mon.Stop() is called.
stopC chan bool
}
// do is the default check implementation: it performs no work and always
// reports true (up).
func (mon *AbstractMonitor) do() bool {
return true
}
// Validate is the default validation: it checks nothing and returns an
// empty, non-nil slice.
func (mon *AbstractMonitor) Validate() []string {
	problems := make([]string, 0)
	return problems
}
// GetMonitor returns the receiver itself, giving callers access to the
// shared AbstractMonitor state.
func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor {
return mon
}
// Describe returns "Key: value" lines describing the monitor: always the
// type, followed by the name when one is set.
//
// Note the value receiver: the monitor struct is copied for the call.
func (mon AbstractMonitor) Describe() []string {
	typeLine := "Type: " + mon.Type
	if mon.Name == "" {
		return []string{typeLine}
	}
	return []string{typeLine, "Name: " + mon.Name}
}
// Start runs the monitor loop and blocks until Stop is called.
//
// It registers itself on wg, stores cfg for later incident reporting,
// creates the stop channel, performs one immediate Tick, and then ticks
// once per interval. wg.Done is called when the loop exits.
//
// NOTE(review): wg.Add(1) runs inside Start; if Start is launched with `go`,
// the caller's wg.Wait can race with this Add — confirm against the caller.
func (mon *AbstractMonitor) Start(cfg *CachetMonitor, wg *sync.WaitGroup) {
	wg.Add(1)
	// Deferred so the wait group is released even if a tick panics.
	defer wg.Done()

	mon.config = cfg
	mon.stopC = make(chan bool)
	mon.Tick()

	// NOTE(review): Interval is scaled by time.Second, so it is treated as a
	// bare number of seconds rather than a real Duration — confirm how the
	// configuration populates it. time.NewTicker panics on a non-positive value.
	ticker := time.NewTicker(mon.Interval * time.Second)
	// Release the ticker's resources once the loop ends.
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			mon.Tick()
		case <-mon.stopC:
			return
		}
	}
}
// Stop signals the loop started by Start to exit by closing the stop channel.
//
// It is a no-op when the monitor was never started (the channel is still
// nil) or has already been stopped.
func (monitor *AbstractMonitor) Stop() {
	// A nil channel never reads as closed, so Stopped() reports false for it
	// and close(nil) would panic; guard that case explicitly.
	if monitor.stopC == nil || monitor.Stopped() {
		return
	}
	close(monitor.stopC)
}
// Stopped reports whether the stop channel can be received from without
// blocking, which is the case once Stop has closed it.
func (monitor *AbstractMonitor) Stopped() bool {
	stopped := false
	select {
	case <-monitor.stopC:
		stopped = true
	default:
		// Channel is still open (or nil): not stopped.
	}
	return stopped
}
// Tick runs a single check, appends its result to the bounded history,
// re-evaluates the incident state, and logs the measured lag when the check
// succeeded and a metric is configured.
func (monitor *AbstractMonitor) Tick() {
	startedAt := getMs()
	isUp := monitor.do()
	elapsed := getMs() - startedAt

	switch recorded := len(monitor.history); {
	case recorded == HistorySize-1:
		// The append below fills the history for the first time.
		logrus.Warnf("%v is now saturated\n", monitor.Name)
	case recorded >= HistorySize:
		// Keep only the newest HistorySize-1 results to make room for this one.
		monitor.history = monitor.history[recorded-(HistorySize-1):]
	}
	monitor.history = append(monitor.history, isUp)

	monitor.AnalyseData()

	// Lag is only reported for successful checks with a metric configured.
	if !isUp || monitor.MetricID <= 0 {
		return
	}
	logrus.Infof("%v", elapsed)
	// monitor.SendMetric(lag)
}
// AnalyseData decides if the monitor is statistically up or down and
// creates / resolves an incident accordingly.
//
// It computes the percentage of failed checks in the history and logs it.
// No incident decision is made until the history holds exactly HistorySize
// results. Once it does:
//   - above Threshold with no open incident: an incident is created and sent;
//   - below Threshold with an open incident: the incident is marked fixed,
//     sent, and cleared.
//
// A percentage exactly equal to Threshold changes nothing.
func (monitor *AbstractMonitor) AnalyseData() {
	samples := len(monitor.history)
	if samples == 0 {
		// Nothing recorded yet; avoids computing 0/0 (NaN) below.
		return
	}

	// Count the failed checks in the recent history.
	numDown := 0
	for _, wasUp := range monitor.history {
		if !wasUp {
			numDown++
		}
	}

	t := (float32(numDown) / float32(samples)) * 100
	logrus.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().Unix())

	if samples != HistorySize {
		// not saturated
		return
	}

	switch {
	case t > monitor.Threshold && monitor.incident == nil:
		monitor.incident = &Incident{
			Name:        monitor.Name + " - " + monitor.config.SystemName,
			ComponentID: monitor.ComponentID,
			Message:     monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat),
			Notify:      true,
		}
		if len(monitor.lastFailReason) > 0 {
			monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`"
		}

		// is down, create an incident
		logrus.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason)
		// set investigating status
		monitor.incident.SetInvestigating()
		// create/update incident
		if err := monitor.incident.Send(monitor.config); err != nil {
			logrus.Printf("Error sending incident: %v\n", err)
		}

	case t < monitor.Threshold && monitor.incident != nil:
		// was down, created an incident, it's now ok, make it resolved.
		logrus.Printf("%v resolved downtime incident", monitor.Name)

		// resolve incident
		monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message
		monitor.incident.SetFixed()
		// Report a failed send the same way the creation path does instead of
		// silently dropping the error.
		if err := monitor.incident.Send(monitor.config); err != nil {
			logrus.Printf("Error sending incident: %v\n", err)
		}

		monitor.lastFailReason = ""
		monitor.incident = nil
	}
}