183 lines
4.1 KiB
Go
183 lines
4.1 KiB
Go
package cachet
|
|
|
|
import (
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
)
|
|
|
|
// Package-level defaults and limits for monitors.
const (
	// DefaultInterval is the default check interval: 60 seconds.
	DefaultInterval = 60 * time.Second

	// DefaultTimeout is the default check timeout: one second.
	DefaultTimeout = time.Second

	// DefaultTimeFormat is the time layout used for the timestamps that
	// AnalyseData writes into incident messages.
	DefaultTimeFormat = "15:04:05 Jan 2 MST"

	// HistorySize is the maximum number of check results Tick keeps in a
	// monitor's history.
	HistorySize = 10
)
|
|
|
|
type MonitorInterface interface {
|
|
do() bool
|
|
Validate() []string
|
|
GetMonitor() *AbstractMonitor
|
|
Describe() []string
|
|
}
|
|
|
|
// AbstractMonitor data model
|
|
type AbstractMonitor struct {
|
|
Name string
|
|
Target string
|
|
|
|
// (default)http, tcp, dns, icmp
|
|
Type string
|
|
|
|
// defaults true
|
|
Strict bool
|
|
|
|
Interval time.Duration
|
|
Timeout time.Duration
|
|
|
|
MetricID int `mapstructure:"metric_id"`
|
|
ComponentID int `mapstructure:"component_id"`
|
|
|
|
// Templating stuff
|
|
Template struct {
|
|
Investigating MessageTemplate
|
|
Fixed MessageTemplate
|
|
}
|
|
|
|
// Threshold = percentage
|
|
Threshold float32
|
|
|
|
history []bool
|
|
lastFailReason string
|
|
incident *Incident
|
|
config *CachetMonitor
|
|
|
|
// Closed when mon.Stop() is called
|
|
stopC chan bool
|
|
}
|
|
|
|
func (mon *AbstractMonitor) do() bool {
|
|
return true
|
|
}
|
|
func (mon *AbstractMonitor) Validate() []string {
|
|
return []string{}
|
|
}
|
|
func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor {
|
|
return mon
|
|
}
|
|
func (mon AbstractMonitor) Describe() []string {
|
|
features := []string{"Type: " + mon.Type}
|
|
|
|
if len(mon.Name) > 0 {
|
|
features = append(features, "Name: "+mon.Name)
|
|
}
|
|
|
|
return features
|
|
}
|
|
|
|
func (mon *AbstractMonitor) Start(cfg *CachetMonitor, wg *sync.WaitGroup) {
|
|
wg.Add(1)
|
|
mon.config = cfg
|
|
mon.stopC = make(chan bool)
|
|
mon.Tick()
|
|
|
|
ticker := time.NewTicker(mon.Interval * time.Second)
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
mon.Tick()
|
|
case <-mon.stopC:
|
|
wg.Done()
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (monitor *AbstractMonitor) Stop() {
|
|
if monitor.Stopped() {
|
|
return
|
|
}
|
|
|
|
close(monitor.stopC)
|
|
}
|
|
|
|
func (monitor *AbstractMonitor) Stopped() bool {
|
|
select {
|
|
case <-monitor.stopC:
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func (monitor *AbstractMonitor) Tick() {
|
|
reqStart := getMs()
|
|
up := monitor.do()
|
|
lag := getMs() - reqStart
|
|
|
|
if len(monitor.history) == HistorySize-1 {
|
|
logrus.Warnf("%v is now saturated\n", monitor.Name)
|
|
}
|
|
if len(monitor.history) >= HistorySize {
|
|
monitor.history = monitor.history[len(monitor.history)-(HistorySize-1):]
|
|
}
|
|
monitor.history = append(monitor.history, up)
|
|
monitor.AnalyseData()
|
|
|
|
// report lag
|
|
if up && monitor.MetricID > 0 {
|
|
logrus.Infof("%v", lag)
|
|
// monitor.SendMetric(lag)
|
|
}
|
|
}
|
|
|
|
// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident
|
|
func (monitor *AbstractMonitor) AnalyseData() {
|
|
// look at the past few incidents
|
|
numDown := 0
|
|
for _, wasUp := range monitor.history {
|
|
if wasUp == false {
|
|
numDown++
|
|
}
|
|
}
|
|
|
|
t := (float32(numDown) / float32(len(monitor.history))) * 100
|
|
logrus.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().UnixNano()/int64(time.Second))
|
|
|
|
if len(monitor.history) != 10 {
|
|
// not saturated
|
|
return
|
|
}
|
|
|
|
if t > monitor.Threshold && monitor.incident == nil {
|
|
monitor.incident = &Incident{
|
|
Name: monitor.Name + " - " + monitor.config.SystemName,
|
|
ComponentID: monitor.ComponentID,
|
|
Message: monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat),
|
|
Notify: true,
|
|
}
|
|
|
|
if len(monitor.lastFailReason) > 0 {
|
|
monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`"
|
|
}
|
|
|
|
// is down, create an incident
|
|
logrus.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason)
|
|
// set investigating status
|
|
monitor.incident.SetInvestigating()
|
|
// create/update incident
|
|
if err := monitor.incident.Send(monitor.config); err != nil {
|
|
logrus.Printf("Error sending incident: %v\n", err)
|
|
}
|
|
} else if t < monitor.Threshold && monitor.incident != nil {
|
|
// was down, created an incident, its now ok, make it resolved.
|
|
logrus.Printf("%v resolved downtime incident", monitor.Name)
|
|
|
|
// resolve incident
|
|
monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message
|
|
monitor.incident.SetFixed()
|
|
monitor.incident.Send(monitor.config)
|
|
|
|
monitor.lastFailReason = ""
|
|
monitor.incident = nil
|
|
}
|
|
}
|