Files
cachet-monitor/cachet/monitor.go
Matej Kramny 409d753931 Link to incident, resolved message
- Major outage when already in Partial outage
- Resolved at x message
- Link incident to component
2015-03-21 20:08:05 +00:00

116 lines
2.8 KiB
Go

package cachet
import (
	"net/http"
	"strconv"
	"time"
)
// timeout is the time limit applied to the HTTP client used for each check.
const timeout = time.Second
// Monitor data model.
//
// A Monitor describes one HTTP endpoint that is checked by Run, together with
// the runtime state (History, LastFailReason, Incident) accumulated across
// checks. Fields tagged `json:"-"` are excluded from JSON encoding/decoding.
type Monitor struct {
// Name is used to build the name and message of incidents for this monitor.
Name string `json:"name"`
// URL is the address requested with HTTP GET on every check.
URL string `json:"url"`
// MetricID identifies the metric that receives the measured request lag;
// the lag is only sent when MetricID is greater than zero and the check passed.
MetricID int `json:"metric_id"`
// Threshold is the percentage of failed checks in History above which an
// incident is created.
Threshold float32 `json:"threshold"`
// ComponentID is passed on to incidents created for this monitor.
// NOTE(review): presumably nil means "no linked component" - confirm in Incident.
ComponentID *int `json:"component_id"`
// ExpectedStatusCode is the HTTP status code a response must have for the
// check to count as up.
ExpectedStatusCode int `json:"expected_status_code"`
// History holds the outcome of the most recent checks (true = up); Run keeps
// at most 10 entries.
History []bool `json:"-"`
// LastFailReason holds the error text of the most recent failed request, or
// nil if no request error has been recorded.
LastFailReason *string `json:"-"`
// Incident is the incident currently open for this monitor, or nil when
// there is none.
Incident *Incident `json:"-"`
}
// Run performs one check cycle: it times a single request to the monitored
// URL, records the outcome in History (keeping only the 10 most recent
// results), re-evaluates the incident state via AnalyseData, and finally
// reports the measured lag as a metric when the check passed and a metric is
// configured (MetricID > 0).
func (monitor *Monitor) Run() {
	const maxHistory = 10

	startedAt := getMs()
	up := monitor.doRequest()
	elapsed := getMs() - startedAt

	// Drop the oldest entries so that the append below leaves at most
	// maxHistory results.
	if n := len(monitor.History); n >= maxHistory {
		monitor.History = monitor.History[n-(maxHistory-1):]
	}
	monitor.History = append(monitor.History, up)

	monitor.AnalyseData()

	// Lag is only meaningful for successful checks with a metric to report to.
	if !up || monitor.MetricID <= 0 {
		return
	}
	SendMetric(monitor.MetricID, elapsed)
}
// doRequest issues a single GET request to monitor.URL and reports whether
// the endpoint answered with the expected status code.
//
// It returns false when the request itself fails (connection error, timeout,
// ...) or when the response status differs from monitor.ExpectedStatusCode.
// In both cases monitor.LastFailReason is updated, so that an incident
// created afterwards describes the failure that was actually observed.
// LastFailReason is left untouched on success.
func (monitor *Monitor) doRequest() bool {
	client := &http.Client{
		Timeout: timeout,
	}

	resp, err := client.Get(monitor.URL)
	if err != nil {
		errString := err.Error()
		monitor.LastFailReason = &errString
		return false
	}
	defer resp.Body.Close()

	if resp.StatusCode != monitor.ExpectedStatusCode {
		// Record the mismatch; otherwise LastFailReason would stay nil or keep
		// a stale transport error from an earlier check.
		reason := "Unexpected response code: " + strconv.Itoa(resp.StatusCode) +
			". Expected " + strconv.Itoa(monitor.ExpectedStatusCode)
		monitor.LastFailReason = &reason
		return false
	}

	return true
}
// AnalyseData decides from the recorded History whether the monitor is
// statistically up or down and creates or resolves an incident accordingly.
//
// The share of failed checks in History (as a percentage) is compared with
// monitor.Threshold. Nothing is decided until History holds 10 results.
//
//   - Above the threshold with no open incident: a new incident is created,
//     set to investigating, matched against similar incidents, sent, and the
//     linked component is updated.
//   - At or below the threshold with an open incident: the incident is marked
//     fixed, sent, the component is updated and monitor.Incident is cleared.
//
// An empty History is ignored.
func (monitor *Monitor) AnalyseData() {
	const requiredSamples = 10

	total := len(monitor.History)
	if total == 0 {
		// Nothing recorded yet; avoid computing 0/0 (NaN) below.
		return
	}

	// Count the failed checks among the recorded results.
	numDown := 0
	for _, wasUp := range monitor.History {
		if !wasUp {
			numDown++
		}
	}

	t := (float32(numDown) / float32(total)) * 100
	Logger.Printf("%s %.2f%% Down at %v. Threshold: %.2f%%\n", monitor.URL, t, time.Now().Unix(), monitor.Threshold)

	if total != requiredSamples {
		// not enough data
		return
	}

	switch {
	case t > monitor.Threshold && monitor.Incident == nil:
		// is down, create an incident
		Logger.Println("Creating incident...")

		monitor.Incident = &Incident{
			Name:        monitor.Name + " - " + Config.SystemName,
			Message:     monitor.Name + " failed",
			ComponentID: monitor.ComponentID,
		}

		if monitor.LastFailReason != nil {
			monitor.Incident.Message += "\n\n" + *monitor.LastFailReason
		}

		// set investigating status
		monitor.Incident.SetInvestigating()

		// lookup relevant incident
		monitor.Incident.GetSimilarIncidentID()

		// create/update incident
		monitor.Incident.Send()
		monitor.Incident.UpdateComponent()

	case t <= monitor.Threshold && monitor.Incident != nil:
		// Was down and an incident exists, but the failure rate is no longer
		// above the threshold: resolve it. The comparison is inclusive so that
		// it is the exact complement of the "down" condition; with a strict
		// "<" an incident could never be resolved when Threshold is 0.
		Logger.Println("Updating incident to resolved...")

		// Add resolved message
		monitor.Incident.Message += "\n\n-\n\nResolved at " + time.Now().String()

		monitor.Incident.SetFixed()
		monitor.Incident.Send()
		monitor.Incident.UpdateComponent()

		monitor.Incident = nil
	}
}
// getMs returns the current wall-clock time as the number of milliseconds
// elapsed since the Unix epoch.
func getMs() int64 {
	const nanosPerMilli = int64(time.Millisecond)

	now := time.Now()
	return now.UnixNano() / nanosPerMilli
}