More config options, markdown in incidents
- Specify method to ping with
- Body regex match
- Markdown in incident messages
- Update existing incidents
This commit is contained in:
20
cli/main.go
20
cli/main.go
@@ -10,6 +10,8 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/signal"
|
||||
"sync"
|
||||
|
||||
cachet "github.com/castawaylabs/cachet-monitor"
|
||||
)
|
||||
@@ -47,7 +49,23 @@ func main() {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
cfg.Run()
|
||||
cfg.Logger.Printf("System: %s\nAPI: %s\nMonitors: %d\n\n", cfg.SystemName, cfg.APIUrl, len(cfg.Monitors))
|
||||
|
||||
wg := &sync.WaitGroup{}
|
||||
for _, mon := range cfg.Monitors {
|
||||
go mon.Start(cfg, wg)
|
||||
}
|
||||
|
||||
signals := make(chan os.Signal, 1)
|
||||
signal.Notify(signals, os.Interrupt, os.Kill)
|
||||
<-signals
|
||||
|
||||
cfg.Logger.Println("Abort: Waiting monitors to finish")
|
||||
for _, mon := range cfg.Monitors {
|
||||
mon.Stop()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func getLogger(logPath string) *log.Logger {
|
||||
|
||||
25
config.go
25
config.go
@@ -12,7 +12,6 @@ type CachetMonitor struct {
|
||||
|
||||
APIUrl string `json:"api_url"`
|
||||
APIToken string `json:"api_token"`
|
||||
Interval int64 `json:"interval"`
|
||||
SystemName string `json:"system_name"`
|
||||
LogPath string `json:"log_path"`
|
||||
InsecureAPI bool `json:"insecure_api"`
|
||||
@@ -20,28 +19,30 @@ type CachetMonitor struct {
|
||||
Monitors []*Monitor `json:"monitors"`
|
||||
}
|
||||
|
||||
func (mon *CachetMonitor) ValidateConfiguration() error {
|
||||
if mon.Logger == nil {
|
||||
mon.Logger = log.New(os.Stdout, "", log.Llongfile|log.Ldate|log.Ltime)
|
||||
func (cfg *CachetMonitor) ValidateConfiguration() error {
|
||||
if cfg.Logger == nil {
|
||||
cfg.Logger = log.New(os.Stdout, "", log.Llongfile|log.Ldate|log.Ltime)
|
||||
}
|
||||
|
||||
if len(mon.SystemName) == 0 {
|
||||
if len(cfg.SystemName) == 0 {
|
||||
// get hostname
|
||||
mon.SystemName = getHostname()
|
||||
cfg.SystemName = getHostname()
|
||||
}
|
||||
|
||||
if mon.Interval <= 0 {
|
||||
mon.Interval = 60
|
||||
}
|
||||
|
||||
if len(mon.APIToken) == 0 || len(mon.APIUrl) == 0 {
|
||||
if len(cfg.APIToken) == 0 || len(cfg.APIUrl) == 0 {
|
||||
return errors.New("API URL or API Token not set. cachet-monitor won't be able to report incidents.\n\nPlease set:\n CACHET_API and CACHET_TOKEN environment variable to override settings.\n\nGet help at https://github.com/castawaylabs/cachet-monitor\n")
|
||||
}
|
||||
|
||||
if len(mon.Monitors) == 0 {
|
||||
if len(cfg.Monitors) == 0 {
|
||||
return errors.New("No monitors defined!\nSee sample configuration: https://github.com/castawaylabs/cachet-monitor/blob/master/example.config.json\n")
|
||||
}
|
||||
|
||||
for _, monitor := range cfg.Monitors {
|
||||
if err := monitor.ValidateConfiguration(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
37
http.go
37
http.go
@@ -4,22 +4,13 @@ import (
|
||||
"bytes"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Component Cachet model
|
||||
type Component struct {
|
||||
ID json.Number `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Description string `json:"description"`
|
||||
Status json.Number `json:"status_id"`
|
||||
HumanStatus string `json:"-"`
|
||||
IncidentCount int `json:"-"`
|
||||
CreatedAt *string `json:"created_at"`
|
||||
UpdatedAt *string `json:"updated_at"`
|
||||
}
|
||||
|
||||
func (monitor *CachetMonitor) makeRequest(requestType string, url string, reqBody []byte) (*http.Response, []byte, error) {
|
||||
req, err := http.NewRequest(requestType, monitor.APIUrl+url, bytes.NewBuffer(reqBody))
|
||||
|
||||
@@ -43,3 +34,25 @@ func (monitor *CachetMonitor) makeRequest(requestType string, url string, reqBod
|
||||
|
||||
return res, body, nil
|
||||
}
|
||||
|
||||
// SendMetric sends lag metric point
|
||||
func (monitor *CachetMonitor) SendMetric(metricID int, delay int64) error {
|
||||
if metricID <= 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
jsonBytes, _ := json.Marshal(&map[string]interface{}{
|
||||
"value": delay,
|
||||
})
|
||||
|
||||
resp, _, err := monitor.makeRequest("POST", "/metrics/"+strconv.Itoa(metricID)+"/points", jsonBytes)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
return fmt.Errorf("Could not log data point!\n%v\n", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getMs() int64 {
|
||||
return time.Now().UnixNano() / int64(time.Millisecond)
|
||||
}
|
||||
|
||||
157
incident.go
157
incident.go
@@ -2,156 +2,113 @@ package cachet
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// Incident Cachet data model
|
||||
type Incident struct {
|
||||
ID json.Number `json:"id"`
|
||||
ID int `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Message string `json:"message"`
|
||||
Status json.Number `json:"status"` // 4?
|
||||
HumanStatus string `json:"human_status"`
|
||||
Component *Component `json:"-"`
|
||||
ComponentID *json.Number `json:"component_id"`
|
||||
CreatedAt *string `json:"created_at"`
|
||||
UpdatedAt *string `json:"updated_at"`
|
||||
}
|
||||
Status int `json:"status"`
|
||||
Visible int `json"visible"`
|
||||
Notify bool `json:"notify"`
|
||||
|
||||
// GetIncidents - Get list of incidents
|
||||
func (monitor *CachetMonitor) GetIncidents() ([]Incident, error) {
|
||||
_, body, err := monitor.makeRequest("GET", "/incidents", nil)
|
||||
if err != nil {
|
||||
return []Incident{}, fmt.Errorf("Cannot get incidents: %v\n", err)
|
||||
}
|
||||
|
||||
var data struct {
|
||||
Incidents []Incident `json:"data"`
|
||||
}
|
||||
err = json.Unmarshal(body, &data)
|
||||
if err != nil {
|
||||
return []Incident{}, fmt.Errorf("Cannot parse incidents: %v\n", err)
|
||||
}
|
||||
|
||||
return data.Incidents, nil
|
||||
ComponentID int `json:"component_id"`
|
||||
ComponentStatus int `json:"component_status"`
|
||||
}
|
||||
|
||||
// Send - Create or Update incident
|
||||
func (monitor *CachetMonitor) SendIncident(incident *Incident) error {
|
||||
jsonBytes, _ := json.Marshal(map[string]interface{}{
|
||||
"name": incident.Name,
|
||||
"message": incident.Message,
|
||||
"status": incident.Status,
|
||||
"component_id": incident.ComponentID,
|
||||
"notify": true,
|
||||
})
|
||||
func (incident *Incident) Send(cfg *CachetMonitor) error {
|
||||
switch incident.Status {
|
||||
case 1, 2, 3:
|
||||
// partial outage
|
||||
incident.ComponentStatus = 3
|
||||
|
||||
componentStatus, err := incident.GetComponentStatus(cfg)
|
||||
if componentStatus == 3 {
|
||||
// major outage
|
||||
incident.ComponentStatus = 4
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cfg.Logger.Printf("cannot fetch component: %v", err)
|
||||
}
|
||||
case 4:
|
||||
// fixed
|
||||
incident.ComponentStatus = 1
|
||||
}
|
||||
|
||||
requestType := "POST"
|
||||
requestURL := "/incidents"
|
||||
if len(incident.ID) > 0 {
|
||||
if incident.ID > 0 {
|
||||
requestType = "PUT"
|
||||
requestURL += "/" + string(incident.ID)
|
||||
requestURL += "/" + strconv.Itoa(incident.ID)
|
||||
}
|
||||
|
||||
resp, body, err := monitor.makeRequest(requestType, requestURL, jsonBytes)
|
||||
jsonBytes, _ := json.Marshal(incident)
|
||||
|
||||
resp, body, err := cfg.makeRequest(requestType, requestURL, jsonBytes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var data struct {
|
||||
Incident Incident `json:"data"`
|
||||
Incident struct {
|
||||
ID int `json:"id"`
|
||||
} `json:"data"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &data); err != nil {
|
||||
return errors.New("Cannot parse incident body." + string(body))
|
||||
} else {
|
||||
return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body))
|
||||
}
|
||||
|
||||
incident.ID = data.Incident.ID
|
||||
incident.Component = data.Incident.Component
|
||||
if resp.StatusCode != 200 {
|
||||
return fmt.Errorf("Could not create/update incident!")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (incident *Incident) GetComponentStatus(cfg *CachetMonitor) (int, error) {
|
||||
resp, body, err := cfg.makeRequest("GET", "/components/"+strconv.Itoa(incident.ComponentID), nil)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return errors.New("Could not create/update incident!")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (monitor *CachetMonitor) fetchComponent(componentID string) (*Component, error) {
|
||||
_, body, err := monitor.makeRequest("GET", "/components/"+componentID, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return 0, fmt.Errorf("Invalid status code. Received %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var data struct {
|
||||
Component Component `json:"data"`
|
||||
Component struct {
|
||||
Status int `json:"status"`
|
||||
} `json:"data"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &data); err != nil {
|
||||
return nil, errors.New("Cannot parse component body. " + string(body))
|
||||
return 0, fmt.Errorf("Cannot parse component body: %v. Err = %v", string(body), err)
|
||||
}
|
||||
|
||||
return &data.Component, nil
|
||||
}
|
||||
|
||||
func (monitor *CachetMonitor) UpdateComponent(incident *Incident) error {
|
||||
if incident.ComponentID == nil || len(*incident.ComponentID) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if incident.Component == nil {
|
||||
// fetch component
|
||||
component, err := monitor.fetchComponent(string(*incident.ComponentID))
|
||||
if err != nil {
|
||||
return fmt.Errorf("Cannot fetch component for incident. %v\n", err)
|
||||
}
|
||||
|
||||
incident.Component = component
|
||||
}
|
||||
|
||||
status, _ := strconv.Atoi(string(incident.Status))
|
||||
switch status {
|
||||
case 1, 2, 3:
|
||||
if incident.Component.Status == "3" {
|
||||
incident.Component.Status = "4"
|
||||
} else {
|
||||
incident.Component.Status = "3"
|
||||
}
|
||||
case 4:
|
||||
incident.Component.Status = "1"
|
||||
}
|
||||
|
||||
jsonBytes, _ := json.Marshal(map[string]interface{}{
|
||||
"status": incident.Component.Status,
|
||||
})
|
||||
|
||||
resp, _, err := monitor.makeRequest("PUT", "/components/"+string(incident.Component.ID), jsonBytes)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
return fmt.Errorf("Could not update component: (resp code %d) %v", resp.StatusCode, err)
|
||||
}
|
||||
|
||||
return nil
|
||||
return data.Component.Status, nil
|
||||
}
|
||||
|
||||
// SetInvestigating sets status to Investigating
|
||||
func (incident *Incident) SetInvestigating() {
|
||||
incident.Status = "1"
|
||||
incident.HumanStatus = "Investigating"
|
||||
incident.Status = 1
|
||||
}
|
||||
|
||||
// SetIdentified sets status to Identified
|
||||
func (incident *Incident) SetIdentified() {
|
||||
incident.Status = "2"
|
||||
incident.HumanStatus = "Identified"
|
||||
incident.Status = 2
|
||||
}
|
||||
|
||||
// SetWatching sets status to Watching
|
||||
func (incident *Incident) SetWatching() {
|
||||
incident.Status = "3"
|
||||
incident.HumanStatus = "Watching"
|
||||
incident.Status = 3
|
||||
}
|
||||
|
||||
// SetFixed sets status to Fixed
|
||||
func (incident *Incident) SetFixed() {
|
||||
incident.Status = "4"
|
||||
incident.HumanStatus = "Fixed"
|
||||
incident.Status = 4
|
||||
}
|
||||
|
||||
25
metrics.go
25
metrics.go
@@ -1,25 +0,0 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// SendMetric sends lag metric point
|
||||
func (monitor *CachetMonitor) SendMetric(metricID int, delay int64) error {
|
||||
if metricID <= 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
jsonBytes, _ := json.Marshal(&map[string]interface{}{
|
||||
"value": delay,
|
||||
})
|
||||
|
||||
resp, _, err := monitor.makeRequest("POST", "/metrics/"+strconv.Itoa(metricID)+"/points", jsonBytes)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
return fmt.Errorf("Could not log data point!\n%v\n", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
263
monitor.go
263
monitor.go
@@ -2,104 +2,82 @@ package cachet
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
const timeout = time.Duration(time.Second)
|
||||
const HttpTimeout = time.Duration(time.Second)
|
||||
const DefaultInterval = 60
|
||||
const DefaultTimeFormat = "15:04:05 Jan 2 MST"
|
||||
|
||||
// Monitor data model
|
||||
type Monitor struct {
|
||||
Name string `json:"name"`
|
||||
URL string `json:"url"`
|
||||
MetricID int `json:"metric_id"`
|
||||
Threshold float32 `json:"threshold"`
|
||||
ComponentID *int `json:"component_id"`
|
||||
ExpectedStatusCode int `json:"expected_status_code"`
|
||||
StrictTLS *bool `json:"strict_tls"`
|
||||
Interval time.Duration `json:"interval"`
|
||||
Method string `json:"method"`
|
||||
StrictTLS bool `json:"strict_tls"`
|
||||
CheckInterval time.Duration `json:"interval"`
|
||||
|
||||
History []bool `json:"-"`
|
||||
LastFailReason *string `json:"-"`
|
||||
Incident *Incident `json:"-"`
|
||||
MetricID int `json:"metric_id"`
|
||||
ComponentID int `json:"component_id"`
|
||||
|
||||
// Threshold = percentage
|
||||
Threshold float32 `json:"threshold"`
|
||||
// Saturat
|
||||
ExpectedStatusCode int `json:"expected_status_code"`
|
||||
// compiled to Regexp
|
||||
ExpectedBody string `json:"expected_body"`
|
||||
bodyRegexp *regexp.Regexp
|
||||
|
||||
history []bool
|
||||
lastFailReason string
|
||||
incident *Incident
|
||||
config *CachetMonitor
|
||||
|
||||
// Closed when mon.Stop() is called
|
||||
stopC chan bool
|
||||
}
|
||||
|
||||
func (cfg *CachetMonitor) Run() {
|
||||
cfg.Logger.Printf("System: %s\nInterval: %d second(s)\nAPI: %s\n\n", cfg.SystemName, cfg.Interval, cfg.APIUrl)
|
||||
cfg.Logger.Printf("Starting %d monitors:\n", len(cfg.Monitors))
|
||||
for _, mon := range cfg.Monitors {
|
||||
cfg.Logger.Printf(" %s: GET %s & Expect HTTP %d\n", mon.Name, mon.URL, mon.ExpectedStatusCode)
|
||||
if mon.MetricID > 0 {
|
||||
cfg.Logger.Printf(" - Logs lag to metric id: %d\n", mon.MetricID)
|
||||
}
|
||||
if mon.ComponentID != nil && *mon.ComponentID > 0 {
|
||||
cfg.Logger.Printf(" - Updates component id: %d\n", *mon.ComponentID)
|
||||
}
|
||||
}
|
||||
|
||||
cfg.Logger.Println()
|
||||
wg := &sync.WaitGroup{}
|
||||
|
||||
for _, mon := range cfg.Monitors {
|
||||
func (mon *Monitor) Start(cfg *CachetMonitor, wg *sync.WaitGroup) {
|
||||
wg.Add(1)
|
||||
mon.config = cfg
|
||||
mon.stopC = make(chan bool)
|
||||
|
||||
go func(mon *Monitor) {
|
||||
if mon.Interval < 1 {
|
||||
mon.Interval = time.Duration(cfg.Interval)
|
||||
mon.config.Logger.Printf(" Starting %s: %d seconds check interval\n - %v %s", mon.Name, mon.CheckInterval, mon.Method, mon.URL)
|
||||
|
||||
// print features
|
||||
if mon.ExpectedStatusCode > 0 {
|
||||
mon.config.Logger.Printf(" - Expect HTTP %d", mon.ExpectedStatusCode)
|
||||
}
|
||||
if len(mon.ExpectedBody) > 0 {
|
||||
mon.config.Logger.Printf(" - Expect Body to match \"%v\"", mon.ExpectedBody)
|
||||
}
|
||||
if mon.MetricID > 0 {
|
||||
mon.config.Logger.Printf(" - Log lag to metric id %d\n", mon.MetricID)
|
||||
}
|
||||
if mon.ComponentID > 0 {
|
||||
mon.config.Logger.Printf(" - Update component id %d\n\n", mon.ComponentID)
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(mon.Interval * time.Second)
|
||||
mon.Tick()
|
||||
|
||||
ticker := time.NewTicker(mon.CheckInterval * time.Second)
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
mon.Run()
|
||||
case <-mon.StopC():
|
||||
mon.Tick()
|
||||
case <-mon.stopC:
|
||||
wg.Done()
|
||||
return
|
||||
}
|
||||
}
|
||||
}(mon)
|
||||
}
|
||||
|
||||
signals := make(chan os.Signal, 1)
|
||||
signal.Notify(signals, os.Interrupt, os.Kill)
|
||||
<-signals
|
||||
|
||||
log.Println("Waiting monitors to end current operation")
|
||||
for _, mon := range cfg.Monitors {
|
||||
mon.Stop()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// Run loop
|
||||
func (monitor *Monitor) Run() {
|
||||
reqStart := getMs()
|
||||
isUp := monitor.doRequest()
|
||||
lag := getMs() - reqStart
|
||||
|
||||
if len(monitor.History) >= 10 {
|
||||
monitor.History = monitor.History[len(monitor.History)-9:]
|
||||
}
|
||||
monitor.History = append(monitor.History, isUp)
|
||||
monitor.AnalyseData()
|
||||
|
||||
if isUp == true && monitor.MetricID > 0 {
|
||||
monitor.config.SendMetric(monitor.MetricID, lag)
|
||||
}
|
||||
}
|
||||
|
||||
func (monitor *Monitor) Stop() {
|
||||
@@ -110,10 +88,6 @@ func (monitor *Monitor) Stop() {
|
||||
close(monitor.stopC)
|
||||
}
|
||||
|
||||
func (monitor *Monitor) StopC() <-chan bool {
|
||||
return monitor.stopC
|
||||
}
|
||||
|
||||
func (monitor *Monitor) Stopped() bool {
|
||||
select {
|
||||
case <-monitor.stopC:
|
||||
@@ -123,11 +97,30 @@ func (monitor *Monitor) Stopped() bool {
|
||||
}
|
||||
}
|
||||
|
||||
func (monitor *Monitor) Tick() {
|
||||
reqStart := getMs()
|
||||
isUp := monitor.doRequest()
|
||||
lag := getMs() - reqStart
|
||||
|
||||
if len(monitor.history) == 9 {
|
||||
monitor.config.Logger.Printf("%v is now saturated\n", monitor.Name)
|
||||
}
|
||||
if len(monitor.history) >= 10 {
|
||||
monitor.history = monitor.history[len(monitor.history)-9:]
|
||||
}
|
||||
monitor.history = append(monitor.history, isUp)
|
||||
monitor.AnalyseData()
|
||||
|
||||
if isUp == true && monitor.MetricID > 0 {
|
||||
monitor.config.SendMetric(monitor.MetricID, lag)
|
||||
}
|
||||
}
|
||||
|
||||
func (monitor *Monitor) doRequest() bool {
|
||||
client := &http.Client{
|
||||
Timeout: timeout,
|
||||
Timeout: HttpTimeout,
|
||||
}
|
||||
if monitor.StrictTLS != nil && *monitor.StrictTLS == false {
|
||||
if monitor.StrictTLS == false {
|
||||
client.Transport = &http.Transport{
|
||||
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
||||
}
|
||||
@@ -135,19 +128,36 @@ func (monitor *Monitor) doRequest() bool {
|
||||
|
||||
resp, err := client.Get(monitor.URL)
|
||||
if err != nil {
|
||||
errString := err.Error()
|
||||
monitor.LastFailReason = &errString
|
||||
monitor.lastFailReason = err.Error()
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != monitor.ExpectedStatusCode {
|
||||
failReason := "Unexpected response code: " + strconv.Itoa(resp.StatusCode) + ". Expected " + strconv.Itoa(monitor.ExpectedStatusCode)
|
||||
monitor.LastFailReason = &failReason
|
||||
if monitor.ExpectedStatusCode > 0 && resp.StatusCode != monitor.ExpectedStatusCode {
|
||||
monitor.lastFailReason = "Unexpected response code: " + strconv.Itoa(resp.StatusCode) + ". Expected " + strconv.Itoa(monitor.ExpectedStatusCode)
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
if monitor.bodyRegexp != nil {
|
||||
// check body
|
||||
responseBody, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
monitor.lastFailReason = err.Error()
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
match := monitor.bodyRegexp.Match(responseBody)
|
||||
if !match {
|
||||
monitor.lastFailReason = "Unexpected body: " + string(responseBody) + ". Expected to match " + monitor.ExpectedBody
|
||||
}
|
||||
|
||||
return match
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -155,60 +165,89 @@ func (monitor *Monitor) doRequest() bool {
|
||||
func (monitor *Monitor) AnalyseData() {
|
||||
// look at the past few incidents
|
||||
numDown := 0
|
||||
for _, wasUp := range monitor.History {
|
||||
for _, wasUp := range monitor.history {
|
||||
if wasUp == false {
|
||||
numDown++
|
||||
}
|
||||
}
|
||||
|
||||
t := (float32(numDown) / float32(len(monitor.History))) * 100
|
||||
monitor.config.Logger.Printf("%s %.2f%% Down at %v. Threshold: %.2f%%\n", monitor.URL, t, time.Now().UnixNano()/int64(time.Second), monitor.Threshold)
|
||||
t := (float32(numDown) / float32(len(monitor.history))) * 100
|
||||
monitor.config.Logger.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().UnixNano()/int64(time.Second))
|
||||
|
||||
if len(monitor.History) != 10 {
|
||||
// not enough data
|
||||
if len(monitor.history) != 10 {
|
||||
// not saturated
|
||||
return
|
||||
}
|
||||
|
||||
if t > monitor.Threshold && monitor.Incident == nil {
|
||||
// is down, create an incident
|
||||
monitor.config.Logger.Println("Creating incident...")
|
||||
|
||||
component_id := json.Number(strconv.Itoa(*monitor.ComponentID))
|
||||
monitor.Incident = &Incident{
|
||||
if t > monitor.Threshold && monitor.incident == nil {
|
||||
monitor.incident = &Incident{
|
||||
Name: monitor.Name + " - " + monitor.config.SystemName,
|
||||
Message: monitor.Name + " check failed",
|
||||
ComponentID: &component_id,
|
||||
ComponentID: monitor.ComponentID,
|
||||
Message: monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat),
|
||||
Notify: true,
|
||||
}
|
||||
|
||||
if monitor.LastFailReason != nil {
|
||||
monitor.Incident.Message += "\n\n - " + *monitor.LastFailReason
|
||||
if len(monitor.lastFailReason) > 0 {
|
||||
monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`"
|
||||
}
|
||||
|
||||
// is down, create an incident
|
||||
monitor.config.Logger.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason)
|
||||
// set investigating status
|
||||
monitor.Incident.SetInvestigating()
|
||||
|
||||
monitor.incident.SetInvestigating()
|
||||
// create/update incident
|
||||
monitor.config.SendIncident(monitor.Incident)
|
||||
monitor.config.UpdateComponent(monitor.Incident)
|
||||
} else if t < monitor.Threshold && monitor.Incident != nil {
|
||||
if err := monitor.incident.Send(monitor.config); err != nil {
|
||||
monitor.config.Logger.Printf("Error sending incident: %v\n", err)
|
||||
}
|
||||
} else if t < monitor.Threshold && monitor.incident != nil {
|
||||
// was down, created an incident, it's now OK, make it resolved.
|
||||
monitor.config.Logger.Println("Updating incident to resolved...")
|
||||
monitor.config.Logger.Printf("%v resolved downtime incident", monitor.Name)
|
||||
|
||||
component_id := json.Number(strconv.Itoa(*monitor.ComponentID))
|
||||
monitor.Incident = &Incident{
|
||||
Name: monitor.Incident.Name,
|
||||
Message: monitor.Name + " check succeeded",
|
||||
ComponentID: &component_id,
|
||||
}
|
||||
// resolve incident
|
||||
monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message
|
||||
monitor.incident.SetFixed()
|
||||
monitor.incident.Send(monitor.config)
|
||||
|
||||
monitor.Incident.SetFixed()
|
||||
monitor.config.SendIncident(monitor.Incident)
|
||||
monitor.config.UpdateComponent(monitor.Incident)
|
||||
|
||||
monitor.Incident = nil
|
||||
monitor.lastFailReason = ""
|
||||
monitor.incident = nil
|
||||
}
|
||||
}
|
||||
|
||||
func getMs() int64 {
|
||||
return time.Now().UnixNano() / int64(time.Millisecond)
|
||||
func (monitor *Monitor) ValidateConfiguration() error {
|
||||
if len(monitor.ExpectedBody) > 0 {
|
||||
exp, err := regexp.Compile(monitor.ExpectedBody)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
monitor.bodyRegexp = exp
|
||||
}
|
||||
|
||||
if len(monitor.ExpectedBody) == 0 && monitor.ExpectedStatusCode == 0 {
|
||||
return errors.New("Nothing to check, both 'expected_body' and 'expected_status_code' fields empty")
|
||||
}
|
||||
|
||||
if monitor.CheckInterval < 1 {
|
||||
monitor.CheckInterval = DefaultInterval
|
||||
}
|
||||
|
||||
monitor.Method = strings.ToUpper(monitor.Method)
|
||||
switch monitor.Method {
|
||||
case "GET", "POST", "DELETE", "OPTIONS", "HEAD":
|
||||
break
|
||||
case "":
|
||||
monitor.Method = "GET"
|
||||
default:
|
||||
return fmt.Errorf("Unsupported check method: %v", monitor.Method)
|
||||
}
|
||||
|
||||
if monitor.ComponentID == 0 && monitor.MetricID == 0 {
|
||||
return errors.New("component_id & metric_id are unset")
|
||||
}
|
||||
|
||||
if monitor.Threshold <= 0 {
|
||||
monitor.Threshold = 100
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
54
readme.md
54
readme.md
@@ -4,9 +4,9 @@ Features
|
||||
--------
|
||||
|
||||
- [x] Creates & Resolves Incidents
|
||||
- [x] Posts monitor lag (interval configurable)
|
||||
- [x] Posts monitor lag to cachet graphs
|
||||
- [x] Updates Component to Partial Outage
|
||||
- [x] Updates Component to Major Outage if in Partial Outage
|
||||
- [x] Updates Component to Major Outage if already in Partial Outage (works well with distributed monitoring)
|
||||
- [x] Can be run on multiple servers and geo regions
|
||||
|
||||
Configuration
|
||||
@@ -14,36 +14,26 @@ Configuration
|
||||
|
||||
```
|
||||
{
|
||||
"api_url": "https://demo.cachethq.io/api/v1",
|
||||
"api_token": "<API TOKEN>",
|
||||
"interval": 60,
|
||||
"monitors": [
|
||||
{
|
||||
"name": "Name of your monitor",
|
||||
"url": "Ping URL",
|
||||
"metric_id": <metric id from cachet>,
|
||||
"component_id": <component id from cachet>,
|
||||
"threshold": 80,
|
||||
"expected_status_code": 200,
|
||||
"strict_tls": true,
|
||||
"interval": 5
|
||||
}
|
||||
],
|
||||
"insecure_api": false
|
||||
"api_url": "https://<cachet domain>/api/v1",
|
||||
"api_token": "<cachet api token>",
|
||||
"insecure_api": false, // optional, false default, set if your certificate is self-signed/untrusted
|
||||
"monitors": [{
|
||||
"name": "Name of your monitor", // required, friendly name for your monitor
|
||||
"url": "Ping URL", // required, url to probe
|
||||
"method": "get", // optional, http method (defaults GET)
|
||||
"strict_tls": true, // verify the monitored URL's TLS certificate; set to false to accept a self-signed/untrusted certificate
|
||||
"interval": 10, // seconds between checks
|
||||
"metric_id": <metric id>, // post lag to cachet metric (graph)
|
||||
"component_id": <component id>, // post incidents to this component
|
||||
"threshold": 80, // If % of downtime is over this threshold, open an incident
|
||||
"expected_status_code": 200, // optional, expected status code (either status code or body must be supplied)
|
||||
"expected_body": "P.*NG" // optional, regular expression
|
||||
}],
|
||||
"system_name": "", // optional, system name to identify bot
|
||||
"log_path": "" // optional, defaults to stdout
|
||||
}
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
|
||||
- `metric_id` is optional
|
||||
- `insecure_api` if true it will ignore HTTPS certificate errors (eg if self-signed)
|
||||
- `strict_tls` if false (true is default) it will ignore HTTPS certificate errors (eg if monitor uses self-signed certificate)
|
||||
- `component_id` is optional
|
||||
- `threshold` is a percentage
|
||||
- `expected_status_code` is a http response code
|
||||
- `interval` is the duration in seconds between two checks.
|
||||
- GET request will be performed on the `url`
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
@@ -64,9 +54,9 @@ Environment variables
|
||||
---------------------
|
||||
|
||||
| Name | Example Value | Description |
|
||||
| ------------ | --------------------------- | --------------------------- |
|
||||
| CACHET_API | http://demo.cachethq.io/api | URL endpoint for cachet api |
|
||||
| CACHET_TOKEN | randomvalue | API Authentication token |
|
||||
| ------------ | ------------------------------ | --------------------------- |
|
||||
| CACHET_API | http://demo.cachethq.io/api/v1 | URL endpoint for cachet api |
|
||||
| CACHET_TOKEN | APIToken123 | API Authentication token |
|
||||
| CACHET_DEV | 1 | Strips logging |
|
||||
|
||||
Vision and goals
|
||||
|
||||
Reference in New Issue
Block a user