More config options, markdown in incidents

- Specify method to ping with
- Body regex match
- Markdown in incident messages
- Update existing incidents
This commit is contained in:
Matej Kramny
2016-05-19 18:40:01 +01:00
parent c729fbdf41
commit 0ceccccd45
7 changed files with 305 additions and 312 deletions

View File

@@ -10,6 +10,8 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"os/signal"
"sync"
cachet "github.com/castawaylabs/cachet-monitor" cachet "github.com/castawaylabs/cachet-monitor"
) )
@@ -47,7 +49,23 @@ func main() {
panic(err) panic(err)
} }
cfg.Run() cfg.Logger.Printf("System: %s\nAPI: %s\nMonitors: %d\n\n", cfg.SystemName, cfg.APIUrl, len(cfg.Monitors))
wg := &sync.WaitGroup{}
for _, mon := range cfg.Monitors {
go mon.Start(cfg, wg)
}
signals := make(chan os.Signal, 1)
signal.Notify(signals, os.Interrupt, os.Kill)
<-signals
cfg.Logger.Println("Abort: Waiting monitors to finish")
for _, mon := range cfg.Monitors {
mon.Stop()
}
wg.Wait()
} }
func getLogger(logPath string) *log.Logger { func getLogger(logPath string) *log.Logger {

View File

@@ -12,7 +12,6 @@ type CachetMonitor struct {
APIUrl string `json:"api_url"` APIUrl string `json:"api_url"`
APIToken string `json:"api_token"` APIToken string `json:"api_token"`
Interval int64 `json:"interval"`
SystemName string `json:"system_name"` SystemName string `json:"system_name"`
LogPath string `json:"log_path"` LogPath string `json:"log_path"`
InsecureAPI bool `json:"insecure_api"` InsecureAPI bool `json:"insecure_api"`
@@ -20,28 +19,30 @@ type CachetMonitor struct {
Monitors []*Monitor `json:"monitors"` Monitors []*Monitor `json:"monitors"`
} }
func (mon *CachetMonitor) ValidateConfiguration() error { func (cfg *CachetMonitor) ValidateConfiguration() error {
if mon.Logger == nil { if cfg.Logger == nil {
mon.Logger = log.New(os.Stdout, "", log.Llongfile|log.Ldate|log.Ltime) cfg.Logger = log.New(os.Stdout, "", log.Llongfile|log.Ldate|log.Ltime)
} }
if len(mon.SystemName) == 0 { if len(cfg.SystemName) == 0 {
// get hostname // get hostname
mon.SystemName = getHostname() cfg.SystemName = getHostname()
} }
if mon.Interval <= 0 { if len(cfg.APIToken) == 0 || len(cfg.APIUrl) == 0 {
mon.Interval = 60
}
if len(mon.APIToken) == 0 || len(mon.APIUrl) == 0 {
return errors.New("API URL or API Token not set. cachet-monitor won't be able to report incidents.\n\nPlease set:\n CACHET_API and CACHET_TOKEN environment variable to override settings.\n\nGet help at https://github.com/castawaylabs/cachet-monitor\n") return errors.New("API URL or API Token not set. cachet-monitor won't be able to report incidents.\n\nPlease set:\n CACHET_API and CACHET_TOKEN environment variable to override settings.\n\nGet help at https://github.com/castawaylabs/cachet-monitor\n")
} }
if len(mon.Monitors) == 0 { if len(cfg.Monitors) == 0 {
return errors.New("No monitors defined!\nSee sample configuration: https://github.com/castawaylabs/cachet-monitor/blob/master/example.config.json\n") return errors.New("No monitors defined!\nSee sample configuration: https://github.com/castawaylabs/cachet-monitor/blob/master/example.config.json\n")
} }
for _, monitor := range cfg.Monitors {
if err := monitor.ValidateConfiguration(); err != nil {
return err
}
}
return nil return nil
} }

37
http.go
View File

@@ -4,22 +4,13 @@ import (
"bytes" "bytes"
"crypto/tls" "crypto/tls"
"encoding/json" "encoding/json"
"fmt"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
"strconv"
"time"
) )
// Component Cachet model
type Component struct {
ID json.Number `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Status json.Number `json:"status_id"`
HumanStatus string `json:"-"`
IncidentCount int `json:"-"`
CreatedAt *string `json:"created_at"`
UpdatedAt *string `json:"updated_at"`
}
func (monitor *CachetMonitor) makeRequest(requestType string, url string, reqBody []byte) (*http.Response, []byte, error) { func (monitor *CachetMonitor) makeRequest(requestType string, url string, reqBody []byte) (*http.Response, []byte, error) {
req, err := http.NewRequest(requestType, monitor.APIUrl+url, bytes.NewBuffer(reqBody)) req, err := http.NewRequest(requestType, monitor.APIUrl+url, bytes.NewBuffer(reqBody))
@@ -43,3 +34,25 @@ func (monitor *CachetMonitor) makeRequest(requestType string, url string, reqBod
return res, body, nil return res, body, nil
} }
// SendMetric posts a single lag data point to the Cachet metric identified
// by metricID.
//
// metricID values <= 0 mean "no metric configured": nothing is sent and nil
// is returned. delay is submitted as the point's "value" field.
//
// An error is returned when the payload cannot be encoded, when the request
// itself fails, or when the API answers with a status code other than 200.
func (monitor *CachetMonitor) SendMetric(metricID int, delay int64) error {
	if metricID <= 0 {
		return nil
	}

	jsonBytes, err := json.Marshal(map[string]interface{}{
		"value": delay,
	})
	if err != nil {
		return fmt.Errorf("Could not log data point!\n%v\n", err)
	}

	resp, _, err := monitor.makeRequest("POST", "/metrics/"+strconv.Itoa(metricID)+"/points", jsonBytes)
	if err != nil {
		return fmt.Errorf("Could not log data point!\n%v\n", err)
	}

	// Report the status code explicitly: err is nil on this path, so
	// formatting it would only print "<nil>".
	if resp.StatusCode != 200 {
		return fmt.Errorf("Could not log data point!\nunexpected status code %d\n", resp.StatusCode)
	}

	return nil
}
// getMs returns the current wall-clock time as milliseconds since the Unix
// epoch.
func getMs() int64 {
	nowNanos := time.Now().UnixNano()
	nanosPerMilli := int64(time.Millisecond)
	return nowNanos / nanosPerMilli
}

View File

@@ -2,156 +2,113 @@ package cachet
import ( import (
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"strconv" "strconv"
) )
// Incident Cachet data model // Incident Cachet data model
type Incident struct { type Incident struct {
ID json.Number `json:"id"` ID int `json:"id"`
Name string `json:"name"` Name string `json:"name"`
Message string `json:"message"` Message string `json:"message"`
Status json.Number `json:"status"` // 4? Status int `json:"status"`
HumanStatus string `json:"human_status"` Visible int `json"visible"`
Component *Component `json:"-"` Notify bool `json:"notify"`
ComponentID *json.Number `json:"component_id"`
CreatedAt *string `json:"created_at"`
UpdatedAt *string `json:"updated_at"`
}
// GetIncidents - Get list of incidents ComponentID int `json:"component_id"`
func (monitor *CachetMonitor) GetIncidents() ([]Incident, error) { ComponentStatus int `json:"component_status"`
_, body, err := monitor.makeRequest("GET", "/incidents", nil)
if err != nil {
return []Incident{}, fmt.Errorf("Cannot get incidents: %v\n", err)
}
var data struct {
Incidents []Incident `json:"data"`
}
err = json.Unmarshal(body, &data)
if err != nil {
return []Incident{}, fmt.Errorf("Cannot parse incidents: %v\n", err)
}
return data.Incidents, nil
} }
// Send - Create or Update incident // Send - Create or Update incident
func (monitor *CachetMonitor) SendIncident(incident *Incident) error { func (incident *Incident) Send(cfg *CachetMonitor) error {
jsonBytes, _ := json.Marshal(map[string]interface{}{ switch incident.Status {
"name": incident.Name, case 1, 2, 3:
"message": incident.Message, // partial outage
"status": incident.Status, incident.ComponentStatus = 3
"component_id": incident.ComponentID,
"notify": true, componentStatus, err := incident.GetComponentStatus(cfg)
}) if componentStatus == 3 {
// major outage
incident.ComponentStatus = 4
}
if err != nil {
cfg.Logger.Printf("cannot fetch component: %v", err)
}
case 4:
// fixed
incident.ComponentStatus = 1
}
requestType := "POST" requestType := "POST"
requestURL := "/incidents" requestURL := "/incidents"
if len(incident.ID) > 0 { if incident.ID > 0 {
requestType = "PUT" requestType = "PUT"
requestURL += "/" + string(incident.ID) requestURL += "/" + strconv.Itoa(incident.ID)
} }
resp, body, err := monitor.makeRequest(requestType, requestURL, jsonBytes) jsonBytes, _ := json.Marshal(incident)
resp, body, err := cfg.makeRequest(requestType, requestURL, jsonBytes)
if err != nil { if err != nil {
return err return err
} }
var data struct { var data struct {
Incident Incident `json:"data"` Incident struct {
ID int `json:"id"`
} `json:"data"`
} }
if err := json.Unmarshal(body, &data); err != nil { if err := json.Unmarshal(body, &data); err != nil {
return errors.New("Cannot parse incident body." + string(body)) return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body))
} else { }
incident.ID = data.Incident.ID
incident.Component = data.Incident.Component incident.ID = data.Incident.ID
if resp.StatusCode != 200 {
return fmt.Errorf("Could not create/update incident!")
}
return nil
}
func (incident *Incident) GetComponentStatus(cfg *CachetMonitor) (int, error) {
resp, body, err := cfg.makeRequest("GET", "/components/"+strconv.Itoa(incident.ComponentID), nil)
if err != nil {
return 0, err
} }
if resp.StatusCode != 200 { if resp.StatusCode != 200 {
return errors.New("Could not create/update incident!") return 0, fmt.Errorf("Invalid status code. Received %d", resp.StatusCode)
}
return nil
}
func (monitor *CachetMonitor) fetchComponent(componentID string) (*Component, error) {
_, body, err := monitor.makeRequest("GET", "/components/"+componentID, nil)
if err != nil {
return nil, err
} }
var data struct { var data struct {
Component Component `json:"data"` Component struct {
Status int `json:"status"`
} `json:"data"`
} }
if err := json.Unmarshal(body, &data); err != nil { if err := json.Unmarshal(body, &data); err != nil {
return nil, errors.New("Cannot parse component body. " + string(body)) return 0, fmt.Errorf("Cannot parse component body: %v. Err = %v", string(body), err)
} }
return &data.Component, nil return data.Component.Status, nil
}
func (monitor *CachetMonitor) UpdateComponent(incident *Incident) error {
if incident.ComponentID == nil || len(*incident.ComponentID) == 0 {
return nil
}
if incident.Component == nil {
// fetch component
component, err := monitor.fetchComponent(string(*incident.ComponentID))
if err != nil {
return fmt.Errorf("Cannot fetch component for incident. %v\n", err)
}
incident.Component = component
}
status, _ := strconv.Atoi(string(incident.Status))
switch status {
case 1, 2, 3:
if incident.Component.Status == "3" {
incident.Component.Status = "4"
} else {
incident.Component.Status = "3"
}
case 4:
incident.Component.Status = "1"
}
jsonBytes, _ := json.Marshal(map[string]interface{}{
"status": incident.Component.Status,
})
resp, _, err := monitor.makeRequest("PUT", "/components/"+string(incident.Component.ID), jsonBytes)
if err != nil || resp.StatusCode != 200 {
return fmt.Errorf("Could not update component: (resp code %d) %v", resp.StatusCode, err)
}
return nil
} }
// SetInvestigating sets status to Investigating // SetInvestigating sets status to Investigating
func (incident *Incident) SetInvestigating() { func (incident *Incident) SetInvestigating() {
incident.Status = "1" incident.Status = 1
incident.HumanStatus = "Investigating"
} }
// SetIdentified sets status to Identified // SetIdentified sets status to Identified
func (incident *Incident) SetIdentified() { func (incident *Incident) SetIdentified() {
incident.Status = "2" incident.Status = 2
incident.HumanStatus = "Identified"
} }
// SetWatching sets status to Watching // SetWatching sets status to Watching
func (incident *Incident) SetWatching() { func (incident *Incident) SetWatching() {
incident.Status = "3" incident.Status = 3
incident.HumanStatus = "Watching"
} }
// SetFixed sets status to Fixed // SetFixed sets status to Fixed
func (incident *Incident) SetFixed() { func (incident *Incident) SetFixed() {
incident.Status = "4" incident.Status = 4
incident.HumanStatus = "Fixed"
} }

View File

@@ -1,25 +0,0 @@
package cachet
import (
"encoding/json"
"fmt"
"strconv"
)
// SendMetric posts a single lag data point to the Cachet metric identified
// by metricID.
//
// metricID values <= 0 mean "no metric configured": nothing is sent and nil
// is returned. delay is submitted as the point's "value" field.
//
// An error is returned when the payload cannot be encoded, when the request
// itself fails, or when the API answers with a status code other than 200.
func (monitor *CachetMonitor) SendMetric(metricID int, delay int64) error {
	if metricID <= 0 {
		return nil
	}

	jsonBytes, err := json.Marshal(map[string]interface{}{
		"value": delay,
	})
	if err != nil {
		return fmt.Errorf("Could not log data point!\n%v\n", err)
	}

	resp, _, err := monitor.makeRequest("POST", "/metrics/"+strconv.Itoa(metricID)+"/points", jsonBytes)
	if err != nil {
		return fmt.Errorf("Could not log data point!\n%v\n", err)
	}

	// Report the status code explicitly: err is nil on this path, so
	// formatting it would only print "<nil>".
	if resp.StatusCode != 200 {
		return fmt.Errorf("Could not log data point!\nunexpected status code %d\n", resp.StatusCode)
	}

	return nil
}

View File

@@ -2,103 +2,81 @@ package cachet
import ( import (
"crypto/tls" "crypto/tls"
"encoding/json" "errors"
"log" "fmt"
"io/ioutil"
"net/http" "net/http"
"os" "regexp"
"os/signal"
"strconv" "strconv"
"strings"
"sync" "sync"
"time" "time"
) )
const timeout = time.Duration(time.Second) const HttpTimeout = time.Duration(time.Second)
const DefaultInterval = 60
const DefaultTimeFormat = "15:04:05 Jan 2 MST"
// Monitor data model // Monitor data model
type Monitor struct { type Monitor struct {
Name string `json:"name"` Name string `json:"name"`
URL string `json:"url"` URL string `json:"url"`
MetricID int `json:"metric_id"` Method string `json:"method"`
Threshold float32 `json:"threshold"` StrictTLS bool `json:"strict_tls"`
ComponentID *int `json:"component_id"` CheckInterval time.Duration `json:"interval"`
ExpectedStatusCode int `json:"expected_status_code"`
StrictTLS *bool `json:"strict_tls"`
Interval time.Duration `json:"interval"`
History []bool `json:"-"` MetricID int `json:"metric_id"`
LastFailReason *string `json:"-"` ComponentID int `json:"component_id"`
Incident *Incident `json:"-"`
// Threshold = percentage
Threshold float32 `json:"threshold"`
// Saturat
ExpectedStatusCode int `json:"expected_status_code"`
// compiled to Regexp
ExpectedBody string `json:"expected_body"`
bodyRegexp *regexp.Regexp
history []bool
lastFailReason string
incident *Incident
config *CachetMonitor config *CachetMonitor
// Closed when mon.Stop() is called // Closed when mon.Stop() is called
stopC chan bool stopC chan bool
} }
func (cfg *CachetMonitor) Run() { func (mon *Monitor) Start(cfg *CachetMonitor, wg *sync.WaitGroup) {
cfg.Logger.Printf("System: %s\nInterval: %d second(s)\nAPI: %s\n\n", cfg.SystemName, cfg.Interval, cfg.APIUrl) wg.Add(1)
cfg.Logger.Printf("Starting %d monitors:\n", len(cfg.Monitors)) mon.config = cfg
for _, mon := range cfg.Monitors { mon.stopC = make(chan bool)
cfg.Logger.Printf(" %s: GET %s & Expect HTTP %d\n", mon.Name, mon.URL, mon.ExpectedStatusCode)
if mon.MetricID > 0 { mon.config.Logger.Printf(" Starting %s: %d seconds check interval\n - %v %s", mon.Name, mon.CheckInterval, mon.Method, mon.URL)
cfg.Logger.Printf(" - Logs lag to metric id: %d\n", mon.MetricID)
// print features
if mon.ExpectedStatusCode > 0 {
mon.config.Logger.Printf(" - Expect HTTP %d", mon.ExpectedStatusCode)
}
if len(mon.ExpectedBody) > 0 {
mon.config.Logger.Printf(" - Expect Body to match \"%v\"", mon.ExpectedBody)
}
if mon.MetricID > 0 {
mon.config.Logger.Printf(" - Log lag to metric id %d\n", mon.MetricID)
}
if mon.ComponentID > 0 {
mon.config.Logger.Printf(" - Update component id %d\n\n", mon.ComponentID)
}
mon.Tick()
ticker := time.NewTicker(mon.CheckInterval * time.Second)
for {
select {
case <-ticker.C:
mon.Tick()
case <-mon.stopC:
wg.Done()
return
} }
if mon.ComponentID != nil && *mon.ComponentID > 0 {
cfg.Logger.Printf(" - Updates component id: %d\n", *mon.ComponentID)
}
}
cfg.Logger.Println()
wg := &sync.WaitGroup{}
for _, mon := range cfg.Monitors {
wg.Add(1)
mon.config = cfg
mon.stopC = make(chan bool)
go func(mon *Monitor) {
if mon.Interval < 1 {
mon.Interval = time.Duration(cfg.Interval)
}
ticker := time.NewTicker(mon.Interval * time.Second)
for {
select {
case <-ticker.C:
mon.Run()
case <-mon.StopC():
wg.Done()
return
}
}
}(mon)
}
signals := make(chan os.Signal, 1)
signal.Notify(signals, os.Interrupt, os.Kill)
<-signals
log.Println("Waiting monitors to end current operation")
for _, mon := range cfg.Monitors {
mon.Stop()
}
wg.Wait()
}
// Run loop
func (monitor *Monitor) Run() {
reqStart := getMs()
isUp := monitor.doRequest()
lag := getMs() - reqStart
if len(monitor.History) >= 10 {
monitor.History = monitor.History[len(monitor.History)-9:]
}
monitor.History = append(monitor.History, isUp)
monitor.AnalyseData()
if isUp == true && monitor.MetricID > 0 {
monitor.config.SendMetric(monitor.MetricID, lag)
} }
} }
@@ -110,10 +88,6 @@ func (monitor *Monitor) Stop() {
close(monitor.stopC) close(monitor.stopC)
} }
func (monitor *Monitor) StopC() <-chan bool {
return monitor.stopC
}
func (monitor *Monitor) Stopped() bool { func (monitor *Monitor) Stopped() bool {
select { select {
case <-monitor.stopC: case <-monitor.stopC:
@@ -123,11 +97,30 @@ func (monitor *Monitor) Stopped() bool {
} }
} }
// Tick runs one check cycle for the monitor: it performs the request,
// records the result in the rolling history, analyses that history and,
// when the check succeeded and a metric is configured, reports the measured
// lag (in milliseconds) to Cachet.
func (monitor *Monitor) Tick() {
	reqStart := getMs()
	isUp := monitor.doRequest()
	lag := getMs() - reqStart

	// The append below brings the history to 10 entries, which is the size
	// AnalyseData requires before it acts on the data.
	if len(monitor.history) == 9 {
		monitor.config.Logger.Printf("%v is now saturated\n", monitor.Name)
	}

	// Keep only the 9 most recent results so the history never exceeds 10
	// entries after the append.
	if len(monitor.history) >= 10 {
		monitor.history = monitor.history[len(monitor.history)-9:]
	}
	monitor.history = append(monitor.history, isUp)
	monitor.AnalyseData()

	if isUp && monitor.MetricID > 0 {
		// A failed metric upload must not stop monitoring, but it should
		// not be dropped silently either.
		if err := monitor.config.SendMetric(monitor.MetricID, lag); err != nil {
			monitor.config.Logger.Printf("%v could not send metric: %v", monitor.Name, err)
		}
	}
}
func (monitor *Monitor) doRequest() bool { func (monitor *Monitor) doRequest() bool {
client := &http.Client{ client := &http.Client{
Timeout: timeout, Timeout: HttpTimeout,
} }
if monitor.StrictTLS != nil && *monitor.StrictTLS == false { if monitor.StrictTLS == false {
client.Transport = &http.Transport{ client.Transport = &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
} }
@@ -135,19 +128,36 @@ func (monitor *Monitor) doRequest() bool {
resp, err := client.Get(monitor.URL) resp, err := client.Get(monitor.URL)
if err != nil { if err != nil {
errString := err.Error() monitor.lastFailReason = err.Error()
monitor.LastFailReason = &errString
return false return false
} }
defer resp.Body.Close() defer resp.Body.Close()
if resp.StatusCode != monitor.ExpectedStatusCode { if monitor.ExpectedStatusCode > 0 && resp.StatusCode != monitor.ExpectedStatusCode {
failReason := "Unexpected response code: " + strconv.Itoa(resp.StatusCode) + ". Expected " + strconv.Itoa(monitor.ExpectedStatusCode) monitor.lastFailReason = "Unexpected response code: " + strconv.Itoa(resp.StatusCode) + ". Expected " + strconv.Itoa(monitor.ExpectedStatusCode)
monitor.LastFailReason = &failReason
return false return false
} }
if monitor.bodyRegexp != nil {
// check body
responseBody, err := ioutil.ReadAll(resp.Body)
if err != nil {
monitor.lastFailReason = err.Error()
return false
}
match := monitor.bodyRegexp.Match(responseBody)
if !match {
monitor.lastFailReason = "Unexpected body: " + string(responseBody) + ". Expected to match " + monitor.ExpectedBody
}
return match
}
return true return true
} }
@@ -155,60 +165,89 @@ func (monitor *Monitor) doRequest() bool {
func (monitor *Monitor) AnalyseData() { func (monitor *Monitor) AnalyseData() {
// look at the past few incidents // look at the past few incidents
numDown := 0 numDown := 0
for _, wasUp := range monitor.History { for _, wasUp := range monitor.history {
if wasUp == false { if wasUp == false {
numDown++ numDown++
} }
} }
t := (float32(numDown) / float32(len(monitor.History))) * 100 t := (float32(numDown) / float32(len(monitor.history))) * 100
monitor.config.Logger.Printf("%s %.2f%% Down at %v. Threshold: %.2f%%\n", monitor.URL, t, time.Now().UnixNano()/int64(time.Second), monitor.Threshold) monitor.config.Logger.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().UnixNano()/int64(time.Second))
if len(monitor.History) != 10 { if len(monitor.history) != 10 {
// not enough data // not saturated
return return
} }
if t > monitor.Threshold && monitor.Incident == nil { if t > monitor.Threshold && monitor.incident == nil {
// is down, create an incident monitor.incident = &Incident{
monitor.config.Logger.Println("Creating incident...")
component_id := json.Number(strconv.Itoa(*monitor.ComponentID))
monitor.Incident = &Incident{
Name: monitor.Name + " - " + monitor.config.SystemName, Name: monitor.Name + " - " + monitor.config.SystemName,
Message: monitor.Name + " check failed", ComponentID: monitor.ComponentID,
ComponentID: &component_id, Message: monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat),
Notify: true,
} }
if monitor.LastFailReason != nil { if len(monitor.lastFailReason) > 0 {
monitor.Incident.Message += "\n\n - " + *monitor.LastFailReason monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`"
} }
// is down, create an incident
monitor.config.Logger.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason)
// set investigating status // set investigating status
monitor.Incident.SetInvestigating() monitor.incident.SetInvestigating()
// create/update incident // create/update incident
monitor.config.SendIncident(monitor.Incident) if err := monitor.incident.Send(monitor.config); err != nil {
monitor.config.UpdateComponent(monitor.Incident) monitor.config.Logger.Printf("Error sending incident: %v\n", err)
} else if t < monitor.Threshold && monitor.Incident != nil {
// was down, created an incident, its now ok, make it resolved.
monitor.config.Logger.Println("Updating incident to resolved...")
component_id := json.Number(strconv.Itoa(*monitor.ComponentID))
monitor.Incident = &Incident{
Name: monitor.Incident.Name,
Message: monitor.Name + " check succeeded",
ComponentID: &component_id,
} }
} else if t < monitor.Threshold && monitor.incident != nil {
// was down, created an incident, its now ok, make it resolved.
monitor.config.Logger.Printf("%v resolved downtime incident", monitor.Name)
monitor.Incident.SetFixed() // resolve incident
monitor.config.SendIncident(monitor.Incident) monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message
monitor.config.UpdateComponent(monitor.Incident) monitor.incident.SetFixed()
monitor.incident.Send(monitor.config)
monitor.Incident = nil monitor.lastFailReason = ""
monitor.incident = nil
} }
} }
func getMs() int64 { func (monitor *Monitor) ValidateConfiguration() error {
return time.Now().UnixNano() / int64(time.Millisecond) if len(monitor.ExpectedBody) > 0 {
exp, err := regexp.Compile(monitor.ExpectedBody)
if err != nil {
return err
}
monitor.bodyRegexp = exp
}
if len(monitor.ExpectedBody) == 0 && monitor.ExpectedStatusCode == 0 {
return errors.New("Nothing to check, both 'expected_body' and 'expected_status_code' fields empty")
}
if monitor.CheckInterval < 1 {
monitor.CheckInterval = DefaultInterval
}
monitor.Method = strings.ToUpper(monitor.Method)
switch monitor.Method {
case "GET", "POST", "DELETE", "OPTIONS", "HEAD":
break
case "":
monitor.Method = "GET"
default:
return fmt.Errorf("Unsupported check method: %v", monitor.Method)
}
if monitor.ComponentID == 0 && monitor.MetricID == 0 {
return errors.New("component_id & metric_id are unset")
}
if monitor.Threshold <= 0 {
monitor.Threshold = 100
}
return nil
} }

View File

@@ -4,9 +4,9 @@ Features
-------- --------
- [x] Creates & Resolves Incidents - [x] Creates & Resolves Incidents
- [x] Posts monitor lag (interval configurable) - [x] Posts monitor lag to cachet graphs
- [x] Updates Component to Partial Outage - [x] Updates Component to Partial Outage
- [x] Updates Component to Major Outage if in Partial Outage - [x] Updates Component to Major Outage if already in Partial Outage (works well with distributed monitoring)
- [x] Can be run on multiple servers and geo regions - [x] Can be run on multiple servers and geo regions
Configuration Configuration
@@ -14,36 +14,26 @@ Configuration
``` ```
{ {
"api_url": "https://demo.cachethq.io/api/v1", "api_url": "https://<cachet domain>/api/v1",
"api_token": "<API TOKEN>", "api_token": "<cachet api token>",
"interval": 60, "insecure_api": false, // optional, false default, set if your certificate is self-signed/untrusted
"monitors": [ "monitors": [{
{ "name": "Name of your monitor", // required, friendly name for your monitor
"name": "Name of your monitor", "url": "Ping URL", // required, url to probe
"url": "Ping URL", "method": "get", // optional, http method (defaults GET)
"metric_id": <metric id from cachet>, "strict_tls": true, // optional, verify the TLS certificate of the url; false (default) accepts self-signed/untrusted certificates
"component_id": <component id from cachet>, "interval": 10, // seconds between checks
"threshold": 80, "metric_id": <metric id>, // post lag to cachet metric (graph)
"expected_status_code": 200, "component_id": <component id>, // post incidents to this component
"strict_tls": true, "threshold": 80, // If % of downtime is over this threshold, open an incident
"interval": 5 "expected_status_code": 200, // optional, expected status code (either status code or body must be supplied)
} "expected_body": "P.*NG" // optional, regular expression
], }],
"insecure_api": false "system_name": "", // optional, system name to identify bot
"log_path": "" // optional, defaults to stdout
} }
``` ```
*Notes:*
- `metric_id` is optional
- `insecure_api` if true it will ignore HTTPS certificate errors (eg if self-signed)
- `strict_tls` if false (true is default) it will ignore HTTPS certificate errors (eg if monitor uses self-signed certificate)
- `component_id` is optional
- `threshold` is a percentage
- `expected_status_code` is a http response code
- `interval` is the duration in seconds between two checks.
- GET request will be performed on the `url`
Installation Installation
------------ ------------
@@ -63,11 +53,11 @@ Usage of cachet-monitor:
Environment variables Environment variables
--------------------- ---------------------
| Name | Example Value | Description | | Name | Example Value | Description |
| ------------ | --------------------------- | --------------------------- | | ------------ | ------------------------------ | --------------------------- |
| CACHET_API | http://demo.cachethq.io/api | URL endpoint for cachet api | | CACHET_API | http://demo.cachethq.io/api/v1 | URL endpoint for cachet api |
| CACHET_TOKEN | randomvalue | API Authentication token | | CACHET_TOKEN | APIToken123 | API Authentication token |
| CACHET_DEV | 1 | Strips logging | | CACHET_DEV | 1 | Strips logging |
Vision and goals Vision and goals
---------------- ----------------