Merge branch 'master' into gh-pages
* master: (40 commits) Add to readme, remove comment Update documentation More config options, markdown in incidents Update readme refactoring Vision readme better packaging Update go install guide update readme Implement 'inteval' config parameter -> number of seconds between checks Fix configuration example in README Update documentation for api url Fix v1 api url Add screenshot to readme Report new incident and set as fixed Update interval to 5 secs Update example config with default values Improve fail reasons, fix api crashes Add time interval Improve readme, Dockerfile ...
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
gin-bin
|
||||
example.config.local.json
|
||||
123
cli/main.go
Normal file
123
cli/main.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/signal"
|
||||
"sync"
|
||||
|
||||
cachet "github.com/castawaylabs/cachet-monitor"
|
||||
)
|
||||
|
||||
var configPath string
|
||||
var systemName string
|
||||
var logPath string
|
||||
|
||||
func main() {
|
||||
flag.StringVar(&configPath, "c", "/etc/cachet-monitor.config.json", "Config path")
|
||||
flag.StringVar(&systemName, "name", "", "System Name")
|
||||
flag.StringVar(&logPath, "log", "", "Log path")
|
||||
flag.Parse()
|
||||
|
||||
cfg, err := getConfiguration(configPath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if len(systemName) > 0 {
|
||||
cfg.SystemName = systemName
|
||||
}
|
||||
if len(logPath) > 0 {
|
||||
cfg.LogPath = logPath
|
||||
}
|
||||
|
||||
if len(os.Getenv("CACHET_API")) > 0 {
|
||||
cfg.APIUrl = os.Getenv("CACHET_API")
|
||||
}
|
||||
if len(os.Getenv("CACHET_TOKEN")) > 0 {
|
||||
cfg.APIToken = os.Getenv("CACHET_TOKEN")
|
||||
}
|
||||
|
||||
if err := cfg.ValidateConfiguration(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
cfg.Logger.Printf("System: %s\nAPI: %s\nMonitors: %d\n\n", cfg.SystemName, cfg.APIUrl, len(cfg.Monitors))
|
||||
|
||||
wg := &sync.WaitGroup{}
|
||||
for _, mon := range cfg.Monitors {
|
||||
go mon.Start(cfg, wg)
|
||||
}
|
||||
|
||||
signals := make(chan os.Signal, 1)
|
||||
signal.Notify(signals, os.Interrupt, os.Kill)
|
||||
<-signals
|
||||
|
||||
cfg.Logger.Println("Abort: Waiting monitors to finish")
|
||||
for _, mon := range cfg.Monitors {
|
||||
mon.Stop()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// getLogger builds the logger used by the monitor. Output goes to stdout
// unless logPath is non-empty, in which case the file at logPath is created
// (truncating any existing file) and used instead; the process exits with
// status 1 when that file cannot be created. Setting the CACHET_DEV
// environment variable to any non-empty value disables the date, time and
// file prefixes.
func getLogger(logPath string) *log.Logger {
	out := os.Stdout
	if logPath != "" {
		file, err := os.Create(logPath)
		if err != nil {
			fmt.Printf("Unable to open file '%v' for logging\n", logPath)
			os.Exit(1)
		}
		out = file
	}

	logFlags := 0
	if os.Getenv("CACHET_DEV") == "" {
		logFlags = log.Llongfile | log.Ldate | log.Ltime
	}

	return log.New(out, "", logFlags)
}
|
||||
|
||||
func getConfiguration(path string) (*cachet.CachetMonitor, error) {
|
||||
var cfg cachet.CachetMonitor
|
||||
var data []byte
|
||||
|
||||
// test if its a url
|
||||
url, err := url.ParseRequestURI(path)
|
||||
if err == nil && len(url.Scheme) > 0 {
|
||||
// download config
|
||||
response, err := http.Get(path)
|
||||
if err != nil {
|
||||
return nil, errors.New("Cannot download network config: " + err.Error())
|
||||
}
|
||||
|
||||
defer response.Body.Close()
|
||||
data, _ = ioutil.ReadAll(response.Body)
|
||||
|
||||
fmt.Println("Downloaded network configuration.")
|
||||
} else {
|
||||
data, err = ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, errors.New("Config file '" + path + "' missing!")
|
||||
}
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||
fmt.Println(err)
|
||||
return nil, errors.New("Cannot parse config!")
|
||||
}
|
||||
|
||||
cfg.Logger = getLogger(cfg.LogPath)
|
||||
|
||||
return &cfg, nil
|
||||
}
|
||||
65
config.go
Normal file
65
config.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"log"
|
||||
"net"
|
||||
"os"
|
||||
)
|
||||
|
||||
// CachetMonitor is the top-level configuration decoded from the JSON config
// file, together with the logger shared by all monitors.
type CachetMonitor struct {
	// Logger receives all log output. It is never read from or written to JSON.
	Logger *log.Logger `json:"-"`

	// APIUrl is the base URL that request paths are appended to.
	APIUrl string `json:"api_url"`
	// APIToken is sent in the X-Cachet-Token header of every API request.
	APIToken string `json:"api_token"`
	// SystemName identifies this instance; ValidateConfiguration fills it
	// from getHostname when it is empty.
	SystemName string `json:"system_name"`
	// LogPath is the log file path from the configuration.
	LogPath string `json:"log_path"`
	// InsecureAPI disables TLS certificate verification for API requests.
	InsecureAPI bool `json:"insecure_api"`

	// Monitors lists the checks to run; at least one is required.
	Monitors []*Monitor `json:"monitors"`
}
|
||||
|
||||
func (cfg *CachetMonitor) ValidateConfiguration() error {
|
||||
if cfg.Logger == nil {
|
||||
cfg.Logger = log.New(os.Stdout, "", log.Llongfile|log.Ldate|log.Ltime)
|
||||
}
|
||||
|
||||
if len(cfg.SystemName) == 0 {
|
||||
// get hostname
|
||||
cfg.SystemName = getHostname()
|
||||
}
|
||||
|
||||
if len(cfg.APIToken) == 0 || len(cfg.APIUrl) == 0 {
|
||||
return errors.New("API URL or API Token not set. cachet-monitor won't be able to report incidents.\n\nPlease set:\n CACHET_API and CACHET_TOKEN environment variable to override settings.\n\nGet help at https://github.com/castawaylabs/cachet-monitor\n")
|
||||
}
|
||||
|
||||
if len(cfg.Monitors) == 0 {
|
||||
return errors.New("No monitors defined!\nSee sample configuration: https://github.com/castawaylabs/cachet-monitor/blob/master/example.config.json\n")
|
||||
}
|
||||
|
||||
for _, monitor := range cfg.Monitors {
|
||||
if err := monitor.ValidateConfiguration(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// getHostname returns an identifier for the current system: the OS hostname
// when available, otherwise the first network interface address, otherwise
// the literal "unknown". The result is never empty.
func getHostname() string {
	if hostname, err := os.Hostname(); err == nil && len(hostname) > 0 {
		return hostname
	}

	addrs, err := net.InterfaceAddrs()
	// An empty address list previously fell through and returned "".
	if err != nil || len(addrs) == 0 {
		return "unknown"
	}

	return addrs[0].String()
}
|
||||
17
example.config.json
Normal file
17
example.config.json
Normal file
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"api_url": "https://demo.cachethq.io/api/v1",
|
||||
"api_token": "9yMHsdioQosnyVK4iCVR",
|
||||
"interval": 5,
|
||||
"monitors": [
|
||||
{
|
||||
"name": "nodegear frontend",
|
||||
"url": "https://nodegear.io/ping",
|
||||
"metric_id": 1,
|
||||
"threshold": 80,
|
||||
"component_id": null,
|
||||
"expected_status_code": 200,
|
||||
"strict_tls": true
|
||||
}
|
||||
],
|
||||
"insecure_api": false
|
||||
}
|
||||
58
http.go
Normal file
58
http.go
Normal file
@@ -0,0 +1,58 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
func (monitor *CachetMonitor) makeRequest(requestType string, url string, reqBody []byte) (*http.Response, []byte, error) {
|
||||
req, err := http.NewRequest(requestType, monitor.APIUrl+url, bytes.NewBuffer(reqBody))
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("X-Cachet-Token", monitor.APIToken)
|
||||
|
||||
client := &http.Client{}
|
||||
if monitor.InsecureAPI == true {
|
||||
client.Transport = &http.Transport{
|
||||
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
||||
}
|
||||
}
|
||||
|
||||
res, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, []byte{}, err
|
||||
}
|
||||
|
||||
defer res.Body.Close()
|
||||
body, _ := ioutil.ReadAll(res.Body)
|
||||
|
||||
return res, body, nil
|
||||
}
|
||||
|
||||
// SendMetric sends lag metric point
|
||||
func (monitor *Monitor) SendMetric(delay int64) error {
|
||||
if monitor.MetricID == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
jsonBytes, _ := json.Marshal(&map[string]interface{}{
|
||||
"value": delay,
|
||||
})
|
||||
|
||||
resp, _, err := monitor.config.makeRequest("POST", "/metrics/"+strconv.Itoa(monitor.MetricID)+"/points", jsonBytes)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
return fmt.Errorf("Could not log data point!\n%v\n", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// getMs returns the current wall-clock time as milliseconds since the Unix
// epoch.
func getMs() int64 {
	const nanosPerMilli = int64(time.Millisecond)

	now := time.Now()
	return now.UnixNano() / nanosPerMilli
}
|
||||
114
incident.go
Normal file
114
incident.go
Normal file
@@ -0,0 +1,114 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// Incident is the Cachet incident data model, as encoded in the JSON body of
// incident create and update requests.
type Incident struct {
	// ID is 0 for an incident that has not been created yet.
	ID      int    `json:"id"`
	Name    string `json:"name"`
	Message string `json:"message"`
	// Status uses the values set by SetInvestigating (1), SetIdentified (2),
	// SetWatching (3) and SetFixed (4).
	Status int `json:"status"`
	// The tag previously lacked its colon (`json"visible"`), which made the
	// field encode under the key "Visible".
	Visible int  `json:"visible"`
	Notify  bool `json:"notify"`

	ComponentID     int `json:"component_id"`
	ComponentStatus int `json:"component_status"`
}
|
||||
|
||||
// Send - Create or Update incident
|
||||
func (incident *Incident) Send(cfg *CachetMonitor) error {
|
||||
switch incident.Status {
|
||||
case 1, 2, 3:
|
||||
// partial outage
|
||||
incident.ComponentStatus = 3
|
||||
|
||||
componentStatus, err := incident.GetComponentStatus(cfg)
|
||||
if componentStatus == 3 {
|
||||
// major outage
|
||||
incident.ComponentStatus = 4
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
cfg.Logger.Printf("cannot fetch component: %v", err)
|
||||
}
|
||||
case 4:
|
||||
// fixed
|
||||
incident.ComponentStatus = 1
|
||||
}
|
||||
|
||||
requestType := "POST"
|
||||
requestURL := "/incidents"
|
||||
if incident.ID > 0 {
|
||||
requestType = "PUT"
|
||||
requestURL += "/" + strconv.Itoa(incident.ID)
|
||||
}
|
||||
|
||||
jsonBytes, _ := json.Marshal(incident)
|
||||
|
||||
resp, body, err := cfg.makeRequest(requestType, requestURL, jsonBytes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var data struct {
|
||||
Incident struct {
|
||||
ID int `json:"id"`
|
||||
} `json:"data"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &data); err != nil {
|
||||
return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body))
|
||||
}
|
||||
|
||||
incident.ID = data.Incident.ID
|
||||
if resp.StatusCode != 200 {
|
||||
return fmt.Errorf("Could not create/update incident!")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (incident *Incident) GetComponentStatus(cfg *CachetMonitor) (int, error) {
|
||||
resp, body, err := cfg.makeRequest("GET", "/components/"+strconv.Itoa(incident.ComponentID), nil)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return 0, fmt.Errorf("Invalid status code. Received %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var data struct {
|
||||
Component struct {
|
||||
Status int `json:"status"`
|
||||
} `json:"data"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &data); err != nil {
|
||||
return 0, fmt.Errorf("Cannot parse component body: %v. Err = %v", string(body), err)
|
||||
}
|
||||
|
||||
return data.Component.Status, nil
|
||||
}
|
||||
|
||||
// SetInvestigating sets status to Investigating
|
||||
func (incident *Incident) SetInvestigating() {
|
||||
incident.Status = 1
|
||||
}
|
||||
|
||||
// SetIdentified sets status to Identified
|
||||
func (incident *Incident) SetIdentified() {
|
||||
incident.Status = 2
|
||||
}
|
||||
|
||||
// SetWatching sets status to Watching
|
||||
func (incident *Incident) SetWatching() {
|
||||
incident.Status = 3
|
||||
}
|
||||
|
||||
// SetFixed sets status to Fixed
|
||||
func (incident *Incident) SetFixed() {
|
||||
incident.Status = 4
|
||||
}
|
||||
252
monitor.go
Normal file
252
monitor.go
Normal file
@@ -0,0 +1,252 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
	// HttpTimeout is the timeout applied to each monitor probe request.
	HttpTimeout = time.Second
	// DefaultInterval is the number of seconds between checks when a monitor
	// does not configure a positive interval.
	DefaultInterval = 60
	// DefaultTimeFormat is the timestamp layout used in incident messages.
	DefaultTimeFormat = "15:04:05 Jan 2 MST"
)
|
||||
|
||||
// Monitor is the data model of a single check: the JSON-configurable
// settings plus the runtime state kept between ticks.
type Monitor struct {
	Name string `json:"name"`
	URL string `json:"url"`
	// Method is upper-cased by ValidateConfiguration and defaults to "GET".
	Method string `json:"method"`
	// StrictTLS set to false makes the probe skip TLS certificate verification.
	StrictTLS bool `json:"strict_tls"`
	// CheckInterval holds a plain number of seconds (Start multiplies it by
	// time.Second), not an actual time.Duration value.
	CheckInterval time.Duration `json:"interval"`

	// MetricID is the metric that lag is posted to; 0 disables it.
	MetricID int `json:"metric_id"`
	// ComponentID is the component that incidents are attached to.
	ComponentID int `json:"component_id"`

	// Threshold = percentage
	Threshold float32 `json:"threshold"`
	// ExpectedStatusCode is only checked when greater than 0.
	ExpectedStatusCode int `json:"expected_status_code"`
	// compiled to Regexp
	ExpectedBody string `json:"expected_body"`
	// bodyRegexp is set by ValidateConfiguration from ExpectedBody.
	bodyRegexp *regexp.Regexp

	// history holds the up/down result of the most recent checks (at most 10).
	history []bool
	// lastFailReason describes the most recent failed check.
	lastFailReason string
	// incident is the currently open incident, or nil when there is none.
	incident *Incident
	// config is the owning configuration, set by Start.
	config *CachetMonitor

	// Closed when mon.Stop() is called
	stopC chan bool
}
|
||||
|
||||
func (mon *Monitor) Start(cfg *CachetMonitor, wg *sync.WaitGroup) {
|
||||
wg.Add(1)
|
||||
mon.config = cfg
|
||||
mon.stopC = make(chan bool)
|
||||
|
||||
mon.config.Logger.Printf(" Starting %s: %d seconds check interval\n - %v %s", mon.Name, mon.CheckInterval, mon.Method, mon.URL)
|
||||
|
||||
// print features
|
||||
if mon.ExpectedStatusCode > 0 {
|
||||
mon.config.Logger.Printf(" - Expect HTTP %d", mon.ExpectedStatusCode)
|
||||
}
|
||||
if len(mon.ExpectedBody) > 0 {
|
||||
mon.config.Logger.Printf(" - Expect Body to match \"%v\"", mon.ExpectedBody)
|
||||
}
|
||||
if mon.MetricID > 0 {
|
||||
mon.config.Logger.Printf(" - Log lag to metric id %d\n", mon.MetricID)
|
||||
}
|
||||
if mon.ComponentID > 0 {
|
||||
mon.config.Logger.Printf(" - Update component id %d\n\n", mon.ComponentID)
|
||||
}
|
||||
|
||||
mon.Tick()
|
||||
|
||||
ticker := time.NewTicker(mon.CheckInterval * time.Second)
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
mon.Tick()
|
||||
case <-mon.stopC:
|
||||
wg.Done()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (monitor *Monitor) Stop() {
|
||||
if monitor.Stopped() {
|
||||
return
|
||||
}
|
||||
|
||||
close(monitor.stopC)
|
||||
}
|
||||
|
||||
func (monitor *Monitor) Stopped() bool {
|
||||
select {
|
||||
case <-monitor.stopC:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func (monitor *Monitor) Tick() {
|
||||
reqStart := getMs()
|
||||
isUp := monitor.doRequest()
|
||||
lag := getMs() - reqStart
|
||||
|
||||
if len(monitor.history) == 9 {
|
||||
monitor.config.Logger.Printf("%v is now saturated\n", monitor.Name)
|
||||
}
|
||||
if len(monitor.history) >= 10 {
|
||||
monitor.history = monitor.history[len(monitor.history)-9:]
|
||||
}
|
||||
monitor.history = append(monitor.history, isUp)
|
||||
monitor.AnalyseData()
|
||||
|
||||
if isUp == true && monitor.MetricID > 0 {
|
||||
monitor.SendMetric(lag)
|
||||
}
|
||||
}
|
||||
|
||||
func (monitor *Monitor) doRequest() bool {
|
||||
client := &http.Client{
|
||||
Timeout: HttpTimeout,
|
||||
}
|
||||
if monitor.StrictTLS == false {
|
||||
client.Transport = &http.Transport{
|
||||
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
||||
}
|
||||
}
|
||||
|
||||
resp, err := client.Get(monitor.URL)
|
||||
if err != nil {
|
||||
monitor.lastFailReason = err.Error()
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
if monitor.ExpectedStatusCode > 0 && resp.StatusCode != monitor.ExpectedStatusCode {
|
||||
monitor.lastFailReason = "Unexpected response code: " + strconv.Itoa(resp.StatusCode) + ". Expected " + strconv.Itoa(monitor.ExpectedStatusCode)
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
if monitor.bodyRegexp != nil {
|
||||
// check body
|
||||
responseBody, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
monitor.lastFailReason = err.Error()
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
match := monitor.bodyRegexp.Match(responseBody)
|
||||
if !match {
|
||||
monitor.lastFailReason = "Unexpected body: " + string(responseBody) + ". Expected to match " + monitor.ExpectedBody
|
||||
}
|
||||
|
||||
return match
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident
|
||||
func (monitor *Monitor) AnalyseData() {
|
||||
// look at the past few incidents
|
||||
numDown := 0
|
||||
for _, wasUp := range monitor.history {
|
||||
if wasUp == false {
|
||||
numDown++
|
||||
}
|
||||
}
|
||||
|
||||
t := (float32(numDown) / float32(len(monitor.history))) * 100
|
||||
monitor.config.Logger.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().UnixNano()/int64(time.Second))
|
||||
|
||||
if len(monitor.history) != 10 {
|
||||
// not saturated
|
||||
return
|
||||
}
|
||||
|
||||
if t > monitor.Threshold && monitor.incident == nil {
|
||||
monitor.incident = &Incident{
|
||||
Name: monitor.Name + " - " + monitor.config.SystemName,
|
||||
ComponentID: monitor.ComponentID,
|
||||
Message: monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat),
|
||||
Notify: true,
|
||||
}
|
||||
|
||||
if len(monitor.lastFailReason) > 0 {
|
||||
monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`"
|
||||
}
|
||||
|
||||
// is down, create an incident
|
||||
monitor.config.Logger.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason)
|
||||
// set investigating status
|
||||
monitor.incident.SetInvestigating()
|
||||
// create/update incident
|
||||
if err := monitor.incident.Send(monitor.config); err != nil {
|
||||
monitor.config.Logger.Printf("Error sending incident: %v\n", err)
|
||||
}
|
||||
} else if t < monitor.Threshold && monitor.incident != nil {
|
||||
// was down, created an incident, its now ok, make it resolved.
|
||||
monitor.config.Logger.Printf("%v resolved downtime incident", monitor.Name)
|
||||
|
||||
// resolve incident
|
||||
monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message
|
||||
monitor.incident.SetFixed()
|
||||
monitor.incident.Send(monitor.config)
|
||||
|
||||
monitor.lastFailReason = ""
|
||||
monitor.incident = nil
|
||||
}
|
||||
}
|
||||
|
||||
func (monitor *Monitor) ValidateConfiguration() error {
|
||||
if len(monitor.ExpectedBody) > 0 {
|
||||
exp, err := regexp.Compile(monitor.ExpectedBody)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
monitor.bodyRegexp = exp
|
||||
}
|
||||
|
||||
if len(monitor.ExpectedBody) == 0 && monitor.ExpectedStatusCode == 0 {
|
||||
return errors.New("Nothing to check, both 'expected_body' and 'expected_status_code' fields empty")
|
||||
}
|
||||
|
||||
if monitor.CheckInterval < 1 {
|
||||
monitor.CheckInterval = DefaultInterval
|
||||
}
|
||||
|
||||
monitor.Method = strings.ToUpper(monitor.Method)
|
||||
switch monitor.Method {
|
||||
case "GET", "POST", "DELETE", "OPTIONS", "HEAD":
|
||||
break
|
||||
case "":
|
||||
monitor.Method = "GET"
|
||||
default:
|
||||
return fmt.Errorf("Unsupported check method: %v", monitor.Method)
|
||||
}
|
||||
|
||||
if monitor.ComponentID == 0 && monitor.MetricID == 0 {
|
||||
return errors.New("component_id & metric_id are unset")
|
||||
}
|
||||
|
||||
if monitor.Threshold <= 0 {
|
||||
monitor.Threshold = 100
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
91
readme.md
Normal file
91
readme.md
Normal file
@@ -0,0 +1,91 @@
|
||||

|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
- [x] Creates & Resolves Incidents
|
||||
- [x] Posts monitor lag to cachet graphs
|
||||
- [x] Updates Component to Partial Outage
|
||||
- [x] Updates Component to Major Outage if already in Partial Outage (works well with distributed monitoring)
|
||||
- [x] Can be run on multiple servers and geo regions
|
||||
|
||||
Configuration
|
||||
-------------
|
||||
|
||||
```
|
||||
{
|
||||
// URL for the API. Note: Must end with /api/v1
|
||||
"api_url": "https://<cachet domain>/api/v1",
|
||||
// Your API token for Cachet
|
||||
"api_token": "<cachet api token>",
|
||||
// optional, false default, set if your certificate is self-signed/untrusted
|
||||
"insecure_api": false,
|
||||
"monitors": [{
|
||||
// required, friendly name for your monitor
|
||||
"name": "Name of your monitor",
|
||||
// required, url to probe
|
||||
"url": "Ping URL",
|
||||
// optional, http method (defaults GET)
|
||||
"method": "get",
|
||||
// optional, verify the TLS certificate of the probed url (set to false to accept self-signed/untrusted certificates)
|
||||
"strict_tls": true,
|
||||
// seconds between checks
|
||||
"interval": 10,
|
||||
// post lag to cachet metric (graph)
|
||||
// note: either a metric ID or a component ID is required
|
||||
"metric_id": <metric id>,
|
||||
// post incidents to this component
|
||||
"component_id": <component id>,
|
||||
// If % of downtime is over this threshold, open an incident
|
||||
"threshold": 80,
|
||||
// optional, expected status code (either status code or body must be supplied)
|
||||
"expected_status_code": 200,
|
||||
// optional, regular expression to match body content
|
||||
"expected_body": "P.*NG"
|
||||
}],
|
||||
// optional, system name to identify bot (uses hostname by default)
|
||||
"system_name": "",
|
||||
// optional, defaults to stdout
|
||||
"log_path": ""
|
||||
}
|
||||
```
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
1. Download binary from [release page](https://github.com/CastawayLabs/cachet-monitor/releases)
|
||||
2. Create your configuration ([example](https://raw.githubusercontent.com/CastawayLabs/cachet-monitor/master/example.config.json))
|
||||
3. `cachet-monitor -c /etc/cachet-monitor.config.json`
|
||||
|
||||
Pro tip: run in the background using `nohup cachet-monitor > /var/log/cachet-monitor.log 2>&1 &`
|
||||
|
||||
```
|
||||
Usage of cachet-monitor:
|
||||
-c="/etc/cachet-monitor.config.json": Config path
|
||||
-log="": Log path
|
||||
-name="": System Name
|
||||
```
|
||||
|
||||
Environment variables
|
||||
---------------------
|
||||
|
||||
| Name | Example Value | Description |
|
||||
| ------------ | ------------------------------ | --------------------------- |
|
||||
| CACHET_API | http://demo.cachethq.io/api/v1 | URL endpoint for cachet api |
|
||||
| CACHET_TOKEN | APIToken123 | API Authentication token |
|
||||
| CACHET_DEV | 1 | Strips logging |
|
||||
|
||||
Vision and goals
|
||||
----------------
|
||||
|
||||
We made this tool because we felt the need to have our own monitoring software (leveraging Cachet).
|
||||
The idea is a stateless program which collects data and pushes it to a central cachet instance.
|
||||
|
||||
This gives us the power to run an army of geographically distributed loggers and reveal issues in both latency & downtime on client websites.
|
||||
|
||||
Package usage
|
||||
-------------
|
||||
|
||||
When using `cachet-monitor` as a package in another program, you should follow what `cli/main.go` does. It is important to call `ValidateConfiguration` on `CachetMonitor` before starting any monitor; it also validates every monitor inside.
|
||||
|
||||
[API Documentation](https://godoc.org/github.com/CastawayLabs/cachet-monitor)
|
||||
Reference in New Issue
Block a user