huge refactor
- extendable backends - better project structure - better cli interface
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,3 +1,5 @@
|
||||
/config.yml
|
||||
/config.json
|
||||
examples/
|
||||
vendor/
|
||||
cachet-monitor
|
||||
|
||||
85
api.go
85
api.go
@@ -1,85 +0,0 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/Sirupsen/logrus"
|
||||
)
|
||||
|
||||
type CachetAPI struct {
|
||||
URL string `json:"url"`
|
||||
Token string `json:"token"`
|
||||
Insecure bool `json:"insecure"`
|
||||
}
|
||||
|
||||
type CachetResponse struct {
|
||||
Data json.RawMessage `json:"data"`
|
||||
}
|
||||
|
||||
// TODO: test
|
||||
func (api CachetAPI) Ping() error {
|
||||
resp, _, err := api.NewRequest("GET", "/ping", nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return errors.New("API Responded with non-200 status code")
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// SendMetric adds a data point to a cachet monitor
|
||||
func (api CachetAPI) SendMetric(id int, lag int64) {
|
||||
logrus.Debugf("Sending lag metric ID:%d RTT %vms", id, lag)
|
||||
|
||||
jsonBytes, _ := json.Marshal(map[string]interface{}{
|
||||
"value": lag,
|
||||
"timestamp": time.Now().Unix(),
|
||||
})
|
||||
|
||||
resp, _, err := api.NewRequest("POST", "/metrics/"+strconv.Itoa(id)+"/points", jsonBytes)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
logrus.Warnf("Could not log metric! ID: %d, err: %v", id, err)
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
}
|
||||
|
||||
// TODO: test
|
||||
// NewRequest wraps http.NewRequest
|
||||
func (api CachetAPI) NewRequest(requestType, url string, reqBody []byte) (*http.Response, CachetResponse, error) {
|
||||
req, err := http.NewRequest(requestType, api.URL+url, bytes.NewBuffer(reqBody))
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("X-Cachet-Token", api.Token)
|
||||
|
||||
transport := http.DefaultTransport.(*http.Transport)
|
||||
transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: api.Insecure}
|
||||
client := &http.Client{
|
||||
Transport: transport,
|
||||
}
|
||||
|
||||
res, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, CachetResponse{}, err
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Data json.RawMessage `json:"data"`
|
||||
}
|
||||
err = json.NewDecoder(res.Body).Decode(&body)
|
||||
|
||||
defer req.Body.Close()
|
||||
|
||||
return res, body, err
|
||||
}
|
||||
288
backends/cachet/backend.go
Normal file
288
backends/cachet/backend.go
Normal file
@@ -0,0 +1,288 @@
|
||||
package cachetbackend
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/castawaylabs/cachet-monitor/monitors"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
const DefaultTimeFormat = "15:04:05 Jan 2 MST"
|
||||
|
||||
type CachetBackend struct {
|
||||
URL string `json:"url" yaml:"url"`
|
||||
Token string `json:"token" yaml:"token"`
|
||||
Insecure bool `json:"insecure" yaml:"insecure"`
|
||||
DateFormat string `json:"date_format" yaml:"date_format"`
|
||||
}
|
||||
|
||||
type CachetResponse struct {
|
||||
Data json.RawMessage `json:"data"`
|
||||
}
|
||||
|
||||
func (api CachetBackend) ValidateMonitor(mon *monitors.AbstractMonitor) []string {
|
||||
errs := []string{}
|
||||
|
||||
params := mon.Params
|
||||
|
||||
componentID, componentIDOk := params["component_id"]
|
||||
metricID, metricIDOk := params["metric_id"]
|
||||
if !componentIDOk && !metricIDOk {
|
||||
errs = append(errs, "component_id and metric_id is unset")
|
||||
}
|
||||
|
||||
if _, ok := componentID.(int); !ok && componentIDOk {
|
||||
errs = append(errs, "component_id not integer")
|
||||
}
|
||||
if _, ok := metricID.(int); !ok && metricIDOk {
|
||||
errs = append(errs, "metric_id not integer")
|
||||
}
|
||||
|
||||
return errs
|
||||
}
|
||||
|
||||
func (api CachetBackend) Validate() []string {
|
||||
errs := []string{}
|
||||
|
||||
if len(api.URL) == 0 {
|
||||
errs = append(errs, "Cachet API URL invalid")
|
||||
}
|
||||
if len(api.Token) == 0 {
|
||||
errs = append(errs, "Cachet API Token invalid")
|
||||
}
|
||||
|
||||
if len(api.DateFormat) == 0 {
|
||||
api.DateFormat = DefaultTimeFormat
|
||||
}
|
||||
|
||||
return errs
|
||||
}
|
||||
|
||||
// TODO: test
|
||||
func (api CachetBackend) Ping() error {
|
||||
resp, _, err := api.NewRequest("GET", "/ping", nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return errors.New("API Responded with non-200 status code")
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: test
|
||||
// NewRequest wraps http.NewRequest
|
||||
func (api CachetBackend) NewRequest(requestType, url string, reqBody []byte) (*http.Response, interface{}, error) {
|
||||
req, err := http.NewRequest(requestType, api.URL+url, bytes.NewBuffer(reqBody))
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("X-Cachet-Token", api.Token)
|
||||
|
||||
transport := http.DefaultTransport.(*http.Transport)
|
||||
transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: api.Insecure}
|
||||
client := &http.Client{
|
||||
Transport: transport,
|
||||
}
|
||||
|
||||
res, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, CachetResponse{}, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
defer req.Body.Close()
|
||||
|
||||
var body struct {
|
||||
Data json.RawMessage `json:"data"`
|
||||
}
|
||||
err = json.NewDecoder(res.Body).Decode(&body)
|
||||
|
||||
return res, body, err
|
||||
}
|
||||
|
||||
func (mon CachetBackend) Describe() []string {
|
||||
features := []string{"Cachet API"}
|
||||
|
||||
return features
|
||||
}
|
||||
|
||||
func (api CachetBackend) SendMetric(monitor monitors.MonitorInterface, lag int64) error {
|
||||
mon := monitor.GetMonitor()
|
||||
if _, ok := mon.Params["metric_id"]; !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
metricID := mon.Params["metric_id"].(int)
|
||||
|
||||
// report lag
|
||||
logrus.Debugf("Sending lag metric ID: %d RTT %vms", metricID, lag)
|
||||
|
||||
jsonBytes, _ := json.Marshal(map[string]interface{}{
|
||||
"value": lag,
|
||||
"timestamp": time.Now().Unix(),
|
||||
})
|
||||
|
||||
resp, _, err := api.NewRequest("POST", "/metrics/"+strconv.Itoa(metricID)+"/points", jsonBytes)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
logrus.Warnf("Could not log metric! ID: %d, err: %v", metricID, err)
|
||||
}
|
||||
|
||||
if resp != nil && resp.Body != nil {
|
||||
defer resp.Body.Close()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (api CachetBackend) UpdateMonitor(mon monitors.MonitorInterface, status, previousStatus monitors.MonitorStatus, errs []error) error {
|
||||
monitor := mon.GetMonitor()
|
||||
l := logrus.WithFields(logrus.Fields{
|
||||
"monitor": monitor.Name,
|
||||
"time": time.Now().Format(api.DateFormat),
|
||||
})
|
||||
|
||||
errors := make([]string, len(errs))
|
||||
for i, err := range errs {
|
||||
errors[i] = err.Error()
|
||||
}
|
||||
|
||||
fmt.Println("errs", errs)
|
||||
|
||||
componentID := monitor.Params["component_id"].(int)
|
||||
incident, err := api.findIncident(componentID)
|
||||
if err != nil {
|
||||
l.Errorf("Couldn't find existing incidents: %v", err)
|
||||
}
|
||||
|
||||
if incident == nil {
|
||||
// create a new one
|
||||
incident = &Incident{
|
||||
Name: "",
|
||||
ComponentID: componentID,
|
||||
Message: "",
|
||||
Notify: true,
|
||||
}
|
||||
} else {
|
||||
// find component status
|
||||
component, err := api.getComponent(incident.ComponentID)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
incident.ComponentStatus = component.Status
|
||||
}
|
||||
|
||||
tpls := monitor.Template
|
||||
tplData := api.getTemplateData(monitor)
|
||||
var tpl monitors.MessageTemplate
|
||||
|
||||
if status == monitors.MonitorStatusDown {
|
||||
tpl = tpls.Investigating
|
||||
tplData["FailReason"] = strings.Join(errors, "\n - ")
|
||||
l.Warnf("updating component. Monitor is down: %v", tplData["FailReason"])
|
||||
} else {
|
||||
// was down, created an incident, its now ok, make it resolved.
|
||||
tpl = tpls.Fixed
|
||||
l.Warn("Resolving incident")
|
||||
}
|
||||
|
||||
tplData["incident"] = incident
|
||||
subject, message := tpl.Exec(tplData)
|
||||
|
||||
if incident.ID == 0 {
|
||||
incident.Name = subject
|
||||
incident.Message = message
|
||||
} else {
|
||||
incident.Message += "\n\n---\n\n" + subject + ":\n\n" + message
|
||||
}
|
||||
|
||||
if status == monitors.MonitorStatusDown && (incident.ComponentStatus == 0 || incident.ComponentStatus > 2) {
|
||||
incident.Status = 1
|
||||
fmt.Println("incident status", incident.ComponentStatus)
|
||||
if incident.ComponentStatus >= 3 {
|
||||
// major outage
|
||||
incident.ComponentStatus = 4
|
||||
} else {
|
||||
incident.ComponentStatus = 3
|
||||
}
|
||||
} else if status == monitors.MonitorStatusUp {
|
||||
incident.Status = 4
|
||||
incident.ComponentStatus = 1
|
||||
}
|
||||
incident.Notify = true
|
||||
|
||||
// create/update incident
|
||||
if err := incident.Send(api); err != nil {
|
||||
l.Errorf("Error sending incident: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (api CachetBackend) Tick(monitor monitors.MonitorInterface, status monitors.MonitorStatus, errs []error, lag int64) {
|
||||
mon := monitor.GetMonitor()
|
||||
if mon.GetLastStatus() == status || status == monitors.MonitorStatusNotSaturated {
|
||||
return
|
||||
}
|
||||
|
||||
logrus.Infof("updating backend for monitor")
|
||||
lastStatus := mon.UpdateLastStatus(status)
|
||||
|
||||
api.UpdateMonitor(monitor, status, lastStatus, errs)
|
||||
|
||||
if _, ok := mon.Params["metric_id"]; ok && lag > 0 {
|
||||
api.SendMetric(monitor, lag)
|
||||
}
|
||||
}
|
||||
|
||||
func (api CachetBackend) getComponent(componentID int) (*Component, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (api CachetBackend) findIncident(componentID int) (*Incident, error) {
|
||||
// fetch watching, identified & investigating
|
||||
statuses := []int{3, 2, 1}
|
||||
for _, status := range statuses {
|
||||
incidents, err := api.findIncidents(componentID, status)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, incident := range incidents {
|
||||
incident.Status = status
|
||||
return incident, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (api CachetBackend) findIncidents(componentID int, status int) ([]*Incident, error) {
|
||||
resp, body, err := api.NewRequest("GET", "/incidents?component_id="+strconv.Itoa(componentID)+"&status="+strconv.Itoa(status), nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var data []*Incident
|
||||
if err := json.Unmarshal(body.(CachetResponse).Data, &data); err != nil {
|
||||
return nil, fmt.Errorf("Cannot find incidents: %v", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("Could not fetch incidents! %v", err)
|
||||
}
|
||||
|
||||
return data, nil
|
||||
}
|
||||
14
backends/cachet/component.go
Normal file
14
backends/cachet/component.go
Normal file
@@ -0,0 +1,14 @@
|
||||
package cachetbackend
|
||||
|
||||
// Incident Cachet data model
|
||||
type Component struct {
|
||||
ID int `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Message string `json:"message"`
|
||||
Status int `json:"status"`
|
||||
Visible int `json:"visible"`
|
||||
Notify bool `json:"notify"`
|
||||
|
||||
ComponentID int `json:"component_id"`
|
||||
ComponentStatus int `json:"component_status"`
|
||||
}
|
||||
71
backends/cachet/incident.go
Normal file
71
backends/cachet/incident.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package cachetbackend
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/castawaylabs/cachet-monitor/backends"
|
||||
"github.com/castawaylabs/cachet-monitor/monitors"
|
||||
)
|
||||
|
||||
// "encoding/json"
|
||||
// "fmt"
|
||||
// "strconv"
|
||||
|
||||
// "github.com/sirupsen/logrus"
|
||||
|
||||
// Incident Cachet data model
|
||||
type Incident struct {
|
||||
ID int `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Message string `json:"message"`
|
||||
Status int `json:"status"`
|
||||
Visible int `json:"visible"`
|
||||
Notify bool `json:"notify"`
|
||||
|
||||
ComponentID int `json:"component_id"`
|
||||
ComponentStatus int `json:"component_status"`
|
||||
}
|
||||
|
||||
// Send - Create or Update incident
|
||||
func (incident *Incident) Send(backend backends.BackendInterface) error {
|
||||
requestURL := "/incidents"
|
||||
requestMethod := "POST"
|
||||
jsonBytes, _ := json.Marshal(incident)
|
||||
|
||||
if incident.ID > 0 {
|
||||
// create an incident update
|
||||
requestMethod = "PUT"
|
||||
requestURL += "/" + strconv.Itoa(incident.ID)
|
||||
}
|
||||
|
||||
resp, body, err := backend.NewRequest(requestMethod, requestURL, jsonBytes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var data struct {
|
||||
ID int `json:"id"`
|
||||
}
|
||||
if err := json.Unmarshal(body.(CachetResponse).Data, &data); err != nil {
|
||||
return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body.(CachetResponse).Data))
|
||||
}
|
||||
|
||||
incident.ID = data.ID
|
||||
if resp.StatusCode != 200 {
|
||||
return fmt.Errorf("Could not update/create incident!")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (api *CachetBackend) getTemplateData(monitor *monitors.AbstractMonitor) map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
// "SystemName": monitor.config.SystemName,
|
||||
"Monitor": monitor,
|
||||
"now": time.Now().Format(api.DateFormat),
|
||||
// "incident": monitor.incident,
|
||||
}
|
||||
}
|
||||
19
backends/interface.go
Normal file
19
backends/interface.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package backends
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"github.com/castawaylabs/cachet-monitor/monitors"
|
||||
)
|
||||
|
||||
type BackendInterface interface {
|
||||
Ping() error
|
||||
Tick(monitor monitors.MonitorInterface, status monitors.MonitorStatus, errs []error, lag int64)
|
||||
SendMetric(monitor monitors.MonitorInterface, lag int64) error
|
||||
UpdateMonitor(monitor monitors.MonitorInterface, status, previousStatus monitors.MonitorStatus, errs []error) error
|
||||
NewRequest(requestType, url string, reqBody []byte) (*http.Response, interface{}, error)
|
||||
|
||||
Describe() []string
|
||||
Validate() []string
|
||||
ValidateMonitor(monitor *monitors.AbstractMonitor) []string
|
||||
}
|
||||
195
cli/main.go
195
cli/main.go
@@ -1,198 +1,5 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/signal"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/Sirupsen/logrus"
|
||||
cachet "github.com/castawaylabs/cachet-monitor"
|
||||
docopt "github.com/docopt/docopt-go"
|
||||
"github.com/mitchellh/mapstructure"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
const usage = `cachet-monitor
|
||||
|
||||
Usage:
|
||||
cachet-monitor (-c PATH | --config PATH) [--log=LOGPATH] [--name=NAME] [--immediate]
|
||||
cachet-monitor -h | --help | --version
|
||||
|
||||
Arguments:
|
||||
PATH path to config.json
|
||||
LOGPATH path to log output (defaults to STDOUT)
|
||||
NAME name of this logger
|
||||
|
||||
Examples:
|
||||
cachet-monitor -c /root/cachet-monitor.json
|
||||
cachet-monitor -c /root/cachet-monitor.json --log=/var/log/cachet-monitor.log --name="development machine"
|
||||
|
||||
Options:
|
||||
-c PATH.json --config PATH Path to configuration file
|
||||
-h --help Show this screen.
|
||||
--version Show version
|
||||
--immediate Tick immediately (by default waits for first defined interval)
|
||||
|
||||
Environment varaibles:
|
||||
CACHET_API override API url from configuration
|
||||
CACHET_TOKEN override API token from configuration
|
||||
CACHET_DEV set to enable dev logging`
|
||||
|
||||
var version string
|
||||
|
||||
func main() {
|
||||
arguments, _ := docopt.Parse(usage, nil, true, version, false)
|
||||
|
||||
cfg, err := getConfiguration(arguments["--config"].(string))
|
||||
if err != nil {
|
||||
logrus.Panicf("Unable to start (reading config): %v", err)
|
||||
}
|
||||
|
||||
if immediate, ok := arguments["--immediate"]; ok {
|
||||
cfg.Immediate = immediate.(bool)
|
||||
}
|
||||
|
||||
if name := arguments["--name"]; name != nil {
|
||||
cfg.SystemName = name.(string)
|
||||
}
|
||||
logrus.SetOutput(getLogger(arguments["--log"]))
|
||||
|
||||
if len(os.Getenv("CACHET_API")) > 0 {
|
||||
cfg.API.URL = os.Getenv("CACHET_API")
|
||||
}
|
||||
if len(os.Getenv("CACHET_TOKEN")) > 0 {
|
||||
cfg.API.Token = os.Getenv("CACHET_TOKEN")
|
||||
}
|
||||
if len(os.Getenv("CACHET_DEV")) > 0 {
|
||||
logrus.SetLevel(logrus.DebugLevel)
|
||||
}
|
||||
|
||||
if valid := cfg.Validate(); !valid {
|
||||
logrus.Errorf("Invalid configuration")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
logrus.Debug("Configuration valid")
|
||||
logrus.Infof("System: %s", cfg.SystemName)
|
||||
logrus.Infof("API: %s", cfg.API.URL)
|
||||
logrus.Infof("Monitors: %d\n", len(cfg.Monitors))
|
||||
|
||||
logrus.Infof("Pinging cachet")
|
||||
if err := cfg.API.Ping(); err != nil {
|
||||
logrus.Errorf("Cannot ping cachet!\n%v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
logrus.Infof("Ping OK")
|
||||
|
||||
wg := &sync.WaitGroup{}
|
||||
for index, monitor := range cfg.Monitors {
|
||||
logrus.Infof("Starting Monitor #%d: ", index)
|
||||
logrus.Infof("Features: \n - %v", strings.Join(monitor.Describe(), "\n - "))
|
||||
|
||||
go monitor.ClockStart(cfg, monitor, wg)
|
||||
}
|
||||
|
||||
signals := make(chan os.Signal, 1)
|
||||
signal.Notify(signals, os.Interrupt, os.Kill)
|
||||
<-signals
|
||||
|
||||
logrus.Warnf("Abort: Waiting monitors to finish")
|
||||
for _, mon := range cfg.Monitors {
|
||||
mon.GetMonitor().ClockStop()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func getLogger(logPath interface{}) *os.File {
|
||||
if logPath == nil || len(logPath.(string)) == 0 {
|
||||
return os.Stdout
|
||||
}
|
||||
|
||||
file, err := os.Create(logPath.(string))
|
||||
if err != nil {
|
||||
logrus.Errorf("Unable to open file '%v' for logging: \n%v", logPath, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
return file
|
||||
}
|
||||
|
||||
func getConfiguration(path string) (*cachet.CachetMonitor, error) {
|
||||
var cfg cachet.CachetMonitor
|
||||
var data []byte
|
||||
|
||||
// test if its a url
|
||||
url, err := url.ParseRequestURI(path)
|
||||
if err == nil && len(url.Scheme) > 0 {
|
||||
// download config
|
||||
response, err := http.Get(path)
|
||||
if err != nil {
|
||||
logrus.Warn("Unable to download network configuration")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer response.Body.Close()
|
||||
data, _ = ioutil.ReadAll(response.Body)
|
||||
|
||||
logrus.Info("Downloaded network configuration.")
|
||||
} else {
|
||||
data, err = ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, errors.New("Unable to open file: '" + path + "'")
|
||||
}
|
||||
}
|
||||
|
||||
if strings.HasSuffix(path, ".yaml") || strings.HasSuffix(path, ".yml") {
|
||||
err = yaml.Unmarshal(data, &cfg)
|
||||
} else {
|
||||
err = json.Unmarshal(data, &cfg)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logrus.Warnf("Unable to parse configuration file")
|
||||
}
|
||||
|
||||
cfg.Monitors = make([]cachet.MonitorInterface, len(cfg.RawMonitors))
|
||||
for index, rawMonitor := range cfg.RawMonitors {
|
||||
var t cachet.MonitorInterface
|
||||
var err error
|
||||
|
||||
// get default type
|
||||
monType := cachet.GetMonitorType("")
|
||||
if t, ok := rawMonitor["type"].(string); ok {
|
||||
monType = cachet.GetMonitorType(t)
|
||||
}
|
||||
|
||||
switch monType {
|
||||
case "http":
|
||||
var s cachet.HTTPMonitor
|
||||
err = mapstructure.Decode(rawMonitor, &s)
|
||||
t = &s
|
||||
case "dns":
|
||||
var s cachet.DNSMonitor
|
||||
err = mapstructure.Decode(rawMonitor, &s)
|
||||
t = &s
|
||||
default:
|
||||
logrus.Errorf("Invalid monitor type (index: %d) %v", index, monType)
|
||||
continue
|
||||
}
|
||||
|
||||
t.GetMonitor().Type = monType
|
||||
|
||||
if err != nil {
|
||||
logrus.Errorf("Unable to unmarshal monitor to type (index: %d): %v", index, err)
|
||||
continue
|
||||
}
|
||||
|
||||
cfg.Monitors[index] = t
|
||||
}
|
||||
|
||||
return &cfg, err
|
||||
Execute()
|
||||
}
|
||||
|
||||
117
cli/root.go
Normal file
117
cli/root.go
Normal file
@@ -0,0 +1,117 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/signal"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
cachet "github.com/castawaylabs/cachet-monitor"
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var cfgFile string
|
||||
|
||||
// rootCmd represents the base command when called without any subcommands
|
||||
var rootCmd = &cobra.Command{
|
||||
Use: "cmd",
|
||||
Short: "cachet-monitor",
|
||||
// Uncomment the following line if your bare application
|
||||
// has an action associated with it:
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
Action(cmd, args)
|
||||
},
|
||||
}
|
||||
|
||||
// Execute adds all child commands to the root command and sets flags appropriately.
|
||||
// This is called by main.main(). It only needs to happen once to the rootCmd.
|
||||
func Execute() {
|
||||
if err := rootCmd.Execute(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
func init() {
|
||||
// Here you will define your flags and configuration settings.
|
||||
// Cobra supports persistent flags, which, if defined here,
|
||||
// will be global for your application.
|
||||
pf := rootCmd.PersistentFlags()
|
||||
pf.StringVarP(&cfgFile, "config", "c", "", "config file (default is $(pwd)/config.yml)")
|
||||
pf.String("log", "", "log output")
|
||||
pf.String("format", "text", "log format [text/json]")
|
||||
pf.String("name", "", "machine name")
|
||||
pf.Bool("immediate", false, "Tick immediately (by default waits for first defined")
|
||||
}
|
||||
|
||||
func Action(cmd *cobra.Command, args []string) {
|
||||
cfg, err := cachet.New(cfgFile)
|
||||
if err != nil {
|
||||
logrus.Panicf("Unable to start (reading config): %v", err)
|
||||
}
|
||||
|
||||
if immediate, err := cmd.Flags().GetBool("immediate"); err == nil && immediate {
|
||||
cfg.Immediate = immediate
|
||||
}
|
||||
if name, err := cmd.Flags().GetString("name"); err == nil && len(name) > 0 {
|
||||
cfg.SystemName = name
|
||||
}
|
||||
|
||||
logrus.SetOutput(getLogger(cmd))
|
||||
if format, err := cmd.Flags().GetString("format"); err == nil && format == "json" {
|
||||
logrus.SetFormatter(&logrus.JSONFormatter{})
|
||||
}
|
||||
|
||||
if valid := cfg.Validate(); !valid {
|
||||
logrus.Errorf("Invalid configuration")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
logrus.Debug("Configuration valid")
|
||||
logrus.Infof("System: %s", cfg.SystemName)
|
||||
// logrus.Infof("API: %s", cfg.API.URL)
|
||||
logrus.Infof("Monitors: %d", len(cfg.Monitors))
|
||||
logrus.Infof("Backend: %v", strings.Join(cfg.Backend.Describe(), "\n - "))
|
||||
|
||||
logrus.Infof("Pinging backend")
|
||||
if err := cfg.Backend.Ping(); err != nil {
|
||||
logrus.Errorf("Cannot ping backend!\n%v", err)
|
||||
// os.Exit(1)
|
||||
}
|
||||
logrus.Infof("Ping OK")
|
||||
logrus.Warnf("Starting!")
|
||||
|
||||
wg := &sync.WaitGroup{}
|
||||
for index, monitor := range cfg.Monitors {
|
||||
logrus.Infof("Starting Monitor #%d: ", index)
|
||||
logrus.Infof("Features: \n - %v", strings.Join(monitor.Describe(), "\n - "))
|
||||
|
||||
go monitor.Start(monitor.GetTestFunc(), wg, cfg.Backend.Tick, cfg.Immediate)
|
||||
}
|
||||
|
||||
signals := make(chan os.Signal, 1)
|
||||
signal.Notify(signals, os.Interrupt, os.Kill)
|
||||
<-signals
|
||||
|
||||
logrus.Warnf("Abort: Waiting for monitors to finish")
|
||||
for _, mon := range cfg.Monitors {
|
||||
mon.GetMonitor().Stop()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func getLogger(cmd *cobra.Command) *os.File {
|
||||
logPath, _ := cmd.Flags().GetString("log")
|
||||
if len(logPath) == 0 {
|
||||
return os.Stdout
|
||||
}
|
||||
|
||||
file, err := os.Create(logPath)
|
||||
if err != nil {
|
||||
logrus.Errorf("Unable to open file '%v' for logging: \n%v", logPath, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
return file
|
||||
}
|
||||
144
config.go
144
config.go
@@ -1,24 +1,132 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io/ioutil"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Sirupsen/logrus"
|
||||
"github.com/castawaylabs/cachet-monitor/backends"
|
||||
cachetbackend "github.com/castawaylabs/cachet-monitor/backends/cachet"
|
||||
"github.com/castawaylabs/cachet-monitor/monitors"
|
||||
|
||||
"github.com/mitchellh/mapstructure"
|
||||
"github.com/sirupsen/logrus"
|
||||
yaml "gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
type CachetMonitor struct {
|
||||
SystemName string `json:"system_name" yaml:"system_name"`
|
||||
DateFormat string `json:"date_format" yaml:"date_format"`
|
||||
API CachetAPI `json:"api"`
|
||||
RawMonitors []map[string]interface{} `json:"monitors" yaml:"monitors"`
|
||||
RawBackend map[string]interface{} `json:"backend" yaml:"backend"`
|
||||
|
||||
Monitors []MonitorInterface `json:"-" yaml:"-"`
|
||||
SystemName string `json:"system_name" yaml:"system_name"`
|
||||
Backend backends.BackendInterface `json:"-" yaml:"-"`
|
||||
Monitors []monitors.MonitorInterface `json:"-" yaml:"-"`
|
||||
Immediate bool `json:"-" yaml:"-"`
|
||||
}
|
||||
|
||||
func New(path string) (*CachetMonitor, error) {
|
||||
var cfg *CachetMonitor
|
||||
var data []byte
|
||||
|
||||
// test if its a url
|
||||
url, err := url.ParseRequestURI(path)
|
||||
if err == nil && len(url.Scheme) > 0 {
|
||||
// download config
|
||||
response, err := http.Get(path)
|
||||
if err != nil {
|
||||
logrus.Warn("Unable to download network configuration")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer response.Body.Close()
|
||||
data, _ = ioutil.ReadAll(response.Body)
|
||||
|
||||
logrus.Info("Downloaded network configuration.")
|
||||
} else {
|
||||
data, err = ioutil.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, errors.New("Unable to open file: '" + path + "'")
|
||||
}
|
||||
}
|
||||
|
||||
if strings.HasSuffix(path, ".yaml") || strings.HasSuffix(path, ".yml") {
|
||||
err = yaml.Unmarshal(data, &cfg)
|
||||
} else {
|
||||
err = json.Unmarshal(data, &cfg)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logrus.Warnf("Unable to parse configuration file")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// get default type
|
||||
if backend, ok := cfg.RawBackend["type"].(string); !ok {
|
||||
err = errors.New("Cannot determine backend type")
|
||||
} else {
|
||||
switch backend {
|
||||
case "cachet":
|
||||
var backend cachetbackend.CachetBackend
|
||||
err = mapstructure.Decode(cfg.RawBackend, &backend)
|
||||
cfg.Backend = &backend
|
||||
// backend.config = cfg
|
||||
default:
|
||||
err = errors.New("Invalid backend type: %v" + backend)
|
||||
}
|
||||
}
|
||||
|
||||
if errs := cfg.Backend.Validate(); len(errs) > 0 {
|
||||
logrus.Errorf("Backend validation errors: %v", errs)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logrus.Errorf("Unable to unmarshal backend: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
cfg.Monitors = make([]monitors.MonitorInterface, len(cfg.RawMonitors))
|
||||
for index, rawMonitor := range cfg.RawMonitors {
|
||||
var t monitors.MonitorInterface
|
||||
|
||||
// get default type
|
||||
monType := GetMonitorType("")
|
||||
if t, ok := rawMonitor["type"].(string); ok {
|
||||
monType = GetMonitorType(t)
|
||||
}
|
||||
|
||||
switch monType {
|
||||
case "http":
|
||||
var mon monitors.HTTPMonitor
|
||||
err = mapstructure.Decode(rawMonitor, &mon)
|
||||
t = &mon
|
||||
case "dns":
|
||||
var mon monitors.DNSMonitor
|
||||
err = mapstructure.Decode(rawMonitor, &mon)
|
||||
t = &mon
|
||||
default:
|
||||
logrus.Errorf("Invalid monitor type (index: %d) %v", index, monType)
|
||||
continue
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logrus.Errorf("Unable to unmarshal monitor to type (index: %d): %v", index, err)
|
||||
continue
|
||||
}
|
||||
|
||||
mon := t.GetMonitor()
|
||||
mon.Type = monType
|
||||
cfg.Monitors[index] = t
|
||||
}
|
||||
|
||||
return cfg, err
|
||||
}
|
||||
|
||||
// Validate configuration
|
||||
func (cfg *CachetMonitor) Validate() bool {
|
||||
valid := true
|
||||
@@ -28,22 +136,13 @@ func (cfg *CachetMonitor) Validate() bool {
|
||||
cfg.SystemName = getHostname()
|
||||
}
|
||||
|
||||
if len(cfg.DateFormat) == 0 {
|
||||
cfg.DateFormat = DefaultTimeFormat
|
||||
}
|
||||
|
||||
if len(cfg.API.Token) == 0 || len(cfg.API.URL) == 0 {
|
||||
logrus.Warnf("API URL or API Token missing.\nGet help at https://github.com/castawaylabs/cachet-monitor")
|
||||
valid = false
|
||||
}
|
||||
|
||||
if len(cfg.Monitors) == 0 {
|
||||
logrus.Warnf("No monitors defined!\nSee help for example configuration")
|
||||
valid = false
|
||||
}
|
||||
|
||||
for index, monitor := range cfg.Monitors {
|
||||
if errs := monitor.Validate(); len(errs) > 0 {
|
||||
if errs := monitor.Validate(cfg.Backend.ValidateMonitor); len(errs) > 0 {
|
||||
logrus.Warnf("Monitor validation errors (index %d): %v", index, "\n - "+strings.Join(errs, "\n - "))
|
||||
valid = false
|
||||
}
|
||||
@@ -67,10 +166,6 @@ func getHostname() string {
|
||||
return addrs[0].String()
|
||||
}
|
||||
|
||||
func getMs() int64 {
|
||||
return time.Now().UnixNano() / int64(time.Millisecond)
|
||||
}
|
||||
|
||||
func GetMonitorType(t string) string {
|
||||
if len(t) == 0 {
|
||||
return "http"
|
||||
@@ -78,12 +173,3 @@ func GetMonitorType(t string) string {
|
||||
|
||||
return strings.ToLower(t)
|
||||
}
|
||||
|
||||
func getTemplateData(monitor *AbstractMonitor) map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"SystemName": monitor.config.SystemName,
|
||||
"API": monitor.config.API,
|
||||
"Monitor": monitor,
|
||||
"now": time.Now().Format(monitor.config.DateFormat),
|
||||
}
|
||||
}
|
||||
|
||||
112
incident.go
112
incident.go
@@ -1,112 +0,0 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"github.com/Sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Incident Cachet data model
|
||||
type Incident struct {
|
||||
ID int `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Message string `json:"message"`
|
||||
Status int `json:"status"`
|
||||
Visible int `json"visible"`
|
||||
Notify bool `json:"notify"`
|
||||
|
||||
ComponentID int `json:"component_id"`
|
||||
ComponentStatus int `json:"component_status"`
|
||||
}
|
||||
|
||||
// Send - Create or Update incident
|
||||
func (incident *Incident) Send(cfg *CachetMonitor) error {
|
||||
switch incident.Status {
|
||||
case 1, 2, 3:
|
||||
// partial outage
|
||||
incident.ComponentStatus = 3
|
||||
|
||||
componentStatus, err := incident.GetComponentStatus(cfg)
|
||||
if componentStatus == 3 {
|
||||
// major outage
|
||||
incident.ComponentStatus = 4
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logrus.Warnf("cannot fetch component: %v", err)
|
||||
}
|
||||
case 4:
|
||||
// fixed
|
||||
incident.ComponentStatus = 1
|
||||
}
|
||||
|
||||
requestType := "POST"
|
||||
requestURL := "/incidents"
|
||||
if incident.ID > 0 {
|
||||
requestType = "PUT"
|
||||
requestURL += "/" + strconv.Itoa(incident.ID)
|
||||
}
|
||||
|
||||
jsonBytes, _ := json.Marshal(incident)
|
||||
|
||||
resp, body, err := cfg.API.NewRequest(requestType, requestURL, jsonBytes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var data struct {
|
||||
ID int `json:"id"`
|
||||
}
|
||||
if err := json.Unmarshal(body.Data, &data); err != nil {
|
||||
return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body.Data))
|
||||
}
|
||||
|
||||
incident.ID = data.ID
|
||||
if resp.StatusCode != 200 {
|
||||
return fmt.Errorf("Could not create/update incident!")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (incident *Incident) GetComponentStatus(cfg *CachetMonitor) (int, error) {
|
||||
resp, body, err := cfg.API.NewRequest("GET", "/components/"+strconv.Itoa(incident.ComponentID), nil)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return 0, fmt.Errorf("Invalid status code. Received %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var data struct {
|
||||
Status int `json:"status,string"`
|
||||
}
|
||||
if err := json.Unmarshal(body.Data, &data); err != nil {
|
||||
return 0, fmt.Errorf("Cannot parse component body: %v. Err = %v", string(body.Data), err)
|
||||
}
|
||||
|
||||
return data.Status, nil
|
||||
}
|
||||
|
||||
// SetInvestigating sets status to Investigating
|
||||
func (incident *Incident) SetInvestigating() {
|
||||
incident.Status = 1
|
||||
}
|
||||
|
||||
// SetIdentified sets status to Identified
|
||||
func (incident *Incident) SetIdentified() {
|
||||
incident.Status = 2
|
||||
}
|
||||
|
||||
// SetWatching sets status to Watching
|
||||
func (incident *Incident) SetWatching() {
|
||||
incident.Status = 3
|
||||
}
|
||||
|
||||
// SetFixed sets status to Fixed
|
||||
func (incident *Incident) SetFixed() {
|
||||
incident.Status = 4
|
||||
}
|
||||
10
make.sh
Executable file
10
make.sh
Executable file
@@ -0,0 +1,10 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
if [ "$1" == "test" ]; then
|
||||
reflex -r '\.go$' -s -d none -- sh -c 'go test ./...'
|
||||
fi
|
||||
|
||||
reflex -r '\.go$' -s -d none -- sh -c 'go build -o ./cachet-monitor ./cli/ && ./cachet-monitor -c config.yml'
|
||||
exit 0
|
||||
252
monitor.go
252
monitor.go
@@ -1,252 +0,0 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Sirupsen/logrus"
|
||||
)
|
||||
|
||||
const DefaultInterval = time.Second * 60
|
||||
const DefaultTimeout = time.Second
|
||||
const DefaultTimeFormat = "15:04:05 Jan 2 MST"
|
||||
const HistorySize = 10
|
||||
|
||||
type MonitorInterface interface {
|
||||
ClockStart(*CachetMonitor, MonitorInterface, *sync.WaitGroup)
|
||||
ClockStop()
|
||||
tick(MonitorInterface)
|
||||
test() bool
|
||||
|
||||
Validate() []string
|
||||
GetMonitor() *AbstractMonitor
|
||||
Describe() []string
|
||||
}
|
||||
|
||||
// AbstractMonitor data model
|
||||
type AbstractMonitor struct {
|
||||
Name string
|
||||
Target string
|
||||
|
||||
// (default)http / dns
|
||||
Type string
|
||||
Strict bool
|
||||
|
||||
Interval time.Duration
|
||||
Timeout time.Duration
|
||||
|
||||
MetricID int `mapstructure:"metric_id"`
|
||||
ComponentID int `mapstructure:"component_id"`
|
||||
|
||||
// Templating stuff
|
||||
Template struct {
|
||||
Investigating MessageTemplate
|
||||
Fixed MessageTemplate
|
||||
}
|
||||
|
||||
// Threshold = percentage / number of down incidents
|
||||
Threshold float32
|
||||
ThresholdCount bool `mapstructure:"threshold_count"`
|
||||
|
||||
// lag / average(lagHistory) * 100 = percentage above average lag
|
||||
// PerformanceThreshold sets the % limit above which this monitor will trigger degraded-performance
|
||||
// PerformanceThreshold float32
|
||||
|
||||
history []bool
|
||||
// lagHistory []float32
|
||||
lastFailReason string
|
||||
incident *Incident
|
||||
config *CachetMonitor
|
||||
|
||||
// Closed when mon.Stop() is called
|
||||
stopC chan bool
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) Validate() []string {
|
||||
errs := []string{}
|
||||
|
||||
if len(mon.Name) == 0 {
|
||||
errs = append(errs, "Name is required")
|
||||
}
|
||||
|
||||
if mon.Interval < 1 {
|
||||
mon.Interval = DefaultInterval
|
||||
}
|
||||
if mon.Timeout < 1 {
|
||||
mon.Timeout = DefaultTimeout
|
||||
}
|
||||
|
||||
if mon.Timeout > mon.Interval {
|
||||
errs = append(errs, "Timeout greater than interval")
|
||||
}
|
||||
|
||||
if mon.ComponentID == 0 && mon.MetricID == 0 {
|
||||
errs = append(errs, "component_id & metric_id are unset")
|
||||
}
|
||||
|
||||
if mon.Threshold <= 0 {
|
||||
mon.Threshold = 100
|
||||
}
|
||||
|
||||
if err := mon.Template.Fixed.Compile(); err != nil {
|
||||
errs = append(errs, "Could not compile \"fixed\" template: "+err.Error())
|
||||
}
|
||||
if err := mon.Template.Investigating.Compile(); err != nil {
|
||||
errs = append(errs, "Could not compile \"investigating\" template: "+err.Error())
|
||||
}
|
||||
|
||||
return errs
|
||||
}
|
||||
func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor {
|
||||
return mon
|
||||
}
|
||||
func (mon *AbstractMonitor) Describe() []string {
|
||||
features := []string{"Type: " + mon.Type}
|
||||
|
||||
if len(mon.Name) > 0 {
|
||||
features = append(features, "Name: "+mon.Name)
|
||||
}
|
||||
|
||||
return features
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) ClockStart(cfg *CachetMonitor, iface MonitorInterface, wg *sync.WaitGroup) {
|
||||
wg.Add(1)
|
||||
mon.config = cfg
|
||||
mon.stopC = make(chan bool)
|
||||
if cfg.Immediate {
|
||||
mon.tick(iface)
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(mon.Interval * time.Second)
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
mon.tick(iface)
|
||||
case <-mon.stopC:
|
||||
wg.Done()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) ClockStop() {
|
||||
select {
|
||||
case <-mon.stopC:
|
||||
return
|
||||
default:
|
||||
close(mon.stopC)
|
||||
}
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) test() bool { return false }
|
||||
|
||||
func (mon *AbstractMonitor) tick(iface MonitorInterface) {
|
||||
reqStart := getMs()
|
||||
up := iface.test()
|
||||
lag := getMs() - reqStart
|
||||
|
||||
histSize := HistorySize
|
||||
if mon.ThresholdCount {
|
||||
histSize = int(mon.Threshold)
|
||||
}
|
||||
|
||||
if len(mon.history) == histSize-1 {
|
||||
logrus.Warnf("%v is now saturated", mon.Name)
|
||||
}
|
||||
if len(mon.history) >= histSize {
|
||||
mon.history = mon.history[len(mon.history)-(histSize-1):]
|
||||
}
|
||||
mon.history = append(mon.history, up)
|
||||
mon.AnalyseData()
|
||||
|
||||
// report lag
|
||||
if mon.MetricID > 0 {
|
||||
go mon.config.API.SendMetric(mon.MetricID, lag)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: test
|
||||
// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident
|
||||
func (mon *AbstractMonitor) AnalyseData() {
|
||||
// look at the past few incidents
|
||||
numDown := 0
|
||||
for _, wasUp := range mon.history {
|
||||
if wasUp == false {
|
||||
numDown++
|
||||
}
|
||||
}
|
||||
|
||||
t := (float32(numDown) / float32(len(mon.history))) * 100
|
||||
l := logrus.WithFields(logrus.Fields{
|
||||
"monitor": mon.Name,
|
||||
"time": time.Now().Format(mon.config.DateFormat),
|
||||
})
|
||||
if numDown == 0 {
|
||||
l.Printf("monitor is up")
|
||||
} else if mon.ThresholdCount {
|
||||
l.Printf("monitor down %d/%d", numDown, int(mon.Threshold))
|
||||
} else {
|
||||
l.Printf("monitor down %.2f%%/%.2f%%", t, mon.Threshold)
|
||||
}
|
||||
|
||||
histSize := HistorySize
|
||||
if mon.ThresholdCount {
|
||||
histSize = int(mon.Threshold)
|
||||
}
|
||||
|
||||
if len(mon.history) != histSize {
|
||||
// not saturated
|
||||
return
|
||||
}
|
||||
|
||||
triggered := (mon.ThresholdCount && numDown == int(mon.Threshold)) || (!mon.ThresholdCount && t > mon.Threshold)
|
||||
|
||||
if triggered && mon.incident == nil {
|
||||
// create incident
|
||||
tplData := getTemplateData(mon)
|
||||
tplData["FailReason"] = mon.lastFailReason
|
||||
|
||||
subject, message := mon.Template.Investigating.Exec(tplData)
|
||||
mon.incident = &Incident{
|
||||
Name: subject,
|
||||
ComponentID: mon.ComponentID,
|
||||
Message: message,
|
||||
Notify: true,
|
||||
}
|
||||
|
||||
// is down, create an incident
|
||||
l.Warnf("creating incident. Monitor is down: %v", mon.lastFailReason)
|
||||
// set investigating status
|
||||
mon.incident.SetInvestigating()
|
||||
// create/update incident
|
||||
if err := mon.incident.Send(mon.config); err != nil {
|
||||
l.Printf("Error sending incident: %v", err)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// still triggered or no incident
|
||||
if triggered || mon.incident == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// was down, created an incident, its now ok, make it resolved.
|
||||
l.Warn("Resolving incident")
|
||||
|
||||
// resolve incident
|
||||
tplData := getTemplateData(mon)
|
||||
tplData["incident"] = mon.incident
|
||||
|
||||
subject, message := mon.Template.Fixed.Exec(tplData)
|
||||
mon.incident.Name = subject
|
||||
mon.incident.Message = message
|
||||
mon.incident.SetFixed()
|
||||
if err := mon.incident.Send(mon.config); err != nil {
|
||||
l.Printf("Error sending incident: %v", err)
|
||||
}
|
||||
|
||||
mon.lastFailReason = ""
|
||||
mon.incident = nil
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
package cachet
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestAnalyseData(t *testing.T) {}
|
||||
@@ -1,14 +1,33 @@
|
||||
package cachet
|
||||
package monitors
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"net"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/Sirupsen/logrus"
|
||||
"github.com/miekg/dns"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Investigating template
|
||||
var defaultDNSInvestigatingTpl = MessageTemplate{
|
||||
Subject: `{{ .Monitor.Name }} - {{ .SystemName }}`,
|
||||
Message: `{{ .Monitor.Name }} DNS check **failed** (server time: {{ .now }})
|
||||
|
||||
{{ .FailReason }}`,
|
||||
}
|
||||
|
||||
// Fixed template
|
||||
var defaultDNSFixedTpl = MessageTemplate{
|
||||
Subject: `{{ .Monitor.Name }} - {{ .SystemName }}`,
|
||||
Message: `**Resolved** - {{ .now }}
|
||||
|
||||
- - -
|
||||
|
||||
{{ .incident.Message }}`,
|
||||
}
|
||||
|
||||
type DNSAnswer struct {
|
||||
Regex string
|
||||
regexp *regexp.Regexp
|
||||
@@ -28,8 +47,11 @@ type DNSMonitor struct {
|
||||
Answers []DNSAnswer
|
||||
}
|
||||
|
||||
func (monitor *DNSMonitor) Validate() []string {
|
||||
errs := monitor.AbstractMonitor.Validate()
|
||||
func (monitor *DNSMonitor) Validate(validate backendValidateFunc) []string {
|
||||
monitor.Template.Investigating.SetDefault(defaultDNSInvestigatingTpl)
|
||||
monitor.Template.Fixed.SetDefault(defaultDNSFixedTpl)
|
||||
|
||||
errs := monitor.AbstractMonitor.Validate(validate)
|
||||
|
||||
if len(monitor.DNS) == 0 {
|
||||
config, _ := dns.ClientConfigFromFile("/etc/resolv.conf")
|
||||
@@ -61,7 +83,7 @@ func (monitor *DNSMonitor) Validate() []string {
|
||||
return errs
|
||||
}
|
||||
|
||||
func (monitor *DNSMonitor) test() bool {
|
||||
func (monitor *DNSMonitor) test() (bool, []error) {
|
||||
m := new(dns.Msg)
|
||||
m.SetQuestion(dns.Fqdn(monitor.Target), monitor.question)
|
||||
m.RecursionDesired = true
|
||||
@@ -70,11 +92,11 @@ func (monitor *DNSMonitor) test() bool {
|
||||
r, _, err := c.Exchange(m, monitor.DNS)
|
||||
if err != nil {
|
||||
logrus.Warnf("DNS error: %v", err)
|
||||
return false
|
||||
return false, []error{err}
|
||||
}
|
||||
|
||||
if r.Rcode != dns.RcodeSuccess {
|
||||
return false
|
||||
return false, []error{errors.New("Invalid status code returned")}
|
||||
}
|
||||
|
||||
for _, check := range monitor.Answers {
|
||||
@@ -88,11 +110,11 @@ func (monitor *DNSMonitor) test() bool {
|
||||
|
||||
if !found {
|
||||
logrus.Warnf("DNS check failed: %v. Not found in any of %v", check, r.Answer)
|
||||
return false
|
||||
return false, []error{errors.New("Record not found")}
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func findDNSType(t string) uint16 {
|
||||
@@ -1,7 +1,8 @@
|
||||
package cachet
|
||||
package monitors
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"errors"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"regexp"
|
||||
@@ -13,7 +14,7 @@ import (
|
||||
// Investigating template
|
||||
var defaultHTTPInvestigatingTpl = MessageTemplate{
|
||||
Subject: `{{ .Monitor.Name }} - {{ .SystemName }}`,
|
||||
Message: `{{ .Monitor.Name }} check **failed** (server time: {{ .now }})
|
||||
Message: `{{ .Monitor.Name }} HTTP check **failed** (server time: {{ .now }})
|
||||
|
||||
{{ .FailReason }}`,
|
||||
}
|
||||
@@ -41,7 +42,7 @@ type HTTPMonitor struct {
|
||||
}
|
||||
|
||||
// TODO: test
|
||||
func (monitor *HTTPMonitor) test() bool {
|
||||
func (monitor *HTTPMonitor) test() (bool, []error) {
|
||||
req, err := http.NewRequest(monitor.Method, monitor.Target, nil)
|
||||
for k, v := range monitor.Headers {
|
||||
req.Header.Add(k, v)
|
||||
@@ -56,40 +57,38 @@ func (monitor *HTTPMonitor) test() bool {
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
monitor.lastFailReason = err.Error()
|
||||
return false
|
||||
return false, []error{err}
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
if monitor.ExpectedStatusCode > 0 && resp.StatusCode != monitor.ExpectedStatusCode {
|
||||
monitor.lastFailReason = "Expected HTTP response status: " + strconv.Itoa(monitor.ExpectedStatusCode) + ", got: " + strconv.Itoa(resp.StatusCode)
|
||||
return false
|
||||
fail := "Expected HTTP response status: " + strconv.Itoa(monitor.ExpectedStatusCode) + ", got: " + strconv.Itoa(resp.StatusCode)
|
||||
return false, []error{errors.New(fail)}
|
||||
}
|
||||
|
||||
if monitor.bodyRegexp != nil {
|
||||
// check response body
|
||||
responseBody, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
monitor.lastFailReason = err.Error()
|
||||
return false
|
||||
return false, []error{err}
|
||||
}
|
||||
|
||||
if !monitor.bodyRegexp.Match(responseBody) {
|
||||
monitor.lastFailReason = "Unexpected body: " + string(responseBody) + ".\nExpected to match: " + monitor.ExpectedBody
|
||||
return false
|
||||
fail := "Unexpected body: " + string(responseBody) + ".\nExpected to match: " + monitor.ExpectedBody
|
||||
return false, []error{errors.New(fail)}
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// TODO: test
|
||||
func (mon *HTTPMonitor) Validate() []string {
|
||||
func (mon *HTTPMonitor) Validate(validate backendValidateFunc) []string {
|
||||
mon.Template.Investigating.SetDefault(defaultHTTPInvestigatingTpl)
|
||||
mon.Template.Fixed.SetDefault(defaultHTTPFixedTpl)
|
||||
|
||||
errs := mon.AbstractMonitor.Validate()
|
||||
errs := mon.AbstractMonitor.Validate(validate)
|
||||
|
||||
if len(mon.ExpectedBody) > 0 {
|
||||
exp, err := regexp.Compile(mon.ExpectedBody)
|
||||
257
monitors/monitor.go
Normal file
257
monitors/monitor.go
Normal file
@@ -0,0 +1,257 @@
|
||||
package monitors
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
const DefaultInterval = time.Second * 60
|
||||
const DefaultTimeout = time.Second
|
||||
const HistorySize = 10
|
||||
|
||||
type MonitorStatus string
|
||||
|
||||
const (
|
||||
MonitorStatusUp = "up"
|
||||
MonitorStatusDown = "down"
|
||||
MonitorStatusNotSaturated = "unsaturated"
|
||||
)
|
||||
|
||||
type backendValidateFunc = func(monitor *AbstractMonitor) []string
|
||||
type MonitorTestFunc func() (up bool, errs []error)
|
||||
type MonitorTickFunc func(monitor MonitorInterface, status MonitorStatus, errs []error, lag int64)
|
||||
|
||||
type MonitorInterface interface {
|
||||
Start(MonitorTestFunc, *sync.WaitGroup, MonitorTickFunc, bool)
|
||||
Stop()
|
||||
|
||||
tick(MonitorTestFunc) (status MonitorStatus, errors []error, lag int64)
|
||||
test() (bool, []error)
|
||||
|
||||
Validate(validate backendValidateFunc) []string
|
||||
Describe() []string
|
||||
|
||||
GetMonitor() *AbstractMonitor
|
||||
GetTestFunc() MonitorTestFunc
|
||||
GetLastStatus() MonitorStatus
|
||||
UpdateLastStatus(status MonitorStatus) (old MonitorStatus)
|
||||
}
|
||||
|
||||
// AbstractMonitor data model
|
||||
type AbstractMonitor struct {
|
||||
Name string
|
||||
Target string
|
||||
|
||||
// (default)http / dns
|
||||
Type string
|
||||
Strict bool
|
||||
|
||||
Interval time.Duration
|
||||
Timeout time.Duration
|
||||
Params map[string]interface{}
|
||||
|
||||
// Templating stuff
|
||||
Template MonitorTemplates
|
||||
|
||||
// Threshold = percentage / number of down incidents
|
||||
Threshold float32
|
||||
ThresholdCount bool `mapstructure:"threshold_count"`
|
||||
|
||||
// lag / average(lagHistory) * 100 = percentage above average lag
|
||||
// PerformanceThreshold sets the % limit above which this monitor will trigger degraded-performance
|
||||
// PerformanceThreshold float32
|
||||
|
||||
history []bool
|
||||
lastStatus MonitorStatus
|
||||
|
||||
// Closed when mon.Stop() is called
|
||||
stopC chan bool
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) Validate(validate backendValidateFunc) []string {
|
||||
errs := []string{}
|
||||
|
||||
if len(mon.Name) == 0 {
|
||||
errs = append(errs, "Name is required")
|
||||
}
|
||||
|
||||
if mon.Interval < 1 {
|
||||
mon.Interval = DefaultInterval
|
||||
}
|
||||
if mon.Timeout < 1 {
|
||||
mon.Timeout = DefaultTimeout
|
||||
}
|
||||
|
||||
if mon.Timeout > mon.Interval {
|
||||
errs = append(errs, "Timeout greater than interval")
|
||||
}
|
||||
|
||||
// get the backend to validate the monitor
|
||||
errs = append(errs, validate(mon)...)
|
||||
|
||||
if mon.Threshold <= 0 {
|
||||
mon.Threshold = 100
|
||||
}
|
||||
|
||||
// if len(mon.Template.Fixed.Message) == 0 || len(mon.Template.Fixed.Subject) == 0 {
|
||||
// errs = append(errs, "\"fixed\" template empty/missing")
|
||||
// }
|
||||
// if len(mon.Template.Investigating.Message) == 0 || len(mon.Template.Investigating.Subject) == 0 {
|
||||
// errs = append(errs, "\"investigating\" template empty/missing")
|
||||
// }
|
||||
if err := mon.Template.Fixed.Compile(); err != nil {
|
||||
errs = append(errs, "Could not compile \"fixed\" template: "+err.Error())
|
||||
}
|
||||
if err := mon.Template.Investigating.Compile(); err != nil {
|
||||
errs = append(errs, "Could not compile \"investigating\" template: "+err.Error())
|
||||
}
|
||||
|
||||
return errs
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor {
|
||||
return mon
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) Describe() []string {
|
||||
features := []string{"Type: " + mon.Type}
|
||||
|
||||
if len(mon.Name) > 0 {
|
||||
features = append(features, "Name: "+mon.Name)
|
||||
}
|
||||
|
||||
return features
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) Start(testFunc MonitorTestFunc, wg *sync.WaitGroup, tickFunc MonitorTickFunc, immediate bool) {
|
||||
wg.Add(1)
|
||||
|
||||
mon.stopC = make(chan bool)
|
||||
if immediate {
|
||||
status, errs, lag := mon.tick(testFunc)
|
||||
tickFunc(mon, status, errs, lag)
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(mon.Interval * time.Second)
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
status, errs, lag := mon.tick(testFunc)
|
||||
tickFunc(mon, status, errs, lag)
|
||||
case <-mon.stopC:
|
||||
wg.Done()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) Stop() {
|
||||
select {
|
||||
case <-mon.stopC:
|
||||
return
|
||||
default:
|
||||
close(mon.stopC)
|
||||
}
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) tick(testFunc MonitorTestFunc) (status MonitorStatus, errors []error, lag int64) {
|
||||
reqStart := getMs()
|
||||
up, errs := testFunc()
|
||||
lag = getMs() - reqStart
|
||||
|
||||
histSize := HistorySize
|
||||
if mon.ThresholdCount {
|
||||
histSize = int(mon.Threshold)
|
||||
}
|
||||
|
||||
if len(mon.history) == histSize-1 {
|
||||
logrus.WithFields(logrus.Fields{
|
||||
"monitor": mon.Name,
|
||||
}).Warn("monitor saturated")
|
||||
}
|
||||
if len(mon.history) >= histSize {
|
||||
mon.history = mon.history[len(mon.history)-(histSize-1):]
|
||||
}
|
||||
mon.history = append(mon.history, up)
|
||||
status = mon.GetStatus()
|
||||
errors = errs
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// TODO: test
|
||||
// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident
|
||||
func (mon *AbstractMonitor) GetStatus() MonitorStatus {
|
||||
numDown := 0
|
||||
for _, wasUp := range mon.history {
|
||||
if wasUp == false {
|
||||
numDown++
|
||||
}
|
||||
}
|
||||
|
||||
t := (float32(numDown) / float32(len(mon.history))) * 100
|
||||
logFields := logrus.Fields{"monitor": mon.Name}
|
||||
// stop reporting time for jsonformatter, it's there by default
|
||||
if _, ok := logrus.StandardLogger().Formatter.(*logrus.JSONFormatter); !ok {
|
||||
logFields["t"] = time.Now()
|
||||
}
|
||||
l := logrus.WithFields(logFields)
|
||||
|
||||
symbol := "⚠️"
|
||||
if t == 100 {
|
||||
symbol = "❌"
|
||||
}
|
||||
if numDown == 0 {
|
||||
l.Printf("👍 up")
|
||||
} else if mon.ThresholdCount {
|
||||
l.Printf("%v down (%d/%d)", symbol, numDown, int(mon.Threshold))
|
||||
} else {
|
||||
l.Printf("%v down %.0f%%/%.0f%%", symbol, t, mon.Threshold)
|
||||
}
|
||||
|
||||
histSize := HistorySize
|
||||
if mon.ThresholdCount {
|
||||
histSize = int(mon.Threshold)
|
||||
}
|
||||
|
||||
if len(mon.history) != histSize {
|
||||
// not saturated
|
||||
return MonitorStatusNotSaturated
|
||||
}
|
||||
|
||||
var down bool
|
||||
if mon.ThresholdCount {
|
||||
down = numDown >= int(mon.Threshold)
|
||||
} else {
|
||||
down = t >= mon.Threshold
|
||||
}
|
||||
|
||||
if !down {
|
||||
return MonitorStatusUp
|
||||
}
|
||||
|
||||
return MonitorStatusDown
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) GetTestFunc() MonitorTestFunc {
|
||||
return mon.test
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) GetLastStatus() MonitorStatus {
|
||||
return mon.lastStatus
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) UpdateLastStatus(status MonitorStatus) (old MonitorStatus) {
|
||||
old = mon.lastStatus
|
||||
mon.lastStatus = status
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (mon *AbstractMonitor) test() (bool, []error) { return false, nil }
|
||||
|
||||
func getMs() int64 {
|
||||
return time.Now().UnixNano() / int64(time.Millisecond)
|
||||
}
|
||||
@@ -1,10 +1,15 @@
|
||||
package cachet
|
||||
package monitors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"text/template"
|
||||
)
|
||||
|
||||
type MonitorTemplates struct {
|
||||
Investigating MessageTemplate
|
||||
Fixed MessageTemplate
|
||||
}
|
||||
|
||||
type MessageTemplate struct {
|
||||
Subject string `json:"subject"`
|
||||
Message string `json:"message"`
|
||||
Reference in New Issue
Block a user