From 162d55b3f38bb20f80f998ee2e639894324da236 Mon Sep 17 00:00:00 2001 From: Matej Kramny Date: Wed, 20 Feb 2019 11:14:45 +0800 Subject: [PATCH] huge refactor - extendable backends - better project structure - better cli interface --- .gitignore | 4 +- api.go | 85 -------- backends/cachet/backend.go | 288 ++++++++++++++++++++++++++++ backends/cachet/component.go | 14 ++ backends/cachet/incident.go | 71 +++++++ backends/interface.go | 19 ++ cli/main.go | 195 +------------------ cli/root.go | 117 +++++++++++ config.go | 146 +++++++++++--- incident.go | 112 ----------- make.sh | 10 + monitor.go | 252 ------------------------ monitor_test.go | 7 - dns.go => monitors/dns.go | 40 +++- http.go => monitors/http.go | 27 ++- monitors/monitor.go | 257 +++++++++++++++++++++++++ template.go => monitors/template.go | 7 +- 17 files changed, 946 insertions(+), 705 deletions(-) delete mode 100644 api.go create mode 100644 backends/cachet/backend.go create mode 100644 backends/cachet/component.go create mode 100644 backends/cachet/incident.go create mode 100644 backends/interface.go create mode 100644 cli/root.go delete mode 100644 incident.go create mode 100755 make.sh delete mode 100644 monitor.go delete mode 100644 monitor_test.go rename dns.go => monitors/dns.go (67%) rename http.go => monitors/http.go (78%) create mode 100644 monitors/monitor.go rename template.go => monitors/template.go (90%) diff --git a/.gitignore b/.gitignore index d969c09..47c5d8a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ /config.yml /config.json -examples/ \ No newline at end of file +examples/ +vendor/ +cachet-monitor diff --git a/api.go b/api.go deleted file mode 100644 index 397c0e9..0000000 --- a/api.go +++ /dev/null @@ -1,85 +0,0 @@ -package cachet - -import ( - "bytes" - "crypto/tls" - "encoding/json" - "errors" - "net/http" - "strconv" - "time" - - "github.com/Sirupsen/logrus" -) - -type CachetAPI struct { - URL string `json:"url"` - Token string `json:"token"` - Insecure bool `json:"insecure"` -} - -type CachetResponse struct { - Data json.RawMessage `json:"data"` -} - -// TODO: test -func (api CachetAPI) Ping() error { - resp, _, err := api.NewRequest("GET", "/ping", nil) - if err != nil { - return err - } - - if resp.StatusCode != 200 { - return errors.New("API Responded with non-200 status code") - } - - defer resp.Body.Close() - - return nil -} - -// SendMetric adds a data point to a cachet monitor -func (api CachetAPI) SendMetric(id int, lag int64) { - logrus.Debugf("Sending lag metric ID:%d RTT %vms", id, lag) - - jsonBytes, _ := json.Marshal(map[string]interface{}{ - "value": lag, - "timestamp": time.Now().Unix(), - }) - - resp, _, err := api.NewRequest("POST", "/metrics/"+strconv.Itoa(id)+"/points", jsonBytes) - if err != nil || resp.StatusCode != 200 { - logrus.Warnf("Could not log metric! ID: %d, err: %v", id, err) - } - - defer resp.Body.Close() -} - -// TODO: test -// NewRequest wraps http.NewRequest -func (api CachetAPI) NewRequest(requestType, url string, reqBody []byte) (*http.Response, CachetResponse, error) { - req, err := http.NewRequest(requestType, api.URL+url, bytes.NewBuffer(reqBody)) - - req.Header.Set("Content-Type", "application/json") - req.Header.Set("X-Cachet-Token", api.Token) - - transport := http.DefaultTransport.(*http.Transport) - transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: api.Insecure} - client := &http.Client{ - Transport: transport, - } - - res, err := client.Do(req) - if err != nil { - return nil, CachetResponse{}, err - } - - var body struct { - Data json.RawMessage `json:"data"` - } - err = json.NewDecoder(res.Body).Decode(&body) - - defer req.Body.Close() - - return res, body, err -} diff --git a/backends/cachet/backend.go b/backends/cachet/backend.go new file mode 100644 index 0000000..52daa0d --- /dev/null +++ b/backends/cachet/backend.go @@ -0,0 +1,288 @@ +package cachetbackend + +import ( + "bytes" + "crypto/tls" + "encoding/json" + "errors" + "fmt" + "net/http" + "strconv" + "strings" + "time" + + "github.com/castawaylabs/cachet-monitor/monitors" + "github.com/sirupsen/logrus" +) + +const DefaultTimeFormat = "15:04:05 Jan 2 MST" + +type CachetBackend struct { + URL string `json:"url" yaml:"url"` + Token string `json:"token" yaml:"token"` + Insecure bool `json:"insecure" yaml:"insecure"` + DateFormat string `json:"date_format" yaml:"date_format"` +} + +type CachetResponse struct { + Data json.RawMessage `json:"data"` +} + +func (api CachetBackend) ValidateMonitor(mon *monitors.AbstractMonitor) []string { + errs := []string{} + + params := mon.Params + + componentID, componentIDOk := params["component_id"] + metricID, metricIDOk := params["metric_id"] + if !componentIDOk && !metricIDOk { + errs = append(errs, "component_id and metric_id is unset") + } + + if _, ok := componentID.(int); !ok && componentIDOk { + errs = append(errs, "component_id not integer") + } + if _, ok := metricID.(int); !ok && metricIDOk { + errs = append(errs, "metric_id not integer") + } + + return errs +} + +func (api CachetBackend) Validate() []string { + errs := []string{} + + if len(api.URL) == 0 { + errs = append(errs, "Cachet API URL invalid") + } + if len(api.Token) == 0 { + errs = append(errs, "Cachet API Token invalid") + } + + if len(api.DateFormat) == 0 { + api.DateFormat = DefaultTimeFormat + } + + return errs +} + +// TODO: test +func (api CachetBackend) Ping() error { + resp, _, err := api.NewRequest("GET", "/ping", nil) + if err != nil { + return err + } + + if resp.StatusCode != 200 { + return errors.New("API Responded with non-200 status code") + } + + defer resp.Body.Close() + + return nil +} + +// TODO: test +// NewRequest wraps http.NewRequest +func (api CachetBackend) NewRequest(requestType, url string, reqBody []byte) (*http.Response, interface{}, error) { + req, err := http.NewRequest(requestType, api.URL+url, bytes.NewBuffer(reqBody)) + + req.Header.Set("Content-Type", "application/json") + req.Header.Set("X-Cachet-Token", api.Token) + + transport := http.DefaultTransport.(*http.Transport) + transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: api.Insecure} + client := &http.Client{ + Transport: transport, + } + + res, err := client.Do(req) + if err != nil { + return nil, CachetResponse{}, err + } + defer res.Body.Close() + defer req.Body.Close() + + var body struct { + Data json.RawMessage `json:"data"` + } + err = json.NewDecoder(res.Body).Decode(&body) + + return res, body, err +} + +func (mon CachetBackend) Describe() []string { + features := []string{"Cachet API"} + + return features +} + +func (api CachetBackend) SendMetric(monitor monitors.MonitorInterface, lag int64) error { + mon := monitor.GetMonitor() + if _, ok := mon.Params["metric_id"]; !ok { + return nil + } + + metricID := mon.Params["metric_id"].(int) + + // report lag + logrus.Debugf("Sending lag metric ID: %d RTT %vms", metricID, lag) + + jsonBytes, _ := json.Marshal(map[string]interface{}{ + "value": lag, + "timestamp": time.Now().Unix(), + }) + + resp, _, err := api.NewRequest("POST", "/metrics/"+strconv.Itoa(metricID)+"/points", jsonBytes) + if err != nil || resp.StatusCode != 200 { + logrus.Warnf("Could not log metric! ID: %d, err: %v", metricID, err) + } + + if resp != nil && resp.Body != nil { + defer resp.Body.Close() + } + + return nil +} + +func (api CachetBackend) UpdateMonitor(mon monitors.MonitorInterface, status, previousStatus monitors.MonitorStatus, errs []error) error { + monitor := mon.GetMonitor() + l := logrus.WithFields(logrus.Fields{ + "monitor": monitor.Name, + "time": time.Now().Format(api.DateFormat), + }) + + errors := make([]string, len(errs)) + for i, err := range errs { + errors[i] = err.Error() + } + + fmt.Println("errs", errs) + + componentID := monitor.Params["component_id"].(int) + incident, err := api.findIncident(componentID) + if err != nil { + l.Errorf("Couldn't find existing incidents: %v", err) + } + + if incident == nil { + // create a new one + incident = &Incident{ + Name: "", + ComponentID: componentID, + Message: "", + Notify: true, + } + } else { + // find component status + component, err := api.getComponent(incident.ComponentID) + if err != nil { + panic(err) + } + + incident.ComponentStatus = component.Status + } + + tpls := monitor.Template + tplData := api.getTemplateData(monitor) + var tpl monitors.MessageTemplate + + if status == monitors.MonitorStatusDown { + tpl = tpls.Investigating + tplData["FailReason"] = strings.Join(errors, "\n - ") + l.Warnf("updating component. Monitor is down: %v", tplData["FailReason"]) + } else { + // was down, created an incident, its now ok, make it resolved. + tpl = tpls.Fixed + l.Warn("Resolving incident") + } + + tplData["incident"] = incident + subject, message := tpl.Exec(tplData) + + if incident.ID == 0 { + incident.Name = subject + incident.Message = message + } else { + incident.Message += "\n\n---\n\n" + subject + ":\n\n" + message + } + + if status == monitors.MonitorStatusDown && (incident.ComponentStatus == 0 || incident.ComponentStatus > 2) { + incident.Status = 1 + fmt.Println("incident status", incident.ComponentStatus) + if incident.ComponentStatus >= 3 { + // major outage + incident.ComponentStatus = 4 + } else { + incident.ComponentStatus = 3 + } + } else if status == monitors.MonitorStatusUp { + incident.Status = 4 + incident.ComponentStatus = 1 + } + incident.Notify = true + + // create/update incident + if err := incident.Send(api); err != nil { + l.Errorf("Error sending incident: %v", err) + return err + } + + return nil +} + +func (api CachetBackend) Tick(monitor monitors.MonitorInterface, status monitors.MonitorStatus, errs []error, lag int64) { + mon := monitor.GetMonitor() + if mon.GetLastStatus() == status || status == monitors.MonitorStatusNotSaturated { + return + } + + logrus.Infof("updating backend for monitor") + lastStatus := mon.UpdateLastStatus(status) + + api.UpdateMonitor(monitor, status, lastStatus, errs) + + if _, ok := mon.Params["metric_id"]; ok && lag > 0 { + api.SendMetric(monitor, lag) + } +} + +func (api CachetBackend) getComponent(componentID int) (*Component, error) { + return nil, nil +} + +func (api CachetBackend) findIncident(componentID int) (*Incident, error) { + // fetch watching, identified & investigating + statuses := []int{3, 2, 1} + for _, status := range statuses { + incidents, err := api.findIncidents(componentID, status) + if err != nil { + return nil, err + } + + for _, incident := range incidents { + incident.Status = status + return incident, nil + } + } + + return nil, nil +} + +func (api CachetBackend) findIncidents(componentID int, status int) ([]*Incident, error) { + resp, body, err := api.NewRequest("GET", "/incidents?component_id="+strconv.Itoa(componentID)+"&status="+strconv.Itoa(status), nil) + if err != nil { + return nil, err + } + + var data []*Incident + if err := json.Unmarshal(body.(CachetResponse).Data, &data); err != nil { + return nil, fmt.Errorf("Cannot find incidents: %v", err) + } + + if resp.StatusCode != 200 { + return nil, fmt.Errorf("Could not fetch incidents! %v", err) + } + + return data, nil +} diff --git a/backends/cachet/component.go b/backends/cachet/component.go new file mode 100644 index 0000000..3469368 --- /dev/null +++ b/backends/cachet/component.go @@ -0,0 +1,14 @@ +package cachetbackend + +// Incident Cachet data model +type Component struct { + ID int `json:"id"` + Name string `json:"name"` + Message string `json:"message"` + Status int `json:"status"` + Visible int `json:"visible"` + Notify bool `json:"notify"` + + ComponentID int `json:"component_id"` + ComponentStatus int `json:"component_status"` +} diff --git a/backends/cachet/incident.go b/backends/cachet/incident.go new file mode 100644 index 0000000..9887956 --- /dev/null +++ b/backends/cachet/incident.go @@ -0,0 +1,71 @@ +package cachetbackend + +import ( + "encoding/json" + "fmt" + "strconv" + "time" + + "github.com/castawaylabs/cachet-monitor/backends" + "github.com/castawaylabs/cachet-monitor/monitors" +) + +// "encoding/json" +// "fmt" +// "strconv" + +// "github.com/sirupsen/logrus" + +// Incident Cachet data model +type Incident struct { + ID int `json:"id"` + Name string `json:"name"` + Message string `json:"message"` + Status int `json:"status"` + Visible int `json:"visible"` + Notify bool `json:"notify"` + + ComponentID int `json:"component_id"` + ComponentStatus int `json:"component_status"` +} + +// Send - Create or Update incident +func (incident *Incident) Send(backend backends.BackendInterface) error { + requestURL := "/incidents" + requestMethod := "POST" + jsonBytes, _ := json.Marshal(incident) + + if incident.ID > 0 { + // create an incident update + requestMethod = "PUT" + requestURL += "/" + strconv.Itoa(incident.ID) + } + + resp, body, err := backend.NewRequest(requestMethod, requestURL, jsonBytes) + if err != nil { + return err + } + + var data struct { + ID int `json:"id"` + } + if err := json.Unmarshal(body.(CachetResponse).Data, &data); err != nil { + return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body.(CachetResponse).Data)) + } + + incident.ID = data.ID + if resp.StatusCode != 200 { + return fmt.Errorf("Could not update/create incident!") + } + + return nil +} + +func (api *CachetBackend) getTemplateData(monitor *monitors.AbstractMonitor) map[string]interface{} { + return map[string]interface{}{ + // "SystemName": monitor.config.SystemName, + "Monitor": monitor, + "now": time.Now().Format(api.DateFormat), + // "incident": monitor.incident, + } +} diff --git a/backends/interface.go b/backends/interface.go new file mode 100644 index 0000000..ae3fec0 --- /dev/null +++ b/backends/interface.go @@ -0,0 +1,19 @@ +package backends + +import ( + "net/http" + + "github.com/castawaylabs/cachet-monitor/monitors" +) + +type BackendInterface interface { + Ping() error + Tick(monitor monitors.MonitorInterface, status monitors.MonitorStatus, errs []error, lag int64) + SendMetric(monitor monitors.MonitorInterface, lag int64) error + UpdateMonitor(monitor monitors.MonitorInterface, status, previousStatus monitors.MonitorStatus, errs []error) error + NewRequest(requestType, url string, reqBody []byte) (*http.Response, interface{}, error) + + Describe() []string + Validate() []string + ValidateMonitor(monitor *monitors.AbstractMonitor) []string +} diff --git a/cli/main.go b/cli/main.go index 2ac47ce..736ef31 100644 --- a/cli/main.go +++ b/cli/main.go @@ -1,198 +1,5 @@ package main -import ( - "encoding/json" - "errors" - "io/ioutil" - "net/http" - "net/url" - "os" - "os/signal" - "strings" - "sync" - - "github.com/Sirupsen/logrus" - cachet "github.com/castawaylabs/cachet-monitor" - docopt "github.com/docopt/docopt-go" - "github.com/mitchellh/mapstructure" - "gopkg.in/yaml.v2" -) - -const usage = `cachet-monitor - -Usage: - cachet-monitor (-c PATH | --config PATH) [--log=LOGPATH] [--name=NAME] [--immediate] - cachet-monitor -h | --help | --version - -Arguments: - PATH path to config.json - LOGPATH path to log output (defaults to STDOUT) - NAME name of this logger - -Examples: - cachet-monitor -c /root/cachet-monitor.json - cachet-monitor -c /root/cachet-monitor.json --log=/var/log/cachet-monitor.log --name="development machine" - -Options: - -c PATH.json --config PATH Path to configuration file - -h --help Show this screen. - --version Show version - --immediate Tick immediately (by default waits for first defined interval) - -Environment varaibles: - CACHET_API override API url from configuration - CACHET_TOKEN override API token from configuration - CACHET_DEV set to enable dev logging` - -var version string - func main() { - arguments, _ := docopt.Parse(usage, nil, true, version, false) - - cfg, err := getConfiguration(arguments["--config"].(string)) - if err != nil { - logrus.Panicf("Unable to start (reading config): %v", err) - } - - if immediate, ok := arguments["--immediate"]; ok { - cfg.Immediate = immediate.(bool) - } - - if name := arguments["--name"]; name != nil { - cfg.SystemName = name.(string) - } - logrus.SetOutput(getLogger(arguments["--log"])) - - if len(os.Getenv("CACHET_API")) > 0 { - cfg.API.URL = os.Getenv("CACHET_API") - } - if len(os.Getenv("CACHET_TOKEN")) > 0 { - cfg.API.Token = os.Getenv("CACHET_TOKEN") - } - if len(os.Getenv("CACHET_DEV")) > 0 { - logrus.SetLevel(logrus.DebugLevel) - } - - if valid := cfg.Validate(); !valid { - logrus.Errorf("Invalid configuration") - os.Exit(1) - } - - logrus.Debug("Configuration valid") - logrus.Infof("System: %s", cfg.SystemName) - logrus.Infof("API: %s", cfg.API.URL) - logrus.Infof("Monitors: %d\n", len(cfg.Monitors)) - - logrus.Infof("Pinging cachet") - if err := cfg.API.Ping(); err != nil { - logrus.Errorf("Cannot ping cachet!\n%v", err) - os.Exit(1) - } - logrus.Infof("Ping OK") - - wg := &sync.WaitGroup{} - for index, monitor := range cfg.Monitors { - logrus.Infof("Starting Monitor #%d: ", index) - logrus.Infof("Features: \n - %v", strings.Join(monitor.Describe(), "\n - ")) - - go monitor.ClockStart(cfg, monitor, wg) - } - - signals := make(chan os.Signal, 1) - signal.Notify(signals, os.Interrupt, os.Kill) - <-signals - - logrus.Warnf("Abort: Waiting monitors to finish") - for _, mon := range cfg.Monitors { - mon.GetMonitor().ClockStop() - } - - wg.Wait() -} - -func getLogger(logPath interface{}) *os.File { - if logPath == nil || len(logPath.(string)) == 0 { - return os.Stdout - } - - file, err := os.Create(logPath.(string)) - if err != nil { - logrus.Errorf("Unable to open file '%v' for logging: \n%v", logPath, err) - os.Exit(1) - } - - return file -} - -func getConfiguration(path string) (*cachet.CachetMonitor, error) { - var cfg cachet.CachetMonitor - var data []byte - - // test if its a url - url, err := url.ParseRequestURI(path) - if err == nil && len(url.Scheme) > 0 { - // download config - response, err := http.Get(path) - if err != nil { - logrus.Warn("Unable to download network configuration") - return nil, err - } - - defer response.Body.Close() - data, _ = ioutil.ReadAll(response.Body) - - logrus.Info("Downloaded network configuration.") - } else { - data, err = ioutil.ReadFile(path) - if err != nil { - return nil, errors.New("Unable to open file: '" + path + "'") - } - } - - if strings.HasSuffix(path, ".yaml") || strings.HasSuffix(path, ".yml") { - err = yaml.Unmarshal(data, &cfg) - } else { - err = json.Unmarshal(data, &cfg) - } - - if err != nil { - logrus.Warnf("Unable to parse configuration file") - } - - cfg.Monitors = make([]cachet.MonitorInterface, len(cfg.RawMonitors)) - for index, rawMonitor := range cfg.RawMonitors { - var t cachet.MonitorInterface - var err error - - // get default type - monType := cachet.GetMonitorType("") - if t, ok := rawMonitor["type"].(string); ok { - monType = cachet.GetMonitorType(t) - } - - switch monType { - case "http": - var s cachet.HTTPMonitor - err = mapstructure.Decode(rawMonitor, &s) - t = &s - case "dns": - var s cachet.DNSMonitor - err = mapstructure.Decode(rawMonitor, &s) - t = &s - default: - logrus.Errorf("Invalid monitor type (index: %d) %v", index, monType) - continue - } - - t.GetMonitor().Type = monType - - if err != nil { - logrus.Errorf("Unable to unmarshal monitor to type (index: %d): %v", index, err) - continue - } - - cfg.Monitors[index] = t - } - - return &cfg, err + Execute() } diff --git a/cli/root.go b/cli/root.go new file mode 100644 index 0000000..648a0a4 --- /dev/null +++ b/cli/root.go @@ -0,0 +1,117 @@ +package main + +import ( + "os" + "os/signal" + "strings" + "sync" + + cachet "github.com/castawaylabs/cachet-monitor" + "github.com/sirupsen/logrus" + "github.com/spf13/cobra" +) + +var cfgFile string + +// rootCmd represents the base command when called without any subcommands +var rootCmd = &cobra.Command{ + Use: "cmd", + Short: "cachet-monitor", + // Uncomment the following line if your bare application + // has an action associated with it: + Run: func(cmd *cobra.Command, args []string) { + Action(cmd, args) + }, +} + +// Execute adds all child commands to the root command and sets flags appropriately. +// This is called by main.main(). It only needs to happen once to the rootCmd. +func Execute() { + if err := rootCmd.Execute(); err != nil { + panic(err) + } +} + +func init() { + // Here you will define your flags and configuration settings. + // Cobra supports persistent flags, which, if defined here, + // will be global for your application. + pf := rootCmd.PersistentFlags() + pf.StringVarP(&cfgFile, "config", "c", "", "config file (default is $(pwd)/config.yml)") + pf.String("log", "", "log output") + pf.String("format", "text", "log format [text/json]") + pf.String("name", "", "machine name") + pf.Bool("immediate", false, "Tick immediately (by default waits for first defined") +} + +func Action(cmd *cobra.Command, args []string) { + cfg, err := cachet.New(cfgFile) + if err != nil { + logrus.Panicf("Unable to start (reading config): %v", err) + } + + if immediate, err := cmd.Flags().GetBool("immediate"); err == nil && immediate { + cfg.Immediate = immediate + } + if name, err := cmd.Flags().GetString("name"); err == nil && len(name) > 0 { + cfg.SystemName = name + } + + logrus.SetOutput(getLogger(cmd)) + if format, err := cmd.Flags().GetString("format"); err == nil && format == "json" { + logrus.SetFormatter(&logrus.JSONFormatter{}) + } + + if valid := cfg.Validate(); !valid { + logrus.Errorf("Invalid configuration") + os.Exit(1) + } + + logrus.Debug("Configuration valid") + logrus.Infof("System: %s", cfg.SystemName) + // logrus.Infof("API: %s", cfg.API.URL) + logrus.Infof("Monitors: %d", len(cfg.Monitors)) + logrus.Infof("Backend: %v", strings.Join(cfg.Backend.Describe(), "\n - ")) + + logrus.Infof("Pinging backend") + if err := cfg.Backend.Ping(); err != nil { + logrus.Errorf("Cannot ping backend!\n%v", err) + // os.Exit(1) + } + logrus.Infof("Ping OK") + logrus.Warnf("Starting!") + + wg := &sync.WaitGroup{} + for index, monitor := range cfg.Monitors { + logrus.Infof("Starting Monitor #%d: ", index) + logrus.Infof("Features: \n - %v", strings.Join(monitor.Describe(), "\n - ")) + + go monitor.Start(monitor.GetTestFunc(), wg, cfg.Backend.Tick, cfg.Immediate) + } + + signals := make(chan os.Signal, 1) + signal.Notify(signals, os.Interrupt, os.Kill) + <-signals + + logrus.Warnf("Abort: Waiting for monitors to finish") + for _, mon := range cfg.Monitors { + mon.GetMonitor().Stop() + } + + wg.Wait() +} + +func getLogger(cmd *cobra.Command) *os.File { + logPath, _ := cmd.Flags().GetString("log") + if len(logPath) == 0 { + return os.Stdout + } + + file, err := os.Create(logPath) + if err != nil { + logrus.Errorf("Unable to open file '%v' for logging: \n%v", logPath, err) + os.Exit(1) + } + + return file +} diff --git a/config.go b/config.go index af2a05c..6722f3c 100644 --- a/config.go +++ b/config.go @@ -1,22 +1,130 @@ package cachet import ( + "encoding/json" + "errors" + "io/ioutil" "net" + "net/http" + "net/url" "os" "strings" - "time" - "github.com/Sirupsen/logrus" + "github.com/castawaylabs/cachet-monitor/backends" + cachetbackend "github.com/castawaylabs/cachet-monitor/backends/cachet" + "github.com/castawaylabs/cachet-monitor/monitors" + + "github.com/mitchellh/mapstructure" + "github.com/sirupsen/logrus" + yaml "gopkg.in/yaml.v2" ) type CachetMonitor struct { - SystemName string `json:"system_name" yaml:"system_name"` - DateFormat string `json:"date_format" yaml:"date_format"` - API CachetAPI `json:"api"` RawMonitors []map[string]interface{} `json:"monitors" yaml:"monitors"` + RawBackend map[string]interface{} `json:"backend" yaml:"backend"` - Monitors []MonitorInterface `json:"-" yaml:"-"` - Immediate bool `json:"-" yaml:"-"` + SystemName string `json:"system_name" yaml:"system_name"` + Backend backends.BackendInterface `json:"-" yaml:"-"` + Monitors []monitors.MonitorInterface `json:"-" yaml:"-"` + Immediate bool `json:"-" yaml:"-"` +} + +func New(path string) (*CachetMonitor, error) { + var cfg *CachetMonitor + var data []byte + + // test if its a url + url, err := url.ParseRequestURI(path) + if err == nil && len(url.Scheme) > 0 { + // download config + response, err := http.Get(path) + if err != nil { + logrus.Warn("Unable to download network configuration") + return nil, err + } + + defer response.Body.Close() + data, _ = ioutil.ReadAll(response.Body) + + logrus.Info("Downloaded network configuration.") + } else { + data, err = ioutil.ReadFile(path) + if err != nil { + return nil, errors.New("Unable to open file: '" + path + "'") + } + } + + if strings.HasSuffix(path, ".yaml") || strings.HasSuffix(path, ".yml") { + err = yaml.Unmarshal(data, &cfg) + } else { + err = json.Unmarshal(data, &cfg) + } + + if err != nil { + logrus.Warnf("Unable to parse configuration file") + return nil, err + } + + // get default type + if backend, ok := cfg.RawBackend["type"].(string); !ok { + err = errors.New("Cannot determine backend type") + } else { + switch backend { + case "cachet": + var backend cachetbackend.CachetBackend + err = mapstructure.Decode(cfg.RawBackend, &backend) + cfg.Backend = &backend + // backend.config = cfg + default: + err = errors.New("Invalid backend type: %v" + backend) + } + } + + if errs := cfg.Backend.Validate(); len(errs) > 0 { + logrus.Errorf("Backend validation errors: %v", errs) + os.Exit(1) + } + + if err != nil { + logrus.Errorf("Unable to unmarshal backend: %v", err) + return nil, err + } + + cfg.Monitors = make([]monitors.MonitorInterface, len(cfg.RawMonitors)) + for index, rawMonitor := range cfg.RawMonitors { + var t monitors.MonitorInterface + + // get default type + monType := GetMonitorType("") + if t, ok := rawMonitor["type"].(string); ok { + monType = GetMonitorType(t) + } + + switch monType { + case "http": + var mon monitors.HTTPMonitor + err = mapstructure.Decode(rawMonitor, &mon) + t = &mon + case "dns": + var mon monitors.DNSMonitor + err = mapstructure.Decode(rawMonitor, &mon) + t = &mon + default: + logrus.Errorf("Invalid monitor type (index: %d) %v", index, monType) + continue + } + + if err != nil { + logrus.Errorf("Unable to unmarshal monitor to type (index: %d): %v", index, err) + continue + } + + mon := t.GetMonitor() + mon.Type = monType + cfg.Monitors[index] = t + } + + return cfg, err } // Validate configuration @@ -28,22 +136,13 @@ func (cfg *CachetMonitor) Validate() bool { cfg.SystemName = getHostname() } - if len(cfg.DateFormat) == 0 { - cfg.DateFormat = DefaultTimeFormat - } - - if len(cfg.API.Token) == 0 || len(cfg.API.URL) == 0 { - logrus.Warnf("API URL or API Token missing.\nGet help at https://github.com/castawaylabs/cachet-monitor") - valid = false - } - if len(cfg.Monitors) == 0 { logrus.Warnf("No monitors defined!\nSee help for example configuration") valid = false } for index, monitor := range cfg.Monitors { - if errs := monitor.Validate(); len(errs) > 0 { + if errs := monitor.Validate(cfg.Backend.ValidateMonitor); len(errs) > 0 { logrus.Warnf("Monitor validation errors (index %d): %v", index, "\n - "+strings.Join(errs, "\n - ")) valid = false } @@ -67,10 +166,6 @@ func getHostname() string { return addrs[0].String() } -func getMs() int64 { - return time.Now().UnixNano() / int64(time.Millisecond) -} - func GetMonitorType(t string) string { if len(t) == 0 { return "http" @@ -78,12 +173,3 @@ func GetMonitorType(t string) string { return strings.ToLower(t) } - -func getTemplateData(monitor *AbstractMonitor) map[string]interface{} { - return map[string]interface{}{ - "SystemName": monitor.config.SystemName, - "API": monitor.config.API, - "Monitor": monitor, - "now": time.Now().Format(monitor.config.DateFormat), - } -} diff --git a/incident.go b/incident.go deleted file mode 100644 index 597e84f..0000000 --- a/incident.go +++ /dev/null @@ -1,112 +0,0 @@ -package cachet - -import ( - "encoding/json" - "fmt" - "strconv" - - "github.com/Sirupsen/logrus" -) - -// Incident Cachet data model -type Incident struct { - ID int `json:"id"` - Name string `json:"name"` - Message string `json:"message"` - Status int `json:"status"` - Visible int `json"visible"` - Notify bool `json:"notify"` - - ComponentID int `json:"component_id"` - ComponentStatus int `json:"component_status"` -} - -// Send - Create or Update incident -func (incident *Incident) Send(cfg *CachetMonitor) error { - switch incident.Status { - case 1, 2, 3: - // partial outage - incident.ComponentStatus = 3 - - componentStatus, err := incident.GetComponentStatus(cfg) - if componentStatus == 3 { - // major outage - incident.ComponentStatus = 4 - } - - if err != nil { - logrus.Warnf("cannot fetch component: %v", err) - } - case 4: - // fixed - incident.ComponentStatus = 1 - } - - requestType := "POST" - requestURL := "/incidents" - if incident.ID > 0 { - requestType = "PUT" - requestURL += "/" + strconv.Itoa(incident.ID) - } - - jsonBytes, _ := json.Marshal(incident) - - resp, body, err := cfg.API.NewRequest(requestType, requestURL, jsonBytes) - if err != nil { - return err - } - - var data struct { - ID int `json:"id"` - } - if err := json.Unmarshal(body.Data, &data); err != nil { - return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body.Data)) - } - - incident.ID = data.ID - if resp.StatusCode != 200 { - return fmt.Errorf("Could not create/update incident!") - } - - return nil -} - -func (incident *Incident) GetComponentStatus(cfg *CachetMonitor) (int, error) { - resp, body, err := cfg.API.NewRequest("GET", "/components/"+strconv.Itoa(incident.ComponentID), nil) - if err != nil { - return 0, err - } - - if resp.StatusCode != 200 { - return 0, fmt.Errorf("Invalid status code. Received %d", resp.StatusCode) - } - - var data struct { - Status int `json:"status,string"` - } - if err := json.Unmarshal(body.Data, &data); err != nil { - return 0, fmt.Errorf("Cannot parse component body: %v. Err = %v", string(body.Data), err) - } - - return data.Status, nil -} - -// SetInvestigating sets status to Investigating -func (incident *Incident) SetInvestigating() { - incident.Status = 1 -} - -// SetIdentified sets status to Identified -func (incident *Incident) SetIdentified() { - incident.Status = 2 -} - -// SetWatching sets status to Watching -func (incident *Incident) SetWatching() { - incident.Status = 3 -} - -// SetFixed sets status to Fixed -func (incident *Incident) SetFixed() { - incident.Status = 4 -} diff --git a/make.sh b/make.sh new file mode 100755 index 0000000..95b1770 --- /dev/null +++ b/make.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +if [ "$1" == "test" ]; then + reflex -r '\.go$' -s -d none -- sh -c 'go test ./...' +fi + +reflex -r '\.go$' -s -d none -- sh -c 'go build -o ./cachet-monitor ./cli/ && ./cachet-monitor -c config.yml' +exit 0 diff --git a/monitor.go b/monitor.go deleted file mode 100644 index f7cbba6..0000000 --- a/monitor.go +++ /dev/null @@ -1,252 +0,0 @@ -package cachet - -import ( - "sync" - "time" - - "github.com/Sirupsen/logrus" -) - -const DefaultInterval = time.Second * 60 -const DefaultTimeout = time.Second -const DefaultTimeFormat = "15:04:05 Jan 2 MST" -const HistorySize = 10 - -type MonitorInterface interface { - ClockStart(*CachetMonitor, MonitorInterface, *sync.WaitGroup) - ClockStop() - tick(MonitorInterface) - test() bool - - Validate() []string - GetMonitor() *AbstractMonitor - Describe() []string -} - -// AbstractMonitor data model -type AbstractMonitor struct { - Name string - Target string - - // (default)http / dns - Type string - Strict bool - - Interval time.Duration - Timeout time.Duration - - MetricID int `mapstructure:"metric_id"` - ComponentID int `mapstructure:"component_id"` - - // Templating stuff - Template struct { - Investigating MessageTemplate - Fixed MessageTemplate - } - - // Threshold = percentage / number of down incidents - Threshold float32 - ThresholdCount bool `mapstructure:"threshold_count"` - - // lag / average(lagHistory) * 100 = percentage above average lag - // PerformanceThreshold sets the % limit above which this monitor will trigger degraded-performance - // PerformanceThreshold float32 - - history []bool - // lagHistory []float32 - lastFailReason string - incident *Incident - config *CachetMonitor - - // Closed when mon.Stop() is called - stopC chan bool -} - -func (mon *AbstractMonitor) Validate() []string { - errs := []string{} - - if len(mon.Name) == 0 { - errs = append(errs, "Name is required") - } - - if mon.Interval < 1 { - mon.Interval = DefaultInterval - } - if mon.Timeout < 1 { - mon.Timeout = DefaultTimeout - } - - if mon.Timeout > mon.Interval { - errs = append(errs, "Timeout greater than interval") - } - - if mon.ComponentID == 0 && mon.MetricID == 0 { - errs = append(errs, "component_id & metric_id are unset") - } - - if mon.Threshold <= 0 { - mon.Threshold = 100 - } - - if err := mon.Template.Fixed.Compile(); err != nil { - errs = append(errs, "Could not compile \"fixed\" template: "+err.Error()) - } - if err := mon.Template.Investigating.Compile(); err != nil { - errs = append(errs, "Could not compile \"investigating\" template: "+err.Error()) - } - - return errs -} -func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor { - return mon -} -func (mon *AbstractMonitor) Describe() []string { - features := []string{"Type: " + mon.Type} - - if len(mon.Name) > 0 { - features = append(features, "Name: "+mon.Name) - } - - return features -} - -func (mon *AbstractMonitor) ClockStart(cfg *CachetMonitor, iface MonitorInterface, wg *sync.WaitGroup) { - wg.Add(1) - mon.config = cfg - mon.stopC = make(chan bool) - if cfg.Immediate { - mon.tick(iface) - } - - ticker := time.NewTicker(mon.Interval * time.Second) - for { - select { - case <-ticker.C: - mon.tick(iface) - case <-mon.stopC: - wg.Done() - return - } - } -} - -func (mon *AbstractMonitor) ClockStop() { - select { - case <-mon.stopC: - return - default: - close(mon.stopC) - } -} - -func (mon *AbstractMonitor) test() bool { return false } - -func (mon *AbstractMonitor) tick(iface MonitorInterface) { - reqStart := getMs() - up := iface.test() - lag := getMs() - reqStart - - histSize := HistorySize - if mon.ThresholdCount { - histSize = int(mon.Threshold) - } - - if len(mon.history) == histSize-1 { - logrus.Warnf("%v is now saturated", mon.Name) - } - if len(mon.history) >= histSize { - mon.history = mon.history[len(mon.history)-(histSize-1):] - } - mon.history = append(mon.history, up) - mon.AnalyseData() - - // report lag - if mon.MetricID > 0 { - go mon.config.API.SendMetric(mon.MetricID, lag) - } -} - -// TODO: test -// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident -func (mon *AbstractMonitor) AnalyseData() { - // look at the past few incidents - numDown := 0 - for _, wasUp := range mon.history { - if wasUp == false { - numDown++ - } - } - - t := (float32(numDown) / float32(len(mon.history))) * 100 - l := logrus.WithFields(logrus.Fields{ - "monitor": mon.Name, - "time": time.Now().Format(mon.config.DateFormat), - }) - if numDown == 0 { - l.Printf("monitor is up") - } else if mon.ThresholdCount { - l.Printf("monitor down %d/%d", numDown, int(mon.Threshold)) - } else { - l.Printf("monitor down %.2f%%/%.2f%%", t, mon.Threshold) - } - - histSize := HistorySize - if mon.ThresholdCount { - histSize = int(mon.Threshold) - } - - if len(mon.history) != histSize { - // not saturated - return - } - - triggered := (mon.ThresholdCount && numDown == int(mon.Threshold)) || (!mon.ThresholdCount && t > mon.Threshold) - - if triggered && mon.incident == nil { - // create incident - tplData := getTemplateData(mon) - tplData["FailReason"] = mon.lastFailReason - - subject, message := mon.Template.Investigating.Exec(tplData) - mon.incident = &Incident{ - Name: subject, - ComponentID: mon.ComponentID, - Message: message, - Notify: true, - } - - // is down, create an incident - l.Warnf("creating incident. Monitor is down: %v", mon.lastFailReason) - // set investigating status - mon.incident.SetInvestigating() - // create/update incident - if err := mon.incident.Send(mon.config); err != nil { - l.Printf("Error sending incident: %v", err) - } - - return - } - - // still triggered or no incident - if triggered || mon.incident == nil { - return - } - - // was down, created an incident, its now ok, make it resolved. - l.Warn("Resolving incident") - - // resolve incident - tplData := getTemplateData(mon) - tplData["incident"] = mon.incident - - subject, message := mon.Template.Fixed.Exec(tplData) - mon.incident.Name = subject - mon.incident.Message = message - mon.incident.SetFixed() - if err := mon.incident.Send(mon.config); err != nil { - l.Printf("Error sending incident: %v", err) - } - - mon.lastFailReason = "" - mon.incident = nil -} diff --git a/monitor_test.go b/monitor_test.go deleted file mode 100644 index c75df01..0000000 --- a/monitor_test.go +++ /dev/null @@ -1,7 +0,0 @@ -package cachet - -import ( - "testing" -) - -func TestAnalyseData(t *testing.T) {} diff --git a/dns.go b/monitors/dns.go similarity index 67% rename from dns.go rename to monitors/dns.go index 6fe9372..9d55b91 100644 --- a/dns.go +++ b/monitors/dns.go @@ -1,14 +1,33 @@ -package cachet +package monitors import ( + "errors" "net" "regexp" "strings" - "github.com/Sirupsen/logrus" "github.com/miekg/dns" + "github.com/sirupsen/logrus" ) +// Investigating template +var defaultDNSInvestigatingTpl = MessageTemplate{ + Subject: `{{ .Monitor.Name }} - {{ .SystemName }}`, + Message: `{{ .Monitor.Name }} DNS check **failed** (server time: {{ .now }}) + +{{ .FailReason }}`, +} + +// Fixed template +var defaultDNSFixedTpl = MessageTemplate{ + Subject: `{{ .Monitor.Name }} - {{ .SystemName }}`, + Message: `**Resolved** - {{ .now }} + +- - - + +{{ .incident.Message }}`, +} + type DNSAnswer struct { Regex string regexp *regexp.Regexp @@ -28,8 +47,11 @@ type DNSMonitor struct { Answers []DNSAnswer } -func (monitor *DNSMonitor) Validate() []string { - errs := monitor.AbstractMonitor.Validate() +func (monitor *DNSMonitor) Validate(validate backendValidateFunc) []string { + monitor.Template.Investigating.SetDefault(defaultDNSInvestigatingTpl) + monitor.Template.Fixed.SetDefault(defaultDNSFixedTpl) + + errs := monitor.AbstractMonitor.Validate(validate) if len(monitor.DNS) == 0 { config, _ := dns.ClientConfigFromFile("/etc/resolv.conf") @@ -61,7 +83,7 @@ func (monitor *DNSMonitor) Validate() []string { return errs } -func (monitor *DNSMonitor) test() bool { +func (monitor *DNSMonitor) test() (bool, []error) { m := new(dns.Msg) m.SetQuestion(dns.Fqdn(monitor.Target), monitor.question) m.RecursionDesired = true @@ -70,11 +92,11 @@ func (monitor *DNSMonitor) test() bool { r, _, err := c.Exchange(m, monitor.DNS) if err != nil { logrus.Warnf("DNS error: %v", err) - return false + return false, []error{err} } if r.Rcode != dns.RcodeSuccess { - return false + return false, []error{errors.New("Invalid status code returned")} } for _, check := range monitor.Answers { @@ -88,11 +110,11 @@ func (monitor *DNSMonitor) test() bool { if !found { logrus.Warnf("DNS check failed: %v. Not found in any of %v", check, r.Answer) - return false + return false, []error{errors.New("Record not found")} } } - return true + return true, nil } func findDNSType(t string) uint16 { diff --git a/http.go b/monitors/http.go similarity index 78% rename from http.go rename to monitors/http.go index 27c8b3a..8b7ee37 100644 --- a/http.go +++ b/monitors/http.go @@ -1,7 +1,8 @@ -package cachet +package monitors import ( "crypto/tls" + "errors" "io/ioutil" "net/http" "regexp" @@ -13,7 +14,7 @@ import ( // Investigating template var defaultHTTPInvestigatingTpl = MessageTemplate{ Subject: `{{ .Monitor.Name }} - {{ .SystemName }}`, - Message: `{{ .Monitor.Name }} check **failed** (server time: {{ .now }}) + Message: `{{ .Monitor.Name }} HTTP check **failed** (server time: {{ .now }}) {{ .FailReason }}`, } @@ -41,7 +42,7 @@ type HTTPMonitor struct { } // TODO: test -func (monitor *HTTPMonitor) test() bool { +func (monitor *HTTPMonitor) test() (bool, []error) { req, err := http.NewRequest(monitor.Method, monitor.Target, nil) for k, v := range monitor.Headers { req.Header.Add(k, v) @@ -56,40 +57,38 @@ func (monitor *HTTPMonitor) test() bool { resp, err := client.Do(req) if err != nil { - monitor.lastFailReason = err.Error() - return false + return false, []error{err} } defer resp.Body.Close() if monitor.ExpectedStatusCode > 0 && resp.StatusCode != monitor.ExpectedStatusCode { - monitor.lastFailReason = "Expected HTTP response status: " + strconv.Itoa(monitor.ExpectedStatusCode) + ", got: " + strconv.Itoa(resp.StatusCode) - return false + fail := "Expected HTTP response status: " + strconv.Itoa(monitor.ExpectedStatusCode) + ", got: " + strconv.Itoa(resp.StatusCode) + return false, []error{errors.New(fail)} } if monitor.bodyRegexp != nil { // check response body responseBody, err := ioutil.ReadAll(resp.Body) if err != nil { - monitor.lastFailReason = err.Error() - return false + return false, []error{err} } if !monitor.bodyRegexp.Match(responseBody) { - monitor.lastFailReason = "Unexpected body: " + string(responseBody) + ".\nExpected to match: " + monitor.ExpectedBody - return false + fail := "Unexpected body: " + string(responseBody) + ".\nExpected to match: " + monitor.ExpectedBody + return false, []error{errors.New(fail)} } } - return true + return true, nil } // TODO: test -func (mon *HTTPMonitor) Validate() []string { +func (mon *HTTPMonitor) Validate(validate backendValidateFunc) []string { mon.Template.Investigating.SetDefault(defaultHTTPInvestigatingTpl) mon.Template.Fixed.SetDefault(defaultHTTPFixedTpl) - errs := mon.AbstractMonitor.Validate() + errs := mon.AbstractMonitor.Validate(validate) if len(mon.ExpectedBody) > 0 { exp, err := regexp.Compile(mon.ExpectedBody) diff --git a/monitors/monitor.go b/monitors/monitor.go new file mode 100644 index 0000000..fbd5442 --- /dev/null +++ b/monitors/monitor.go @@ -0,0 +1,257 @@ +package monitors + +import ( + "sync" + "time" + + "github.com/sirupsen/logrus" +) + +const DefaultInterval = time.Second * 60 +const DefaultTimeout = time.Second +const HistorySize = 10 + +type MonitorStatus string + +const ( + MonitorStatusUp = "up" + MonitorStatusDown = "down" + MonitorStatusNotSaturated = "unsaturated" +) + +type backendValidateFunc = func(monitor *AbstractMonitor) []string +type MonitorTestFunc func() (up bool, errs []error) +type MonitorTickFunc func(monitor MonitorInterface, status MonitorStatus, errs []error, lag int64) + +type MonitorInterface interface { + Start(MonitorTestFunc, *sync.WaitGroup, MonitorTickFunc, bool) + Stop() + + tick(MonitorTestFunc) (status MonitorStatus, errors []error, lag int64) + test() (bool, []error) + + Validate(validate backendValidateFunc) []string + Describe() []string + + GetMonitor() *AbstractMonitor + GetTestFunc() MonitorTestFunc + GetLastStatus() MonitorStatus + UpdateLastStatus(status MonitorStatus) (old MonitorStatus) +} + +// AbstractMonitor data model +type AbstractMonitor struct { + Name string + Target string + + // (default)http / dns + Type string + Strict bool + + Interval time.Duration + Timeout time.Duration + Params map[string]interface{} + + // Templating stuff + Template MonitorTemplates + + // Threshold = percentage / number of down incidents + Threshold float32 + ThresholdCount bool `mapstructure:"threshold_count"` + + // lag / average(lagHistory) * 100 = percentage above average lag + // PerformanceThreshold sets the % limit above which this monitor will trigger degraded-performance + // PerformanceThreshold float32 + + history []bool + lastStatus MonitorStatus + + // Closed when mon.Stop() is called + stopC chan bool +} + +func (mon *AbstractMonitor) Validate(validate backendValidateFunc) []string { + errs := []string{} + + if len(mon.Name) == 0 { + errs = append(errs, "Name is required") + } + + if mon.Interval < 1 { + mon.Interval = DefaultInterval + } + if mon.Timeout < 1 { + mon.Timeout = DefaultTimeout + } + + if mon.Timeout > mon.Interval { + errs = append(errs, "Timeout greater than interval") + } + + // get the backend to validate the monitor + errs = append(errs, validate(mon)...) + + if mon.Threshold <= 0 { + mon.Threshold = 100 + } + + // if len(mon.Template.Fixed.Message) == 0 || len(mon.Template.Fixed.Subject) == 0 { + // errs = append(errs, "\"fixed\" template empty/missing") + // } + // if len(mon.Template.Investigating.Message) == 0 || len(mon.Template.Investigating.Subject) == 0 { + // errs = append(errs, "\"investigating\" template empty/missing") + // } + if err := mon.Template.Fixed.Compile(); err != nil { + errs = append(errs, "Could not compile \"fixed\" template: "+err.Error()) + } + if err := mon.Template.Investigating.Compile(); err != nil { + errs = append(errs, "Could not compile \"investigating\" template: "+err.Error()) + } + + return errs +} + +func (mon *AbstractMonitor) GetMonitor() *AbstractMonitor { + return mon +} + +func (mon *AbstractMonitor) Describe() []string { + features := []string{"Type: " + mon.Type} + + if len(mon.Name) > 0 { + features = append(features, "Name: "+mon.Name) + } + + return features +} + +func (mon *AbstractMonitor) Start(testFunc MonitorTestFunc, wg *sync.WaitGroup, tickFunc MonitorTickFunc, immediate bool) { + wg.Add(1) + + mon.stopC = make(chan bool) + if immediate { + status, errs, lag := mon.tick(testFunc) + tickFunc(mon, status, errs, lag) + } + + ticker := time.NewTicker(mon.Interval * time.Second) + for { + select { + case <-ticker.C: + status, errs, lag := mon.tick(testFunc) + tickFunc(mon, status, errs, lag) + case <-mon.stopC: + wg.Done() + return + } + } +} + +func (mon *AbstractMonitor) Stop() { + select { + case <-mon.stopC: + return + default: + close(mon.stopC) + } +} + +func (mon *AbstractMonitor) tick(testFunc MonitorTestFunc) (status MonitorStatus, errors []error, lag int64) { + reqStart := getMs() + up, errs := testFunc() + lag = getMs() - reqStart + + histSize := HistorySize + if mon.ThresholdCount { + histSize = int(mon.Threshold) + } + + if len(mon.history) == histSize-1 { + logrus.WithFields(logrus.Fields{ + "monitor": mon.Name, + }).Warn("monitor saturated") + } + if len(mon.history) >= histSize { + mon.history = mon.history[len(mon.history)-(histSize-1):] + } + mon.history = append(mon.history, up) + status = mon.GetStatus() + errors = errs + + return +} + +// TODO: test +// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident +func (mon *AbstractMonitor) GetStatus() MonitorStatus { + numDown := 0 + for _, wasUp := range mon.history { + if wasUp == false { + numDown++ + } + } + + t := (float32(numDown) / float32(len(mon.history))) * 100 + logFields := logrus.Fields{"monitor": mon.Name} + // stop reporting time for jsonformatter, it's there by default + if _, ok := logrus.StandardLogger().Formatter.(*logrus.JSONFormatter); !ok { + logFields["t"] = time.Now() + } + l := logrus.WithFields(logFields) + + symbol := "⚠️" + if t == 100 { + symbol = "❌" + } + if numDown == 0 { + l.Printf("👍 up") + } else if mon.ThresholdCount { + l.Printf("%v down (%d/%d)", symbol, numDown, int(mon.Threshold)) + } else { + l.Printf("%v down %.0f%%/%.0f%%", symbol, t, mon.Threshold) + } + + histSize := HistorySize + if mon.ThresholdCount { + histSize = int(mon.Threshold) + } + + if len(mon.history) != histSize { + // not saturated + return MonitorStatusNotSaturated + } + + var down bool + if mon.ThresholdCount { + down = numDown >= int(mon.Threshold) + } else { + down = t >= mon.Threshold + } + + if !down { + return MonitorStatusUp + } + + return MonitorStatusDown +} + +func (mon *AbstractMonitor) GetTestFunc() MonitorTestFunc { + return mon.test +} + +func (mon *AbstractMonitor) GetLastStatus() MonitorStatus { + return mon.lastStatus +} + +func (mon *AbstractMonitor) UpdateLastStatus(status MonitorStatus) (old MonitorStatus) { + old = mon.lastStatus + mon.lastStatus = status + + return +} + +func (mon *AbstractMonitor) test() (bool, []error) { return false, nil } + +func getMs() int64 { + return time.Now().UnixNano() / int64(time.Millisecond) +} diff --git a/template.go b/monitors/template.go similarity index 90% rename from template.go rename to monitors/template.go index 495796a..2965bbb 100644 --- a/template.go +++ b/monitors/template.go @@ -1,10 +1,15 @@ -package cachet +package monitors import ( "bytes" "text/template" ) +type MonitorTemplates struct { + Investigating MessageTemplate + Fixed MessageTemplate +} + type MessageTemplate struct { Subject string `json:"subject"` Message string `json:"message"`