diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2ec9320 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +gin-bin +example.config.local.json \ No newline at end of file diff --git a/cli/main.go b/cli/main.go new file mode 100644 index 0000000..4028b4d --- /dev/null +++ b/cli/main.go @@ -0,0 +1,123 @@ +package main + +import ( + "encoding/json" + "errors" + "flag" + "fmt" + "io/ioutil" + "log" + "net/http" + "net/url" + "os" + "os/signal" + "sync" + + cachet "github.com/castawaylabs/cachet-monitor" +) + +var configPath string +var systemName string +var logPath string + +func main() { + flag.StringVar(&configPath, "c", "/etc/cachet-monitor.config.json", "Config path") + flag.StringVar(&systemName, "name", "", "System Name") + flag.StringVar(&logPath, "log", "", "Log path") + flag.Parse() + + cfg, err := getConfiguration(configPath) + if err != nil { + panic(err) + } + + if len(systemName) > 0 { + cfg.SystemName = systemName + } + if len(logPath) > 0 { + cfg.LogPath = logPath + } + + if len(os.Getenv("CACHET_API")) > 0 { + cfg.APIUrl = os.Getenv("CACHET_API") + } + if len(os.Getenv("CACHET_TOKEN")) > 0 { + cfg.APIToken = os.Getenv("CACHET_TOKEN") + } + + if err := cfg.ValidateConfiguration(); err != nil { + panic(err) + } + + cfg.Logger.Printf("System: %s\nAPI: %s\nMonitors: %d\n\n", cfg.SystemName, cfg.APIUrl, len(cfg.Monitors)) + + wg := &sync.WaitGroup{} + for _, mon := range cfg.Monitors { + go mon.Start(cfg, wg) + } + + signals := make(chan os.Signal, 1) + signal.Notify(signals, os.Interrupt, os.Kill) + <-signals + + cfg.Logger.Println("Abort: Waiting monitors to finish") + for _, mon := range cfg.Monitors { + mon.Stop() + } + + wg.Wait() +} + +func getLogger(logPath string) *log.Logger { + var logWriter = os.Stdout + var err error + + if len(logPath) > 0 { + logWriter, err = os.Create(logPath) + if err != nil { + fmt.Printf("Unable to open file '%v' for logging\n", logPath) + os.Exit(1) + } + } + + flags := log.Llongfile | log.Ldate | 
log.Ltime + if len(os.Getenv("CACHET_DEV")) > 0 { + flags = 0 + } + + return log.New(logWriter, "", flags) +} + +func getConfiguration(path string) (*cachet.CachetMonitor, error) { + var cfg cachet.CachetMonitor + var data []byte + + // test if its a url + url, err := url.ParseRequestURI(path) + if err == nil && len(url.Scheme) > 0 { + // download config + response, err := http.Get(path) + if err != nil { + return nil, errors.New("Cannot download network config: " + err.Error()) + } + + defer response.Body.Close() + data, _ = ioutil.ReadAll(response.Body) + + fmt.Println("Downloaded network configuration.") + } else { + data, err = ioutil.ReadFile(path) + if err != nil { + return nil, errors.New("Config file '" + path + "' missing!") + } + } + + if err := json.Unmarshal(data, &cfg); err != nil { + fmt.Println(err) + return nil, errors.New("Cannot parse config!") + } + + cfg.Logger = getLogger(cfg.LogPath) + + return &cfg, nil +} diff --git a/config.go b/config.go new file mode 100644 index 0000000..8a12419 --- /dev/null +++ b/config.go @@ -0,0 +1,65 @@ +package cachet + +import ( + "errors" + "log" + "net" + "os" +) + +type CachetMonitor struct { + Logger *log.Logger `json:"-"` + + APIUrl string `json:"api_url"` + APIToken string `json:"api_token"` + SystemName string `json:"system_name"` + LogPath string `json:"log_path"` + InsecureAPI bool `json:"insecure_api"` + + Monitors []*Monitor `json:"monitors"` +} + +func (cfg *CachetMonitor) ValidateConfiguration() error { + if cfg.Logger == nil { + cfg.Logger = log.New(os.Stdout, "", log.Llongfile|log.Ldate|log.Ltime) + } + + if len(cfg.SystemName) == 0 { + // get hostname + cfg.SystemName = getHostname() + } + + if len(cfg.APIToken) == 0 || len(cfg.APIUrl) == 0 { + return errors.New("API URL or API Token not set. 
// getHostname returns an identifier for the current system.
//
// The OS hostname is preferred. When it cannot be determined (lookup error or
// empty name) the first network interface address is used instead. If that
// also fails, or the machine reports no addresses, the literal "unknown" is
// returned, so the result is never empty.
func getHostname() string {
	if hostname, err := os.Hostname(); err == nil && len(hostname) > 0 {
		return hostname
	}

	addrs, err := net.InterfaceAddrs()
	if err != nil || len(addrs) == 0 {
		return "unknown"
	}

	return addrs[0].String()
}
// Incident is the Cachet incident data model as sent to and received from the
// Cachet API.
type Incident struct {
	ID      int    `json:"id"`
	Name    string `json:"name"`
	Message string `json:"message"`
	// Status is set through SetInvestigating (1), SetIdentified (2),
	// SetWatching (3) and SetFixed (4).
	Status  int  `json:"status"`
	Visible int  `json:"visible"`
	Notify  bool `json:"notify"`

	ComponentID int `json:"component_id"`
	// ComponentStatus is derived from Status by Send before the incident is
	// submitted.
	ComponentStatus int `json:"component_status"`
}
json.Marshal(incident) + + resp, body, err := cfg.makeRequest(requestType, requestURL, jsonBytes) + if err != nil { + return err + } + + var data struct { + Incident struct { + ID int `json:"id"` + } `json:"data"` + } + if err := json.Unmarshal(body, &data); err != nil { + return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body)) + } + + incident.ID = data.Incident.ID + if resp.StatusCode != 200 { + return fmt.Errorf("Could not create/update incident!") + } + + return nil +} + +func (incident *Incident) GetComponentStatus(cfg *CachetMonitor) (int, error) { + resp, body, err := cfg.makeRequest("GET", "/components/"+strconv.Itoa(incident.ComponentID), nil) + if err != nil { + return 0, err + } + + if resp.StatusCode != 200 { + return 0, fmt.Errorf("Invalid status code. Received %d", resp.StatusCode) + } + + var data struct { + Component struct { + Status int `json:"status"` + } `json:"data"` + } + if err := json.Unmarshal(body, &data); err != nil { + return 0, fmt.Errorf("Cannot parse component body: %v. 
Err = %v", string(body), err) + } + + return data.Component.Status, nil +} + +// SetInvestigating sets status to Investigating +func (incident *Incident) SetInvestigating() { + incident.Status = 1 +} + +// SetIdentified sets status to Identified +func (incident *Incident) SetIdentified() { + incident.Status = 2 +} + +// SetWatching sets status to Watching +func (incident *Incident) SetWatching() { + incident.Status = 3 +} + +// SetFixed sets status to Fixed +func (incident *Incident) SetFixed() { + incident.Status = 4 +} diff --git a/monitor.go b/monitor.go new file mode 100644 index 0000000..fbe7927 --- /dev/null +++ b/monitor.go @@ -0,0 +1,252 @@ +package cachet + +import ( + "crypto/tls" + "errors" + "fmt" + "io/ioutil" + "net/http" + "regexp" + "strconv" + "strings" + "sync" + "time" +) + +const HttpTimeout = time.Duration(time.Second) +const DefaultInterval = 60 +const DefaultTimeFormat = "15:04:05 Jan 2 MST" + +// Monitor data model +type Monitor struct { + Name string `json:"name"` + URL string `json:"url"` + Method string `json:"method"` + StrictTLS bool `json:"strict_tls"` + CheckInterval time.Duration `json:"interval"` + + MetricID int `json:"metric_id"` + ComponentID int `json:"component_id"` + + // Threshold = percentage + Threshold float32 `json:"threshold"` + ExpectedStatusCode int `json:"expected_status_code"` + // compiled to Regexp + ExpectedBody string `json:"expected_body"` + bodyRegexp *regexp.Regexp + + history []bool + lastFailReason string + incident *Incident + config *CachetMonitor + + // Closed when mon.Stop() is called + stopC chan bool +} + +func (mon *Monitor) Start(cfg *CachetMonitor, wg *sync.WaitGroup) { + wg.Add(1) + mon.config = cfg + mon.stopC = make(chan bool) + + mon.config.Logger.Printf(" Starting %s: %d seconds check interval\n - %v %s", mon.Name, mon.CheckInterval, mon.Method, mon.URL) + + // print features + if mon.ExpectedStatusCode > 0 { + mon.config.Logger.Printf(" - Expect HTTP %d", mon.ExpectedStatusCode) + } + if 
len(mon.ExpectedBody) > 0 { + mon.config.Logger.Printf(" - Expect Body to match \"%v\"", mon.ExpectedBody) + } + if mon.MetricID > 0 { + mon.config.Logger.Printf(" - Log lag to metric id %d\n", mon.MetricID) + } + if mon.ComponentID > 0 { + mon.config.Logger.Printf(" - Update component id %d\n\n", mon.ComponentID) + } + + mon.Tick() + + ticker := time.NewTicker(mon.CheckInterval * time.Second) + for { + select { + case <-ticker.C: + mon.Tick() + case <-mon.stopC: + wg.Done() + return + } + } +} + +func (monitor *Monitor) Stop() { + if monitor.Stopped() { + return + } + + close(monitor.stopC) +} + +func (monitor *Monitor) Stopped() bool { + select { + case <-monitor.stopC: + return true + default: + return false + } +} + +func (monitor *Monitor) Tick() { + reqStart := getMs() + isUp := monitor.doRequest() + lag := getMs() - reqStart + + if len(monitor.history) == 9 { + monitor.config.Logger.Printf("%v is now saturated\n", monitor.Name) + } + if len(monitor.history) >= 10 { + monitor.history = monitor.history[len(monitor.history)-9:] + } + monitor.history = append(monitor.history, isUp) + monitor.AnalyseData() + + if isUp == true && monitor.MetricID > 0 { + monitor.SendMetric(lag) + } +} + +func (monitor *Monitor) doRequest() bool { + client := &http.Client{ + Timeout: HttpTimeout, + } + if monitor.StrictTLS == false { + client.Transport = &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + } + } + + resp, err := client.Get(monitor.URL) + if err != nil { + monitor.lastFailReason = err.Error() + + return false + } + + defer resp.Body.Close() + + if monitor.ExpectedStatusCode > 0 && resp.StatusCode != monitor.ExpectedStatusCode { + monitor.lastFailReason = "Unexpected response code: " + strconv.Itoa(resp.StatusCode) + ". 
Expected " + strconv.Itoa(monitor.ExpectedStatusCode) + + return false + } + + if monitor.bodyRegexp != nil { + // check body + responseBody, err := ioutil.ReadAll(resp.Body) + if err != nil { + monitor.lastFailReason = err.Error() + + return false + } + + match := monitor.bodyRegexp.Match(responseBody) + if !match { + monitor.lastFailReason = "Unexpected body: " + string(responseBody) + ". Expected to match " + monitor.ExpectedBody + } + + return match + } + + return true +} + +// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident +func (monitor *Monitor) AnalyseData() { + // look at the past few incidents + numDown := 0 + for _, wasUp := range monitor.history { + if wasUp == false { + numDown++ + } + } + + t := (float32(numDown) / float32(len(monitor.history))) * 100 + monitor.config.Logger.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().UnixNano()/int64(time.Second)) + + if len(monitor.history) != 10 { + // not saturated + return + } + + if t > monitor.Threshold && monitor.incident == nil { + monitor.incident = &Incident{ + Name: monitor.Name + " - " + monitor.config.SystemName, + ComponentID: monitor.ComponentID, + Message: monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat), + Notify: true, + } + + if len(monitor.lastFailReason) > 0 { + monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`" + } + + // is down, create an incident + monitor.config.Logger.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason) + // set investigating status + monitor.incident.SetInvestigating() + // create/update incident + if err := monitor.incident.Send(monitor.config); err != nil { + monitor.config.Logger.Printf("Error sending incident: %v\n", err) + } + } else if t < monitor.Threshold && monitor.incident != nil { + // was down, created an incident, its now ok, make it resolved. 
+ monitor.config.Logger.Printf("%v resolved downtime incident", monitor.Name) + + // resolve incident + monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message + monitor.incident.SetFixed() + monitor.incident.Send(monitor.config) + + monitor.lastFailReason = "" + monitor.incident = nil + } +} + +func (monitor *Monitor) ValidateConfiguration() error { + if len(monitor.ExpectedBody) > 0 { + exp, err := regexp.Compile(monitor.ExpectedBody) + if err != nil { + return err + } + + monitor.bodyRegexp = exp + } + + if len(monitor.ExpectedBody) == 0 && monitor.ExpectedStatusCode == 0 { + return errors.New("Nothing to check, both 'expected_body' and 'expected_status_code' fields empty") + } + + if monitor.CheckInterval < 1 { + monitor.CheckInterval = DefaultInterval + } + + monitor.Method = strings.ToUpper(monitor.Method) + switch monitor.Method { + case "GET", "POST", "DELETE", "OPTIONS", "HEAD": + break + case "": + monitor.Method = "GET" + default: + return fmt.Errorf("Unsupported check method: %v", monitor.Method) + } + + if monitor.ComponentID == 0 && monitor.MetricID == 0 { + return errors.New("component_id & metric_id are unset") + } + + if monitor.Threshold <= 0 { + monitor.Threshold = 100 + } + + return nil +} diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..703696b --- /dev/null +++ b/readme.md @@ -0,0 +1,91 @@ +![screenshot](https://castawaylabs.github.io/cachet-monitor/screenshot.png) + +Features +-------- + +- [x] Creates & Resolves Incidents +- [x] Posts monitor lag to cachet graphs +- [x] Updates Component to Partial Outage +- [x] Updates Component to Major Outage if already in Partial Outage (works well with distributed monitoring) +- [x] Can be run on multiple servers and geo regions + +Configuration +------------- + +``` +{ + // URL for the API. 
Note: Must end with /api/v1 + "api_url": "https://<cachet domain>/api/v1", + // Your API token for Cachet + "api_token": "", + // optional, false default, set if your certificate is self-signed/untrusted + "insecure_api": false, + "monitors": [{ + // required, friendly name for your monitor + "name": "Name of your monitor", + // required, url to probe + "url": "Ping URL", + // optional, http method (defaults GET) + "method": "get", + // verify the TLS certificate of the probed url; set to false for a self-signed ssl certificate + "strict_tls": true, + // seconds between checks + "interval": 10, + // post lag to cachet metric (graph) + // note: either metric ID or component ID is required + "metric_id": <metric id>, + // post incidents to this component + "component_id": <component id>, + // If % of downtime is over this threshold, open an incident + "threshold": 80, + // optional, expected status code (either status code or body must be supplied) + "expected_status_code": 200, + // optional, regular expression to match body content + "expected_body": "P.*NG" + }], + // optional, system name to identify bot (uses hostname by default) + "system_name": "", + // optional, defaults to stdout + "log_path": "" +} +``` + +Installation +------------ + +1. Download binary from [release page](https://github.com/CastawayLabs/cachet-monitor/releases) +2. Create your configuration ([example](https://raw.githubusercontent.com/CastawayLabs/cachet-monitor/master/example.config.json)) +3. 
`cachet-monitor -c /etc/cachet-monitor.config.json` + +pro tip: run in background using `nohup cachet-monitor > /var/log/cachet-monitor.log 2>&1 &` + +``` +Usage of cachet-monitor: + -c="/etc/cachet-monitor.config.json": Config path + -log="": Log path + -name="": System Name +``` + +Environment variables +--------------------- + +| Name | Example Value | Description | +| ------------ | ------------------------------ | --------------------------- | +| CACHET_API | http://demo.cachethq.io/api/v1 | URL endpoint for cachet api | +| CACHET_TOKEN | APIToken123 | API Authentication token | +| CACHET_DEV | 1 | Strips the date, time and file prefix from log lines | + +Vision and goals +---------------- + +We made this tool because we felt the need to have our own monitoring software (leveraging Cachet). +The idea is a stateless program which collects data and pushes it to a central cachet instance. + +This gives us the power to have an army of geographically distributed loggers and reveal issues in both latency & downtime on client websites. + +Package usage +------------- + +When using `cachet-monitor` as a package in another program, you should follow what `cli/main.go` does. It is important to call `ValidateConfiguration` on `CachetMonitor`, which also validates all the monitors inside. + +[API Documentation](https://godoc.org/github.com/CastawayLabs/cachet-monitor)