Merge branch 'master' into gh-pages

* master: (40 commits)
  Add to readme, remove comment
  Update documentation
  More config options, markdown in incidents
  Update readme
  refactoring
  Vision readme
  better packaging
  Update go install guide
  update readme
  Implement 'inteval' config parameter -> number of seconds between checks
  Fix configuration example in README
  Update documentation for api url
  Fix v1 api url
  Add screenshot to readme
  Report new incident and set as fixed
  Update interval to 5 secs
  Update example config with default values
  Improve fail reasons, fix api crashes
  Add time interval
  Improve readme, Dockerfile
  ...
This commit is contained in:
Matej Kramny
2016-05-19 19:30:49 +01:00
8 changed files with 722 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
gin-bin
example.config.local.json

123
cli/main.go Normal file
View File

@@ -0,0 +1,123 @@
package main
import (
"encoding/json"
"errors"
"flag"
"fmt"
"io/ioutil"
"log"
"net/http"
"net/url"
"os"
"os/signal"
"sync"
cachet "github.com/castawaylabs/cachet-monitor"
)
var configPath string
var systemName string
var logPath string
func main() {
flag.StringVar(&configPath, "c", "/etc/cachet-monitor.config.json", "Config path")
flag.StringVar(&systemName, "name", "", "System Name")
flag.StringVar(&logPath, "log", "", "Log path")
flag.Parse()
cfg, err := getConfiguration(configPath)
if err != nil {
panic(err)
}
if len(systemName) > 0 {
cfg.SystemName = systemName
}
if len(logPath) > 0 {
cfg.LogPath = logPath
}
if len(os.Getenv("CACHET_API")) > 0 {
cfg.APIUrl = os.Getenv("CACHET_API")
}
if len(os.Getenv("CACHET_TOKEN")) > 0 {
cfg.APIToken = os.Getenv("CACHET_TOKEN")
}
if err := cfg.ValidateConfiguration(); err != nil {
panic(err)
}
cfg.Logger.Printf("System: %s\nAPI: %s\nMonitors: %d\n\n", cfg.SystemName, cfg.APIUrl, len(cfg.Monitors))
wg := &sync.WaitGroup{}
for _, mon := range cfg.Monitors {
go mon.Start(cfg, wg)
}
signals := make(chan os.Signal, 1)
signal.Notify(signals, os.Interrupt, os.Kill)
<-signals
cfg.Logger.Println("Abort: Waiting monitors to finish")
for _, mon := range cfg.Monitors {
mon.Stop()
}
wg.Wait()
}
func getLogger(logPath string) *log.Logger {
var logWriter = os.Stdout
var err error
if len(logPath) > 0 {
logWriter, err = os.Create(logPath)
if err != nil {
fmt.Printf("Unable to open file '%v' for logging\n", logPath)
os.Exit(1)
}
}
flags := log.Llongfile | log.Ldate | log.Ltime
if len(os.Getenv("CACHET_DEV")) > 0 {
flags = 0
}
return log.New(logWriter, "", flags)
}
func getConfiguration(path string) (*cachet.CachetMonitor, error) {
var cfg cachet.CachetMonitor
var data []byte
// test if its a url
url, err := url.ParseRequestURI(path)
if err == nil && len(url.Scheme) > 0 {
// download config
response, err := http.Get(path)
if err != nil {
return nil, errors.New("Cannot download network config: " + err.Error())
}
defer response.Body.Close()
data, _ = ioutil.ReadAll(response.Body)
fmt.Println("Downloaded network configuration.")
} else {
data, err = ioutil.ReadFile(path)
if err != nil {
return nil, errors.New("Config file '" + path + "' missing!")
}
}
if err := json.Unmarshal(data, &cfg); err != nil {
fmt.Println(err)
return nil, errors.New("Cannot parse config!")
}
cfg.Logger = getLogger(cfg.LogPath)
return &cfg, nil
}

65
config.go Normal file
View File

@@ -0,0 +1,65 @@
package cachet
import (
"errors"
"log"
"net"
"os"
)
type CachetMonitor struct {
Logger *log.Logger `json:"-"`
APIUrl string `json:"api_url"`
APIToken string `json:"api_token"`
SystemName string `json:"system_name"`
LogPath string `json:"log_path"`
InsecureAPI bool `json:"insecure_api"`
Monitors []*Monitor `json:"monitors"`
}
func (cfg *CachetMonitor) ValidateConfiguration() error {
if cfg.Logger == nil {
cfg.Logger = log.New(os.Stdout, "", log.Llongfile|log.Ldate|log.Ltime)
}
if len(cfg.SystemName) == 0 {
// get hostname
cfg.SystemName = getHostname()
}
if len(cfg.APIToken) == 0 || len(cfg.APIUrl) == 0 {
return errors.New("API URL or API Token not set. cachet-monitor won't be able to report incidents.\n\nPlease set:\n CACHET_API and CACHET_TOKEN environment variable to override settings.\n\nGet help at https://github.com/castawaylabs/cachet-monitor\n")
}
if len(cfg.Monitors) == 0 {
return errors.New("No monitors defined!\nSee sample configuration: https://github.com/castawaylabs/cachet-monitor/blob/master/example.config.json\n")
}
for _, monitor := range cfg.Monitors {
if err := monitor.ValidateConfiguration(); err != nil {
return err
}
}
return nil
}
// getHostname returns id of the current system
func getHostname() string {
hostname, err := os.Hostname()
if err != nil || len(hostname) == 0 {
addrs, err := net.InterfaceAddrs()
if err != nil {
return "unknown"
}
for _, addr := range addrs {
return addr.String()
}
}
return hostname
}

17
example.config.json Normal file
View File

@@ -0,0 +1,17 @@
{
"api_url": "https://demo.cachethq.io/api/v1",
"api_token": "9yMHsdioQosnyVK4iCVR",
"interval": 5,
"monitors": [
{
"name": "nodegear frontend",
"url": "https://nodegear.io/ping",
"metric_id": 1,
"threshold": 80,
"component_id": null,
"expected_status_code": 200,
"strict_tls": true
}
],
"insecure_api": false
}

58
http.go Normal file
View File

@@ -0,0 +1,58 @@
package cachet
import (
"bytes"
"crypto/tls"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"strconv"
"time"
)
func (monitor *CachetMonitor) makeRequest(requestType string, url string, reqBody []byte) (*http.Response, []byte, error) {
req, err := http.NewRequest(requestType, monitor.APIUrl+url, bytes.NewBuffer(reqBody))
req.Header.Set("Content-Type", "application/json")
req.Header.Set("X-Cachet-Token", monitor.APIToken)
client := &http.Client{}
if monitor.InsecureAPI == true {
client.Transport = &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
}
}
res, err := client.Do(req)
if err != nil {
return nil, []byte{}, err
}
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
return res, body, nil
}
// SendMetric sends lag metric point
func (monitor *Monitor) SendMetric(delay int64) error {
if monitor.MetricID == 0 {
return nil
}
jsonBytes, _ := json.Marshal(&map[string]interface{}{
"value": delay,
})
resp, _, err := monitor.config.makeRequest("POST", "/metrics/"+strconv.Itoa(monitor.MetricID)+"/points", jsonBytes)
if err != nil || resp.StatusCode != 200 {
return fmt.Errorf("Could not log data point!\n%v\n", err)
}
return nil
}
func getMs() int64 {
return time.Now().UnixNano() / int64(time.Millisecond)
}

114
incident.go Normal file
View File

@@ -0,0 +1,114 @@
package cachet
import (
"encoding/json"
"fmt"
"strconv"
)
// Incident Cachet data model
type Incident struct {
ID int `json:"id"`
Name string `json:"name"`
Message string `json:"message"`
Status int `json:"status"`
Visible int `json"visible"`
Notify bool `json:"notify"`
ComponentID int `json:"component_id"`
ComponentStatus int `json:"component_status"`
}
// Send - Create or Update incident
func (incident *Incident) Send(cfg *CachetMonitor) error {
switch incident.Status {
case 1, 2, 3:
// partial outage
incident.ComponentStatus = 3
componentStatus, err := incident.GetComponentStatus(cfg)
if componentStatus == 3 {
// major outage
incident.ComponentStatus = 4
}
if err != nil {
cfg.Logger.Printf("cannot fetch component: %v", err)
}
case 4:
// fixed
incident.ComponentStatus = 1
}
requestType := "POST"
requestURL := "/incidents"
if incident.ID > 0 {
requestType = "PUT"
requestURL += "/" + strconv.Itoa(incident.ID)
}
jsonBytes, _ := json.Marshal(incident)
resp, body, err := cfg.makeRequest(requestType, requestURL, jsonBytes)
if err != nil {
return err
}
var data struct {
Incident struct {
ID int `json:"id"`
} `json:"data"`
}
if err := json.Unmarshal(body, &data); err != nil {
return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body))
}
incident.ID = data.Incident.ID
if resp.StatusCode != 200 {
return fmt.Errorf("Could not create/update incident!")
}
return nil
}
func (incident *Incident) GetComponentStatus(cfg *CachetMonitor) (int, error) {
resp, body, err := cfg.makeRequest("GET", "/components/"+strconv.Itoa(incident.ComponentID), nil)
if err != nil {
return 0, err
}
if resp.StatusCode != 200 {
return 0, fmt.Errorf("Invalid status code. Received %d", resp.StatusCode)
}
var data struct {
Component struct {
Status int `json:"status"`
} `json:"data"`
}
if err := json.Unmarshal(body, &data); err != nil {
return 0, fmt.Errorf("Cannot parse component body: %v. Err = %v", string(body), err)
}
return data.Component.Status, nil
}
// SetInvestigating sets status to Investigating
func (incident *Incident) SetInvestigating() {
incident.Status = 1
}
// SetIdentified sets status to Identified
func (incident *Incident) SetIdentified() {
incident.Status = 2
}
// SetWatching sets status to Watching
func (incident *Incident) SetWatching() {
incident.Status = 3
}
// SetFixed sets status to Fixed
func (incident *Incident) SetFixed() {
incident.Status = 4
}

252
monitor.go Normal file
View File

@@ -0,0 +1,252 @@
package cachet
import (
"crypto/tls"
"errors"
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strconv"
"strings"
"sync"
"time"
)
const HttpTimeout = time.Duration(time.Second)
const DefaultInterval = 60
const DefaultTimeFormat = "15:04:05 Jan 2 MST"
// Monitor data model
type Monitor struct {
Name string `json:"name"`
URL string `json:"url"`
Method string `json:"method"`
StrictTLS bool `json:"strict_tls"`
CheckInterval time.Duration `json:"interval"`
MetricID int `json:"metric_id"`
ComponentID int `json:"component_id"`
// Threshold = percentage
Threshold float32 `json:"threshold"`
ExpectedStatusCode int `json:"expected_status_code"`
// compiled to Regexp
ExpectedBody string `json:"expected_body"`
bodyRegexp *regexp.Regexp
history []bool
lastFailReason string
incident *Incident
config *CachetMonitor
// Closed when mon.Stop() is called
stopC chan bool
}
func (mon *Monitor) Start(cfg *CachetMonitor, wg *sync.WaitGroup) {
wg.Add(1)
mon.config = cfg
mon.stopC = make(chan bool)
mon.config.Logger.Printf(" Starting %s: %d seconds check interval\n - %v %s", mon.Name, mon.CheckInterval, mon.Method, mon.URL)
// print features
if mon.ExpectedStatusCode > 0 {
mon.config.Logger.Printf(" - Expect HTTP %d", mon.ExpectedStatusCode)
}
if len(mon.ExpectedBody) > 0 {
mon.config.Logger.Printf(" - Expect Body to match \"%v\"", mon.ExpectedBody)
}
if mon.MetricID > 0 {
mon.config.Logger.Printf(" - Log lag to metric id %d\n", mon.MetricID)
}
if mon.ComponentID > 0 {
mon.config.Logger.Printf(" - Update component id %d\n\n", mon.ComponentID)
}
mon.Tick()
ticker := time.NewTicker(mon.CheckInterval * time.Second)
for {
select {
case <-ticker.C:
mon.Tick()
case <-mon.stopC:
wg.Done()
return
}
}
}
func (monitor *Monitor) Stop() {
if monitor.Stopped() {
return
}
close(monitor.stopC)
}
func (monitor *Monitor) Stopped() bool {
select {
case <-monitor.stopC:
return true
default:
return false
}
}
func (monitor *Monitor) Tick() {
reqStart := getMs()
isUp := monitor.doRequest()
lag := getMs() - reqStart
if len(monitor.history) == 9 {
monitor.config.Logger.Printf("%v is now saturated\n", monitor.Name)
}
if len(monitor.history) >= 10 {
monitor.history = monitor.history[len(monitor.history)-9:]
}
monitor.history = append(monitor.history, isUp)
monitor.AnalyseData()
if isUp == true && monitor.MetricID > 0 {
monitor.SendMetric(lag)
}
}
func (monitor *Monitor) doRequest() bool {
client := &http.Client{
Timeout: HttpTimeout,
}
if monitor.StrictTLS == false {
client.Transport = &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
}
}
resp, err := client.Get(monitor.URL)
if err != nil {
monitor.lastFailReason = err.Error()
return false
}
defer resp.Body.Close()
if monitor.ExpectedStatusCode > 0 && resp.StatusCode != monitor.ExpectedStatusCode {
monitor.lastFailReason = "Unexpected response code: " + strconv.Itoa(resp.StatusCode) + ". Expected " + strconv.Itoa(monitor.ExpectedStatusCode)
return false
}
if monitor.bodyRegexp != nil {
// check body
responseBody, err := ioutil.ReadAll(resp.Body)
if err != nil {
monitor.lastFailReason = err.Error()
return false
}
match := monitor.bodyRegexp.Match(responseBody)
if !match {
monitor.lastFailReason = "Unexpected body: " + string(responseBody) + ". Expected to match " + monitor.ExpectedBody
}
return match
}
return true
}
// AnalyseData decides if the monitor is statistically up or down and creates / resolves an incident
func (monitor *Monitor) AnalyseData() {
// look at the past few incidents
numDown := 0
for _, wasUp := range monitor.history {
if wasUp == false {
numDown++
}
}
t := (float32(numDown) / float32(len(monitor.history))) * 100
monitor.config.Logger.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().UnixNano()/int64(time.Second))
if len(monitor.history) != 10 {
// not saturated
return
}
if t > monitor.Threshold && monitor.incident == nil {
monitor.incident = &Incident{
Name: monitor.Name + " - " + monitor.config.SystemName,
ComponentID: monitor.ComponentID,
Message: monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat),
Notify: true,
}
if len(monitor.lastFailReason) > 0 {
monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`"
}
// is down, create an incident
monitor.config.Logger.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason)
// set investigating status
monitor.incident.SetInvestigating()
// create/update incident
if err := monitor.incident.Send(monitor.config); err != nil {
monitor.config.Logger.Printf("Error sending incident: %v\n", err)
}
} else if t < monitor.Threshold && monitor.incident != nil {
// was down, created an incident, its now ok, make it resolved.
monitor.config.Logger.Printf("%v resolved downtime incident", monitor.Name)
// resolve incident
monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message
monitor.incident.SetFixed()
monitor.incident.Send(monitor.config)
monitor.lastFailReason = ""
monitor.incident = nil
}
}
func (monitor *Monitor) ValidateConfiguration() error {
if len(monitor.ExpectedBody) > 0 {
exp, err := regexp.Compile(monitor.ExpectedBody)
if err != nil {
return err
}
monitor.bodyRegexp = exp
}
if len(monitor.ExpectedBody) == 0 && monitor.ExpectedStatusCode == 0 {
return errors.New("Nothing to check, both 'expected_body' and 'expected_status_code' fields empty")
}
if monitor.CheckInterval < 1 {
monitor.CheckInterval = DefaultInterval
}
monitor.Method = strings.ToUpper(monitor.Method)
switch monitor.Method {
case "GET", "POST", "DELETE", "OPTIONS", "HEAD":
break
case "":
monitor.Method = "GET"
default:
return fmt.Errorf("Unsupported check method: %v", monitor.Method)
}
if monitor.ComponentID == 0 && monitor.MetricID == 0 {
return errors.New("component_id & metric_id are unset")
}
if monitor.Threshold <= 0 {
monitor.Threshold = 100
}
return nil
}

91
readme.md Normal file
View File

@@ -0,0 +1,91 @@
![screenshot](https://castawaylabs.github.io/cachet-monitor/screenshot.png)
Features
--------
- [x] Creates & Resolves Incidents
- [x] Posts monitor lag to cachet graphs
- [x] Updates Component to Partial Outage
- [x] Updates Component to Major Outage if already in Partial Outage (works well with distributed monitoring)
- [x] Can be run on multiple servers and geo regions
Configuration
-------------
```
{
// URL for the API. Note: Must end with /api/v1
"api_url": "https://<cachet domain>/api/v1",
// Your API token for Cachet
"api_token": "<cachet api token>",
// optional, false default, set if your certificate is self-signed/untrusted
"insecure_api": false,
"monitors": [{
// required, friendly name for your monitor
"name": "Name of your monitor",
// required, url to probe
"url": "Ping URL",
// optional, http method (defaults GET)
"method": "get",
// self-signed ssl certificate
"strict_tls": true,
// seconds between checks
"interval": 10,
// post lag to cachet metric (graph)
// note either metric ID or component ID are required
"metric_id": <metric id>,
// post incidents to this component
"component_id": <component id>,
// If % of downtime is over this threshold, open an incident
"threshold": 80,
// optional, expected status code (either status code or body must be supplied)
"expected_status_code": 200,
// optional, regular expression to match body content
"expected_body": "P.*NG"
}],
// optional, system name to identify bot (uses hostname by default)
"system_name": "",
// optional, defaults to stdout
"log_path": ""
}
```
Installation
------------
1. Download binary from [release page](https://github.com/CastawayLabs/cachet-monitor/releases)
2. Create your configuration ([example](https://raw.githubusercontent.com/CastawayLabs/cachet-monitor/master/example.config.json))
3. `cachet-monitor -c /etc/cachet-monitor.config.json`
pro tip: run in background using `nohup cachet-monitor 2>&1 > /var/log/cachet-monitor.log &`
```
Usage of cachet-monitor:
-c="/etc/cachet-monitor.config.json": Config path
-log="": Log path
-name="": System Name
```
Environment variables
---------------------
| Name | Example Value | Description |
| ------------ | ------------------------------ | --------------------------- |
| CACHET_API | http://demo.cachethq.io/api/v1 | URL endpoint for cachet api |
| CACHET_TOKEN | APIToken123 | API Authentication token |
| CACHET_DEV | 1 | Strips logging |
Vision and goals
----------------
We made this tool because we felt the need to have our own monitoring software (leveraging on Cachet).
The idea is a stateless program which collects data and pushes it to a central cachet instance.
This gives us power to have an army of geographically distributed loggers and reveal issues in both latency & downtime on client websites.
Package usage
-------------
When using `cachet-monitor` as a package in another program, you should follow what `cli/main.go` does. It is important to call `ValidateConfiguration` on `CachetMonitor` and all the monitors inside.
[API Documentation](https://godoc.org/github.com/CastawayLabs/cachet-monitor)