20 Commits
v1.1.0 ... v2.0

Author SHA1 Message Date
Matej Kramny
5c4f0c2e69 Print descriptions from cli 2016-05-19 19:34:36 +01:00
Matej Kramny
9d0e176695 Add to readme, remove comment 2016-05-19 19:26:29 +01:00
Matej Kramny
6e9f5440ba Update documentation 2016-05-19 19:18:02 +01:00
Matej Kramny
0ceccccd45 More config options, markdown in incidents
- Specify method to ping with
- Body regex match
- Markdown in incident messages
- Update existing incidents
2016-05-19 18:40:01 +01:00
Matej Kramny
c729fbdf41 Update readme 2016-05-19 12:55:53 +01:00
Matej Kramny
30eea191d3 Merge branch 'Soulou-monitor_interval'
* Soulou-monitor_interval:
  update readme
  Implement 'inteval' config parameter -> number of seconds between checks
2016-05-19 12:53:54 +01:00
Matej Kramny
e0556b56b4 Merge branch 'monitor_interval' of https://github.com/Soulou/cachet-monitor into Soulou-monitor_interval
* 'monitor_interval' of https://github.com/Soulou/cachet-monitor:
  update readme
  Implement 'inteval' config parameter -> number of seconds between checks

# Conflicts:
#	cachet/config.go
#	main.go
#	monitor.go
#	readme.md
2016-05-19 12:53:21 +01:00
Matej Kramny
2d62fc7443 refactoring 2016-05-19 12:20:56 +01:00
Matej Kramny
dfad6f0906 Vision readme 2016-05-19 00:05:00 +01:00
Matej Kramny
267a6cb6b3 better packaging
- update readme
2016-05-18 23:54:55 +01:00
Alan Campbell
025d0c5822 Merge pull request #27 from CastawayLabs/fix-readme
Update go install guide
2016-03-07 16:34:36 -05:00
Alan Campbell
5793df7353 Update go install guide 2016-03-07 16:31:24 -05:00
Matej Kramny
fae03e7561 Merge pull request #10 from studiofrenetic/master
Add time interval
2015-12-08 13:02:12 +00:00
Matej Kramny
bcf53f0afc Merge pull request #17 from Soulou/patch-1
Fix configuration example in README
2015-08-24 00:08:36 +01:00
Soulou
e4a586b92a update readme 2015-08-23 20:01:41 +02:00
Soulou
76b897eb05 Implement 'inteval' config parameter -> number of seconds between checks 2015-08-23 17:33:23 +02:00
Soulou
3c1e2cd452 Fix configuration example in README 2015-08-22 19:13:27 +02:00
Mathieu Doyon
850f4d237b Update interval to 5 secs 2015-07-19 16:23:49 -04:00
Mathieu Doyon
019bc8c057 Fix merge conflict 2015-07-19 16:23:18 -04:00
Mathieu Doyon
f918ea38cd Add time interval 2015-04-08 14:28:36 -04:00
16 changed files with 673 additions and 609 deletions

View File

@@ -1,6 +0,0 @@
# Build image for cachet-monitor, based on the `golang` image.
FROM golang
# Copy the build context to the package's import path under /go/src.
ADD . /go/src/github.com/castawaylabs/cachet-monitor
# Compile and install the cachet-monitor package.
RUN go install github.com/castawaylabs/cachet-monitor
# Run the installed binary when the container starts.
ENTRYPOINT /go/bin/cachet-monitor

View File

@@ -1,20 +0,0 @@
package cachet
import "encoding/json"
// Component is the Cachet component model.
//
// ID and Status are kept as json.Number. HumanStatus and IncidentCount are
// tagged `json:"-"` and therefore never read from or written to JSON.
type Component struct {
ID json.Number `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Status json.Number `json:"status_id"`
HumanStatus string `json:"-"`
IncidentCount int `json:"-"`
CreatedAt *string `json:"created_at"`
UpdatedAt *string `json:"updated_at"`
}
// ComponentData is the JSON response envelope for a single component: the
// component itself is found under the "data" key.
type ComponentData struct {
Component Component `json:"data"`
}

View File

@@ -1,117 +0,0 @@
package cachet
import (
"encoding/json"
"flag"
"fmt"
"github.com/castawaylabs/cachet-monitor/system"
"io"
"io/ioutil"
"log"
"net/http"
"net/url"
"os"
)
// Config is the package-wide configuration. It is populated by init from the
// config file, command line flags and environment variables.
var Config CachetConfig
// Logger is the package-wide logger. init creates it, writing to stdout or to
// the configured log path.
var Logger *log.Logger
// CachetConfig is the monitoring tool configuration as decoded from the JSON
// config file.
//
// APIUrl and APIToken may be overridden by the CACHET_API and CACHET_TOKEN
// environment variables, SystemName by the -name flag and LogPath by the -log
// flag (see init). InsecureAPI disables TLS certificate verification for API
// requests.
type CachetConfig struct {
APIUrl string `json:"api_url"`
APIToken string `json:"api_token"`
Monitors []*Monitor `json:"monitors"`
SystemName string `json:"system_name"`
LogPath string `json:"log_path"`
InsecureAPI bool `json:"insecure_api"`
}
// init loads the package configuration before main runs.
//
// It parses the -c, -name and -log flags, reads the JSON configuration from a
// URL or a local file, applies the flag and environment (CACHET_API,
// CACHET_TOKEN) overrides, checks that the API settings and at least one
// monitor are present, and finally creates Logger. Every failure prints a
// message and exits with status 1.
func init() {
	var configPath string
	var systemName string
	var logPath string

	flag.StringVar(&configPath, "c", "/etc/cachet-monitor.config.json", "Config path")
	flag.StringVar(&systemName, "name", "", "System Name")
	flag.StringVar(&logPath, "log", "", "Log path")
	flag.Parse()

	var data []byte

	// a config path that parses as a request URI with a scheme is downloaded;
	// the local is not called `url` so the net/url package stays visible
	configURL, err := url.ParseRequestURI(configPath)
	if err == nil && len(configURL.Scheme) > 0 {
		// download config
		response, err := http.Get(configPath)
		if err != nil {
			fmt.Printf("Cannot download network config: %v\n", err)
			os.Exit(1)
		}
		defer response.Body.Close()

		// a truncated download must not be parsed as if it were complete
		data, err = ioutil.ReadAll(response.Body)
		if err != nil {
			fmt.Printf("Cannot download network config: %v\n", err)
			os.Exit(1)
		}

		fmt.Println("Downloaded network configuration.")
	} else {
		data, err = ioutil.ReadFile(configPath)
		if err != nil {
			fmt.Println("Config file '" + configPath + "' missing!")
			os.Exit(1)
		}
	}

	if err := json.Unmarshal(data, &Config); err != nil {
		// include the decoder error so the user can locate the mistake
		fmt.Printf("Cannot parse config! %v\n", err)
		os.Exit(1)
	}

	if len(systemName) > 0 {
		Config.SystemName = systemName
	}
	if len(Config.SystemName) == 0 {
		// get hostname
		Config.SystemName = system.GetHostname()
	}

	if len(os.Getenv("CACHET_API")) > 0 {
		Config.APIUrl = os.Getenv("CACHET_API")
	}
	if len(os.Getenv("CACHET_TOKEN")) > 0 {
		Config.APIToken = os.Getenv("CACHET_TOKEN")
	}

	if len(Config.APIToken) == 0 || len(Config.APIUrl) == 0 {
		fmt.Printf("API URL or API Token not set. cachet-monitor won't be able to report incidents.\n\nPlease set:\n CACHET_API and CACHET_TOKEN environment variable to override settings.\n\nGet help at https://github.com/CastawayLabs/cachet-monitor\n")
		os.Exit(1)
	}

	if len(Config.Monitors) == 0 {
		fmt.Printf("No monitors defined!\nSee sample configuration: https://github.com/CastawayLabs/cachet-monitor/blob/master/example.config.json\n")
		os.Exit(1)
	}

	if len(logPath) > 0 {
		Config.LogPath = logPath
	}

	var logWriter io.Writer
	logWriter = os.Stdout
	if len(Config.LogPath) > 0 {
		logWriter, err = os.Create(Config.LogPath)
		if err != nil {
			fmt.Printf("Unable to open file '%v' for logging\n", Config.LogPath)
			os.Exit(1)
		}
	}

	// DEVELOPMENT strips the file/date/time prefix from log lines
	flags := log.Llongfile | log.Ldate | log.Ltime
	if len(os.Getenv("DEVELOPMENT")) > 0 {
		flags = 0
	}

	Logger = log.New(logWriter, "", flags)
}

View File

@@ -1,165 +0,0 @@
package cachet
import (
"encoding/json"
"strconv"
)
// Incident is the Cachet incident data model.
//
// Status holds the numeric incident status as set by the Set* methods
// ("1" Investigating, "2" Identified, "3" Watching, "4" Fixed). Component is
// tagged `json:"-"`, so it is never decoded from API responses; it is filled
// in by fetchComponent.
type Incident struct {
ID json.Number `json:"id"`
Name string `json:"name"`
Message string `json:"message"`
Status json.Number `json:"status"` // 4?
HumanStatus string `json:"human_status"`
Component *Component `json:"-"`
ComponentID *json.Number `json:"component_id"`
CreatedAt *string `json:"created_at"`
UpdatedAt *string `json:"updated_at"`
}
// IncidentData is the response envelope returned when creating/updating an
// incident: the incident is found under the "data" key.
type IncidentData struct {
Incident Incident `json:"data"`
}
// IncidentList is the response envelope of the API endpoint /incidents: the
// list of incidents is found under the "data" key.
type IncidentList struct {
Incidents []Incident `json:"data"`
}
// GetIncidents fetches the list of incidents from the API endpoint
// /incidents.
//
// Failures are logged and reported as an empty list. An unparsable response
// body is treated the same way as a failed request instead of panicking, so
// one malformed reply cannot take the whole monitor down.
func GetIncidents() []Incident {
	_, body, err := makeRequest("GET", "/incidents", nil)
	if err != nil {
		Logger.Printf("Cannot get incidents: %v\n", err)
		return []Incident{}
	}

	var data IncidentList
	if err := json.Unmarshal(body, &data); err != nil {
		Logger.Printf("Cannot parse incidents: %v\n", err)
		return []Incident{}
	}

	return data.Incidents
}
// Send creates the incident (POST /incidents) or, when the incident already
// has an ID, updates it (PUT /incidents/<id>).
//
// Failures are logged, never raised. The incident ID is only taken from the
// response when the API answered with status 200 and a parsable body, so a
// failed update cannot wipe the ID of an already created incident.
func (incident *Incident) Send() {
	// marshalling a map of plain values cannot fail, hence the ignored error
	jsonBytes, _ := json.Marshal(map[string]interface{}{
		"name":         incident.Name,
		"message":      incident.Message,
		"status":       incident.Status,
		"component_id": incident.ComponentID,
		"notify":       true,
	})

	requestType := "POST"
	requestURL := "/incidents"
	if len(incident.ID) > 0 {
		requestType = "PUT"
		requestURL += "/" + string(incident.ID)
	}

	resp, body, err := makeRequest(requestType, requestURL, jsonBytes)
	if err != nil {
		Logger.Printf("Cannot create/update incident: %v\n", err)
		return
	}

	Logger.Println(strconv.Itoa(resp.StatusCode) + " " + string(body))

	if resp.StatusCode != 200 {
		Logger.Println("Could not create/update incident!")
		return
	}

	var data IncidentData
	if err := json.Unmarshal(body, &data); err != nil {
		Logger.Println("Cannot parse incident body.", string(body))
		return
	}

	incident.ID = data.Incident.ID
	incident.Component = data.Incident.Component
}
// fetchComponent loads the component referenced by incident.ComponentID from
// the API and stores it in incident.Component.
//
// The caller must make sure ComponentID is non-nil. Request and parse
// failures are returned as errors; a parse failure is also logged together
// with the offending body.
func (incident *Incident) fetchComponent() error {
	_, body, err := makeRequest("GET", "/components/"+string(*incident.ComponentID), nil)
	if err != nil {
		return err
	}

	var data ComponentData
	if err := json.Unmarshal(body, &data); err != nil {
		// Printf, not Println: the message carries a formatting directive
		Logger.Printf("Cannot parse component body. %v\n", string(body))
		return err
	}

	incident.Component = &data.Component
	return nil
}
// UpdateComponent derives the component status from the incident status and
// pushes it to the API (PUT /components/<id>).
//
// Nothing happens when the incident has no component ID. The component is
// fetched first when it is not loaded yet. Incident status 1-3 moves the
// component to "3", or to "4" when it already is "3"; incident status 4 sets
// the component back to "1". Failures are logged.
func (incident *Incident) UpdateComponent() {
	if incident.ComponentID == nil || len(*incident.ComponentID) == 0 {
		return
	}

	if incident.Component == nil {
		// fetch component
		if err := incident.fetchComponent(); err != nil {
			Logger.Printf("Cannot fetch component for incident. %v\n", err)
			return
		}
	}

	// an unparsable status yields 0, which matches no case below
	status, _ := strconv.Atoi(string(incident.Status))
	switch status {
	case 1, 2, 3:
		if incident.Component.Status == "3" {
			incident.Component.Status = "4"
		} else {
			incident.Component.Status = "3"
		}
	case 4:
		incident.Component.Status = "1"
	}

	jsonBytes, _ := json.Marshal(map[string]interface{}{
		"status": incident.Component.Status,
	})

	resp, _, err := makeRequest("PUT", "/components/"+string(incident.Component.ID), jsonBytes)
	if err != nil {
		// resp is nil when the request itself failed; do not touch it
		Logger.Printf("Could not update component: %v", err)
		return
	}
	if resp.StatusCode != 200 {
		Logger.Printf("Could not update component: (resp code %d)", resp.StatusCode)
	}
}
// SetInvestigating marks the incident as Investigating (status "1").
func (incident *Incident) SetInvestigating() {
	incident.Status, incident.HumanStatus = "1", "Investigating"
}

// SetIdentified marks the incident as Identified (status "2").
func (incident *Incident) SetIdentified() {
	incident.Status, incident.HumanStatus = "2", "Identified"
}

// SetWatching marks the incident as Watching (status "3").
func (incident *Incident) SetWatching() {
	incident.Status, incident.HumanStatus = "3", "Watching"
}

// SetFixed marks the incident as Fixed (status "4").
func (incident *Incident) SetFixed() {
	incident.Status, incident.HumanStatus = "4", "Fixed"
}

View File

@@ -1,23 +0,0 @@
package cachet
import (
"encoding/json"
"strconv"
)
// SendMetric posts one lag data point to the metric with the given ID
// (POST /metrics/<id>/points).
//
// metricID values of zero or below are ignored. delay is sent as the point's
// "value". Failures are logged; a non-200 answer is logged with its status
// code rather than with a nil error.
func SendMetric(metricID int, delay int64) {
	if metricID <= 0 {
		return
	}

	// marshalling a map holding a single int64 cannot fail
	jsonBytes, _ := json.Marshal(&map[string]interface{}{
		"value": delay,
	})

	resp, _, err := makeRequest("POST", "/metrics/"+strconv.Itoa(metricID)+"/points", jsonBytes)
	if err != nil {
		Logger.Printf("Could not log data point!\n%v\n", err)
		return
	}
	if resp.StatusCode != 200 {
		Logger.Printf("Could not log data point!\nUnexpected response code: %d\n", resp.StatusCode)
	}
}

View File

@@ -1,133 +0,0 @@
package cachet
import (
"crypto/tls"
"encoding/json"
"net/http"
"strconv"
"time"
)
// timeout is the one-second limit given to the http.Client that performs the
// monitor checks (see doRequest).
const timeout = time.Duration(time.Second)
// Monitor is the configuration and runtime state of a single URL check.
//
// The fields tagged `json:"-"` are runtime state: History holds the most
// recent check results (true = up), LastFailReason the reason of the latest
// failed check and Incident the incident currently open for this monitor.
// Threshold is compared against the percentage of failed checks in History.
// StrictTLS is a pointer so that an absent value can be told apart from an
// explicit false; only an explicit false disables certificate verification.
type Monitor struct {
Name string `json:"name"`
URL string `json:"url"`
MetricID int `json:"metric_id"`
Threshold float32 `json:"threshold"`
ComponentID *int `json:"component_id"`
ExpectedStatusCode int `json:"expected_status_code"`
StrictTLS *bool `json:"strict_tls"`
History []bool `json:"-"`
LastFailReason *string `json:"-"`
Incident *Incident `json:"-"`
}
// Run performs one check: it probes the URL, records the result in the
// history (capped at the 10 most recent results), re-evaluates the monitor
// state and, when the check succeeded and a metric is configured, reports the
// measured lag in milliseconds.
func (monitor *Monitor) Run() {
	startedAt := getMs()
	up := monitor.doRequest()
	elapsed := getMs() - startedAt

	// keep the 9 newest results so that the append below yields exactly 10
	if count := len(monitor.History); count >= 10 {
		monitor.History = monitor.History[count-9:]
	}
	monitor.History = append(monitor.History, up)

	monitor.AnalyseData()

	if up && monitor.MetricID > 0 {
		SendMetric(monitor.MetricID, elapsed)
	}
}
// doRequest issues a GET against the monitor URL and reports whether the
// response carried the expected status code. On failure the reason is stored
// in LastFailReason. Certificate verification is skipped only when StrictTLS
// is explicitly set to false.
func (monitor *Monitor) doRequest() bool {
	httpClient := &http.Client{Timeout: timeout}

	skipVerify := monitor.StrictTLS != nil && !*monitor.StrictTLS
	if skipVerify {
		httpClient.Transport = &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		}
	}

	resp, err := httpClient.Get(monitor.URL)
	if err != nil {
		reason := err.Error()
		monitor.LastFailReason = &reason
		return false
	}
	defer resp.Body.Close()

	if resp.StatusCode == monitor.ExpectedStatusCode {
		return true
	}

	reason := "Unexpected response code: " + strconv.Itoa(resp.StatusCode) + ". Expected " + strconv.Itoa(monitor.ExpectedStatusCode)
	monitor.LastFailReason = &reason
	return false
}
// AnalyseData decides if the monitor is statistically up or down and creates
// or resolves an incident accordingly.
//
// The share of failed checks in History is compared with Threshold once the
// history holds exactly 10 results. Above the threshold with no open incident
// an "Investigating" incident is created; below the threshold with an open
// incident a "Fixed" incident is sent and the open incident is cleared.
// ComponentID is optional: when it is nil the incident is sent without a
// component instead of dereferencing a nil pointer.
func (monitor *Monitor) AnalyseData() {
	// look at the past few incidents
	numDown := 0
	for _, wasUp := range monitor.History {
		if !wasUp {
			numDown++
		}
	}

	t := (float32(numDown) / float32(len(monitor.History))) * 100
	Logger.Printf("%s %.2f%% Down at %v. Threshold: %.2f%%\n", monitor.URL, t, time.Now().UnixNano()/int64(time.Second), monitor.Threshold)

	if len(monitor.History) != 10 {
		// not enough data
		return
	}

	if t > monitor.Threshold && monitor.Incident == nil {
		// is down, create an incident
		Logger.Println("Creating incident...")

		monitor.Incident = &Incident{
			Name:        monitor.Name + " - " + Config.SystemName,
			Message:     monitor.Name + " check failed",
			ComponentID: monitor.incidentComponentID(),
		}

		if monitor.LastFailReason != nil {
			monitor.Incident.Message += "\n\n - " + *monitor.LastFailReason
		}

		// set investigating status
		monitor.Incident.SetInvestigating()

		// create/update incident
		monitor.Incident.Send()
		monitor.Incident.UpdateComponent()
	} else if t < monitor.Threshold && monitor.Incident != nil {
		// was down, created an incident, its now ok, make it resolved.
		Logger.Println("Updating incident to resolved...")

		monitor.Incident = &Incident{
			Name:        monitor.Incident.Name,
			Message:     monitor.Name + " check succeeded",
			ComponentID: monitor.incidentComponentID(),
		}

		monitor.Incident.SetFixed()
		monitor.Incident.Send()
		monitor.Incident.UpdateComponent()

		monitor.Incident = nil
	}
}

// incidentComponentID converts the optional monitor component ID into the
// json.Number pointer used by Incident; it returns nil when no component is
// configured.
func (monitor *Monitor) incidentComponentID() *json.Number {
	if monitor.ComponentID == nil {
		return nil
	}

	id := json.Number(strconv.Itoa(*monitor.ComponentID))
	return &id
}
// getMs returns the current Unix time in milliseconds.
func getMs() int64 {
	nanos := time.Now().UnixNano()
	perMilli := int64(time.Millisecond)
	return nanos / perMilli
}

View File

@@ -1,32 +0,0 @@
package cachet
import (
"bytes"
"crypto/tls"
"io/ioutil"
"net/http"
)
// makeRequest performs an authenticated JSON request against the Cachet API.
//
// requestType is the HTTP method, url the path appended to Config.APIUrl and
// reqBody the (possibly nil) JSON payload. It returns the response together
// with the fully read body; the response body is already closed on return.
// An error is returned when the request cannot be built or sent, or when the
// body cannot be read; in that case the response is nil.
func makeRequest(requestType string, url string, reqBody []byte) (*http.Response, []byte, error) {
	req, err := http.NewRequest(requestType, Config.APIUrl+url, bytes.NewBuffer(reqBody))
	if err != nil {
		// req is nil here; setting headers on it would panic
		return nil, []byte{}, err
	}

	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("X-Cachet-Token", Config.APIToken)

	client := &http.Client{}
	if Config.InsecureAPI {
		client.Transport = &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		}
	}

	res, err := client.Do(req)
	if err != nil {
		return nil, []byte{}, err
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, []byte{}, err
	}

	return res, body, nil
}

139
cli/main.go Normal file
View File

@@ -0,0 +1,139 @@
package main
import (
"encoding/json"
"errors"
"flag"
"fmt"
"io/ioutil"
"log"
"net/http"
"net/url"
"os"
"os/signal"
"sync"
cachet "github.com/castawaylabs/cachet-monitor"
)
// Command line flag values, bound in main: -c (config path or URL),
// -name (system name override) and -log (log path override).
var configPath string
var systemName string
var logPath string
// main loads and validates the configuration, starts one goroutine per
// monitor and waits for an interrupt, after which every monitor is stopped
// and main waits for them to finish.
//
// Overrides are applied on top of the config file in this order: -name and
// -log flags, then the CACHET_API and CACHET_TOKEN environment variables.
func main() {
	flag.StringVar(&configPath, "c", "/etc/cachet-monitor.config.json", "Config path")
	flag.StringVar(&systemName, "name", "", "System Name")
	flag.StringVar(&logPath, "log", "", "Log path")
	flag.Parse()

	cfg, err := getConfiguration(configPath)
	if err != nil {
		panic(err)
	}

	if len(systemName) > 0 {
		cfg.SystemName = systemName
	}
	if len(logPath) > 0 {
		cfg.LogPath = logPath
		// getConfiguration built the logger from the config file's log path;
		// rebuild it, otherwise the -log flag has no effect
		cfg.Logger = getLogger(logPath)
	}

	if len(os.Getenv("CACHET_API")) > 0 {
		cfg.APIUrl = os.Getenv("CACHET_API")
	}
	if len(os.Getenv("CACHET_TOKEN")) > 0 {
		cfg.APIToken = os.Getenv("CACHET_TOKEN")
	}

	if err := cfg.ValidateConfiguration(); err != nil {
		panic(err)
	}

	cfg.Logger.Printf("System: %s\nAPI: %s\nMonitors: %d\n\n", cfg.SystemName, cfg.APIUrl, len(cfg.Monitors))

	wg := &sync.WaitGroup{}
	for _, mon := range cfg.Monitors {
		cfg.Logger.Printf(" Starting %s: %d seconds check interval\n - %v %s", mon.Name, mon.CheckInterval, mon.Method, mon.URL)

		// print features
		if mon.ExpectedStatusCode > 0 {
			cfg.Logger.Printf(" - Expect HTTP %d", mon.ExpectedStatusCode)
		}
		if len(mon.ExpectedBody) > 0 {
			cfg.Logger.Printf(" - Expect Body to match \"%v\"", mon.ExpectedBody)
		}
		if mon.MetricID > 0 {
			cfg.Logger.Printf(" - Log lag to metric id %d\n", mon.MetricID)
		}
		if mon.ComponentID > 0 {
			cfg.Logger.Printf(" - Update component id %d\n\n", mon.ComponentID)
		}

		go mon.Start(cfg, wg)
	}

	// os.Kill (SIGKILL) cannot be caught, so only the interrupt is registered
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, os.Interrupt)
	<-signals

	cfg.Logger.Println("Abort: Waiting monitors to finish")
	for _, mon := range cfg.Monitors {
		mon.Stop()
	}

	wg.Wait()
}
// getLogger builds the logger used by the monitor.
//
// Output goes to stdout, or to the file at logPath when one is given (the
// file is created/truncated; failure to do so prints a message and exits with
// status 1). Log lines carry file, date and time unless the CACHET_DEV
// environment variable is set, in which case no prefix flags are used.
func getLogger(logPath string) *log.Logger {
	out := os.Stdout

	if logPath != "" {
		file, err := os.Create(logPath)
		if err != nil {
			fmt.Printf("Unable to open file '%v' for logging\n", logPath)
			os.Exit(1)
		}
		out = file
	}

	if os.Getenv("CACHET_DEV") != "" {
		return log.New(out, "", 0)
	}
	return log.New(out, "", log.Llongfile|log.Ldate|log.Ltime)
}
// getConfiguration loads the monitor configuration from path.
//
// A path that parses as a request URI with a scheme is downloaded over HTTP;
// anything else is read as a local file. The JSON is decoded into a
// CachetMonitor whose Logger is created from the configured log path.
// An error is returned when the config cannot be downloaded (including a
// non-200 answer or an interrupted download), read or parsed.
func getConfiguration(path string) (*cachet.CachetMonitor, error) {
	var cfg cachet.CachetMonitor
	var data []byte

	// the local is not called `url` so the net/url package stays visible
	configURL, err := url.ParseRequestURI(path)
	if err == nil && len(configURL.Scheme) > 0 {
		// download config
		response, err := http.Get(path)
		if err != nil {
			return nil, errors.New("Cannot download network config: " + err.Error())
		}
		defer response.Body.Close()

		// an error page must not be handed to the JSON decoder
		if response.StatusCode != http.StatusOK {
			return nil, errors.New("Cannot download network config: unexpected response " + response.Status)
		}

		data, err = ioutil.ReadAll(response.Body)
		if err != nil {
			return nil, errors.New("Cannot download network config: " + err.Error())
		}

		fmt.Println("Downloaded network configuration.")
	} else {
		data, err = ioutil.ReadFile(path)
		if err != nil {
			return nil, errors.New("Config file '" + path + "' missing!")
		}
	}

	if err := json.Unmarshal(data, &cfg); err != nil {
		fmt.Println(err)
		return nil, errors.New("Cannot parse config!")
	}

	cfg.Logger = getLogger(cfg.LogPath)

	return &cfg, nil
}

65
config.go Normal file
View File

@@ -0,0 +1,65 @@
package cachet
import (
"errors"
"log"
"net"
"os"
)
// CachetMonitor is the top-level configuration of the monitoring tool as
// decoded from the JSON config.
//
// Logger is tagged `json:"-"` and must be supplied by the caller;
// ValidateConfiguration falls back to a stdout logger when it is nil.
// InsecureAPI disables TLS certificate verification for API requests.
type CachetMonitor struct {
Logger *log.Logger `json:"-"`
APIUrl string `json:"api_url"`
APIToken string `json:"api_token"`
SystemName string `json:"system_name"`
LogPath string `json:"log_path"`
InsecureAPI bool `json:"insecure_api"`
Monitors []*Monitor `json:"monitors"`
}
// ValidateConfiguration fills in defaults and checks the configuration.
//
// A missing Logger is replaced by a stdout logger and an empty SystemName by
// the host name. An error is returned when the API URL or token is missing,
// when no monitors are defined, or when any monitor fails its own validation.
func (cfg *CachetMonitor) ValidateConfiguration() error {
	if cfg.Logger == nil {
		defaultFlags := log.Llongfile | log.Ldate | log.Ltime
		cfg.Logger = log.New(os.Stdout, "", defaultFlags)
	}

	if cfg.SystemName == "" {
		// fall back to the host name
		cfg.SystemName = getHostname()
	}

	switch {
	case cfg.APIToken == "" || cfg.APIUrl == "":
		return errors.New("API URL or API Token not set. cachet-monitor won't be able to report incidents.\n\nPlease set:\n CACHET_API and CACHET_TOKEN environment variable to override settings.\n\nGet help at https://github.com/castawaylabs/cachet-monitor\n")
	case len(cfg.Monitors) == 0:
		return errors.New("No monitors defined!\nSee sample configuration: https://github.com/castawaylabs/cachet-monitor/blob/master/example.config.json\n")
	}

	for i := range cfg.Monitors {
		if err := cfg.Monitors[i].ValidateConfiguration(); err != nil {
			return err
		}
	}

	return nil
}
// getHostname returns an identifier for the current system.
//
// The OS host name is preferred. When it is unavailable or empty the first
// interface address is used, and "unknown" when there is none; the result is
// therefore never empty.
func getHostname() string {
	hostname, err := os.Hostname()
	if err == nil && len(hostname) > 0 {
		return hostname
	}

	addrs, err := net.InterfaceAddrs()
	if err != nil || len(addrs) == 0 {
		return "unknown"
	}

	return addrs[0].String()
}

View File

@@ -1,6 +1,7 @@
{
"api_url": "https://demo.cachethq.io/api/v1",
"api_token": "9yMHsdioQosnyVK4iCVR",
"interval": 5,
"monitors": [
{
"name": "nodegear frontend",

58
http.go Normal file
View File

@@ -0,0 +1,58 @@
package cachet
import (
"bytes"
"crypto/tls"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"strconv"
"time"
)
// makeRequest performs an authenticated JSON request against the Cachet API.
//
// requestType is the HTTP method, url the path appended to APIUrl and reqBody
// the (possibly nil) JSON payload. It returns the response together with the
// fully read body; the response body is already closed on return. An error is
// returned when the request cannot be built or sent, or when the body cannot
// be read; in that case the response is nil.
func (monitor *CachetMonitor) makeRequest(requestType string, url string, reqBody []byte) (*http.Response, []byte, error) {
	req, err := http.NewRequest(requestType, monitor.APIUrl+url, bytes.NewBuffer(reqBody))
	if err != nil {
		// req is nil here; setting headers on it would panic
		return nil, []byte{}, err
	}

	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("X-Cachet-Token", monitor.APIToken)

	client := &http.Client{}
	if monitor.InsecureAPI {
		client.Transport = &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		}
	}

	res, err := client.Do(req)
	if err != nil {
		return nil, []byte{}, err
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, []byte{}, err
	}

	return res, body, nil
}
// SendMetric posts one lag data point to the monitor's metric
// (POST /metrics/<id>/points).
//
// It is a no-op returning nil when no metric is configured (MetricID == 0).
// delay is sent as the point's "value". An error is returned when the request
// fails or the API answers with a status other than 200; the latter error
// names the status code.
func (monitor *Monitor) SendMetric(delay int64) error {
	if monitor.MetricID == 0 {
		return nil
	}

	// marshalling a map holding a single int64 cannot fail
	jsonBytes, _ := json.Marshal(&map[string]interface{}{
		"value": delay,
	})

	resp, _, err := monitor.config.makeRequest("POST", "/metrics/"+strconv.Itoa(monitor.MetricID)+"/points", jsonBytes)
	if err != nil {
		return fmt.Errorf("Could not log data point!\n%v\n", err)
	}
	if resp.StatusCode != 200 {
		return fmt.Errorf("Could not log data point!\nUnexpected response code: %d\n", resp.StatusCode)
	}

	return nil
}
// getMs returns the current Unix time in milliseconds.
func getMs() int64 {
	nanos := time.Now().UnixNano()
	perMilli := int64(time.Millisecond)
	return nanos / perMilli
}

114
incident.go Normal file
View File

@@ -0,0 +1,114 @@
package cachet
import (
"encoding/json"
"fmt"
"strconv"
)
// Incident is the Cachet incident payload sent by the monitor.
//
// Status is the numeric incident status set by the Set* methods
// (1 Investigating, 2 Identified, 3 Watching, 4 Fixed). ComponentStatus is
// computed by Send from Status and the component's current status.
type Incident struct {
	ID      int    `json:"id"`
	Name    string `json:"name"`
	Message string `json:"message"`
	Status  int    `json:"status"`
	// The tag needs the colon: `json"visible"` is not a valid key:"value"
	// pair, so encoding/json ignored it and emitted the field as "Visible".
	// NOTE(review): callers in this package never set Visible, so it is now
	// sent as "visible": 0 — confirm that is the intended visibility.
	Visible int  `json:"visible"`
	Notify  bool `json:"notify"`

	ComponentID     int `json:"component_id"`
	ComponentStatus int `json:"component_status"`
}
// Send creates the incident (POST /incidents) or, when it already has an ID,
// updates it (PUT /incidents/<id>).
//
// Before sending, ComponentStatus is derived from Status: for status 1-3 the
// component becomes 3 (partial outage), or 4 (major outage) when the API
// reports it is already at 3; for status 4 it becomes 1 (fixed). A failure to
// fetch the component status is logged and does not abort the send.
//
// An error is returned when the request fails, when the API answers with a
// status other than 200, or when the response cannot be parsed. The incident
// ID is only updated from a successful response, so a failed update cannot
// reset the ID of an existing incident.
func (incident *Incident) Send(cfg *CachetMonitor) error {
	switch incident.Status {
	case 1, 2, 3:
		// partial outage
		incident.ComponentStatus = 3

		componentStatus, err := incident.GetComponentStatus(cfg)
		if err != nil {
			cfg.Logger.Printf("cannot fetch component: %v", err)
		} else if componentStatus == 3 {
			// major outage
			incident.ComponentStatus = 4
		}
	case 4:
		// fixed
		incident.ComponentStatus = 1
	}

	requestType := "POST"
	requestURL := "/incidents"
	if incident.ID > 0 {
		requestType = "PUT"
		requestURL += "/" + strconv.Itoa(incident.ID)
	}

	jsonBytes, err := json.Marshal(incident)
	if err != nil {
		return err
	}

	resp, body, err := cfg.makeRequest(requestType, requestURL, jsonBytes)
	if err != nil {
		return err
	}

	// check the status first: error responses need not carry incident data
	if resp.StatusCode != 200 {
		return fmt.Errorf("Could not create/update incident! Received %d: %v", resp.StatusCode, string(body))
	}

	var data struct {
		Incident struct {
			ID int `json:"id"`
		} `json:"data"`
	}
	if err := json.Unmarshal(body, &data); err != nil {
		return fmt.Errorf("Cannot parse incident body: %v, %v", err, string(body))
	}

	incident.ID = data.Incident.ID
	return nil
}
// GetComponentStatus fetches the incident's component (GET /components/<id>)
// and returns its "status" value. It returns 0 and an error when the request
// fails, the API answers with a status other than 200, or the body cannot be
// parsed.
func (incident *Incident) GetComponentStatus(cfg *CachetMonitor) (int, error) {
	endpoint := "/components/" + strconv.Itoa(incident.ComponentID)

	resp, body, err := cfg.makeRequest("GET", endpoint, nil)
	switch {
	case err != nil:
		return 0, err
	case resp.StatusCode != 200:
		return 0, fmt.Errorf("Invalid status code. Received %d", resp.StatusCode)
	}

	var payload struct {
		Component struct {
			Status int `json:"status"`
		} `json:"data"`
	}
	if parseErr := json.Unmarshal(body, &payload); parseErr != nil {
		return 0, fmt.Errorf("Cannot parse component body: %v. Err = %v", string(body), parseErr)
	}

	return payload.Component.Status, nil
}
// SetInvestigating marks the incident as Investigating (status 1).
func (incident *Incident) SetInvestigating() {
	const investigating = 1
	incident.Status = investigating
}

// SetIdentified marks the incident as Identified (status 2).
func (incident *Incident) SetIdentified() {
	const identified = 2
	incident.Status = identified
}

// SetWatching marks the incident as Watching (status 3).
func (incident *Incident) SetWatching() {
	const watching = 3
	incident.Status = watching
}

// SetFixed marks the incident as Fixed (status 4).
func (incident *Incident) SetFixed() {
	const fixed = 4
	incident.Status = fixed
}

29
main.go
View File

@@ -1,29 +0,0 @@
package main
import (
"github.com/castawaylabs/cachet-monitor/cachet"
"time"
)
// main prints the configured monitors and then, once per second, runs every
// monitor in its own goroutine. It never returns.
func main() {
	cfg := cachet.Config
	logger := cachet.Logger

	logger.Printf("System: %s, API: %s\n", cfg.SystemName, cfg.APIUrl)
	logger.Printf("Starting %d monitors:\n", len(cfg.Monitors))
	for _, m := range cfg.Monitors {
		logger.Printf(" %s: GET %s & Expect HTTP %d\n", m.Name, m.URL, m.ExpectedStatusCode)
		if m.MetricID > 0 {
			logger.Printf(" - Logs lag to metric id: %d\n", m.MetricID)
		}
	}
	logger.Println()

	// one tick per second; each tick launches all checks concurrently
	tick := time.NewTicker(time.Second)
	for range tick.C {
		for _, m := range cfg.Monitors {
			go m.Run()
		}
	}
}

235
monitor.go Normal file
View File

@@ -0,0 +1,235 @@
package cachet
import (
"crypto/tls"
"errors"
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strconv"
"strings"
"sync"
"time"
)
// HttpTimeout is the one-second limit given to the http.Client that performs
// the monitor checks (see doRequest).
const HttpTimeout = time.Duration(time.Second)
// DefaultInterval is the check interval, in seconds, that
// ValidateConfiguration applies when a monitor's interval is below 1.
const DefaultInterval = 60
// DefaultTimeFormat is the time layout used for timestamps in incident
// messages.
const DefaultTimeFormat = "15:04:05 Jan 2 MST"
// Monitor is the configuration and runtime state of a single URL check.
//
// The exported fields are decoded from the JSON config; the unexported ones
// are runtime state managed by Start, Tick, AnalyseData and
// ValidateConfiguration.
type Monitor struct {
Name string `json:"name"`
URL string `json:"url"`
Method string `json:"method"`
// NOTE(review): doRequest skips certificate verification whenever this is
// false, which is also the value when "strict_tls" is omitted — confirm
// that insecure-by-default is intended.
StrictTLS bool `json:"strict_tls"`
// CheckInterval is a number of seconds; Start multiplies it by time.Second.
CheckInterval time.Duration `json:"interval"`
MetricID int `json:"metric_id"`
ComponentID int `json:"component_id"`
// Threshold = percentage
Threshold float32 `json:"threshold"`
ExpectedStatusCode int `json:"expected_status_code"`
// compiled to Regexp
ExpectedBody string `json:"expected_body"`
// bodyRegexp is ExpectedBody compiled by ValidateConfiguration.
bodyRegexp *regexp.Regexp
// history holds the most recent check results (true = up), at most 10.
history []bool
// lastFailReason describes the latest failed check.
lastFailReason string
// incident is the incident currently open for this monitor, if any.
incident *Incident
// config is the owning configuration, set by Start.
config *CachetMonitor
// Closed when mon.Stop() is called
stopC chan bool
}
// Start runs the monitor until Stop is called.
//
// It registers itself on wg, performs one check immediately and then one
// check every CheckInterval seconds. On return the ticker is released and
// wg.Done is called. A CheckInterval below 1 (ValidateConfiguration not
// called) falls back to DefaultInterval, because time.NewTicker panics on a
// non-positive duration.
//
// NOTE(review): wg.Add happens inside Start, so a caller running Start in a
// goroutine may reach wg.Wait before the Add — confirm against the caller.
func (mon *Monitor) Start(cfg *CachetMonitor, wg *sync.WaitGroup) {
	wg.Add(1)
	defer wg.Done()

	mon.config = cfg
	mon.stopC = make(chan bool)
	mon.Tick()

	interval := mon.CheckInterval
	if interval < 1 {
		interval = DefaultInterval
	}

	ticker := time.NewTicker(interval * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			mon.Tick()
		case <-mon.stopC:
			return
		}
	}
}
// Stop signals the monitor loop started by Start to finish by closing the
// stop channel. It does nothing when the monitor was never started (closing a
// nil channel would panic) or when it is already stopped.
func (monitor *Monitor) Stop() {
	if monitor.stopC == nil || monitor.Stopped() {
		return
	}

	close(monitor.stopC)
}
// Stopped reports whether the stop channel has been closed. It never blocks:
// an open (or nil) channel yields false.
func (monitor *Monitor) Stopped() bool {
	stopped := false
	select {
	case <-monitor.stopC:
		stopped = true
	default:
	}
	return stopped
}
// Tick performs one check: it probes the URL, records the result in the
// history (capped at the 10 most recent results), re-evaluates the monitor
// state and, when the check succeeded and a metric is configured, reports the
// measured lag in milliseconds. A failure to report the metric is logged.
func (monitor *Monitor) Tick() {
	reqStart := getMs()
	isUp := monitor.doRequest()
	lag := getMs() - reqStart

	// the append below brings the history to 10 entries for the first time
	if len(monitor.history) == 9 {
		monitor.config.Logger.Printf("%v is now saturated\n", monitor.Name)
	}
	// keep the 9 newest results so that the append yields exactly 10
	if len(monitor.history) >= 10 {
		monitor.history = monitor.history[len(monitor.history)-9:]
	}
	monitor.history = append(monitor.history, isUp)

	monitor.AnalyseData()

	if isUp && monitor.MetricID > 0 {
		if err := monitor.SendMetric(lag); err != nil {
			monitor.config.Logger.Printf("%v", err)
		}
	}
}
// doRequest probes the monitor URL with the configured HTTP method and
// reports whether the response satisfies the expectations.
//
// The check fails when the request cannot be built or sent, when an expected
// status code is configured and does not match, or when a body expression is
// configured and the body does not match it. The reason of a failure is
// stored in lastFailReason. Certificate verification is skipped when
// StrictTLS is false. An empty Method is treated as GET by net/http.
func (monitor *Monitor) doRequest() bool {
	client := &http.Client{
		Timeout: HttpTimeout,
	}
	if !monitor.StrictTLS {
		client.Transport = &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		}
	}

	// honour the configured method instead of always issuing a GET
	req, err := http.NewRequest(monitor.Method, monitor.URL, nil)
	if err != nil {
		monitor.lastFailReason = err.Error()
		return false
	}

	resp, err := client.Do(req)
	if err != nil {
		monitor.lastFailReason = err.Error()
		return false
	}
	defer resp.Body.Close()

	if monitor.ExpectedStatusCode > 0 && resp.StatusCode != monitor.ExpectedStatusCode {
		monitor.lastFailReason = "Unexpected response code: " + strconv.Itoa(resp.StatusCode) + ". Expected " + strconv.Itoa(monitor.ExpectedStatusCode)
		return false
	}

	if monitor.bodyRegexp != nil {
		// check body
		responseBody, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			monitor.lastFailReason = err.Error()
			return false
		}

		match := monitor.bodyRegexp.Match(responseBody)
		if !match {
			monitor.lastFailReason = "Unexpected body: " + string(responseBody) + ". Expected to match " + monitor.ExpectedBody
		}
		return match
	}

	return true
}
// AnalyseData decides if the monitor is statistically up or down and creates
// or resolves an incident accordingly.
//
// The share of failed checks in the history is compared with Threshold once
// the history holds exactly 10 results. Above the threshold with no open
// incident an "Investigating" incident is created; below the threshold with
// an open incident that incident is marked "Fixed", sent and cleared.
// Failures to send an incident are logged in both cases.
func (monitor *Monitor) AnalyseData() {
	// look at the past few incidents
	numDown := 0
	for _, wasUp := range monitor.history {
		if !wasUp {
			numDown++
		}
	}

	t := (float32(numDown) / float32(len(monitor.history))) * 100
	monitor.config.Logger.Printf("%s %.2f%%/%.2f%% down at %v\n", monitor.Name, t, monitor.Threshold, time.Now().UnixNano()/int64(time.Second))

	if len(monitor.history) != 10 {
		// not saturated
		return
	}

	if t > monitor.Threshold && monitor.incident == nil {
		monitor.incident = &Incident{
			Name:        monitor.Name + " - " + monitor.config.SystemName,
			ComponentID: monitor.ComponentID,
			Message:     monitor.Name + " check **failed** - " + time.Now().Format(DefaultTimeFormat),
			Notify:      true,
		}

		if len(monitor.lastFailReason) > 0 {
			monitor.incident.Message += "\n\n `" + monitor.lastFailReason + "`"
		}

		// is down, create an incident
		monitor.config.Logger.Printf("%v creating incident. Monitor is down: %v", monitor.Name, monitor.lastFailReason)
		// set investigating status
		monitor.incident.SetInvestigating()
		// create/update incident
		if err := monitor.incident.Send(monitor.config); err != nil {
			monitor.config.Logger.Printf("Error sending incident: %v\n", err)
		}
	} else if t < monitor.Threshold && monitor.incident != nil {
		// was down, created an incident, its now ok, make it resolved.
		monitor.config.Logger.Printf("%v resolved downtime incident", monitor.Name)

		// resolve incident: the resolution note is put in front of the original message
		monitor.incident.Message = "\n**Resolved** - " + time.Now().Format(DefaultTimeFormat) + "\n\n - - - \n\n" + monitor.incident.Message
		monitor.incident.SetFixed()
		if err := monitor.incident.Send(monitor.config); err != nil {
			monitor.config.Logger.Printf("Error sending incident: %v\n", err)
		}

		monitor.lastFailReason = ""
		monitor.incident = nil
	}
}
// ValidateConfiguration checks the monitor settings and fills in defaults.
//
// ExpectedBody, when set, is compiled into bodyRegexp. The interval defaults
// to DefaultInterval when below 1, the method is upper-cased and defaults to
// GET, and a threshold of zero or below becomes 100. An error is returned
// when the body expression does not compile, when neither an expected body
// nor an expected status code is given, when the method is unsupported, or
// when neither component_id nor metric_id is set.
func (monitor *Monitor) ValidateConfiguration() error {
	hasBodyCheck := len(monitor.ExpectedBody) > 0
	if hasBodyCheck {
		compiled, err := regexp.Compile(monitor.ExpectedBody)
		if err != nil {
			return err
		}
		monitor.bodyRegexp = compiled
	}

	if !hasBodyCheck && monitor.ExpectedStatusCode == 0 {
		return errors.New("Nothing to check, both 'expected_body' and 'expected_status_code' fields empty")
	}

	if monitor.CheckInterval < 1 {
		monitor.CheckInterval = DefaultInterval
	}

	// the upper-cased method is stored before it is validated
	monitor.Method = strings.ToUpper(monitor.Method)
	switch {
	case monitor.Method == "":
		monitor.Method = "GET"
	case monitor.Method != "GET" &&
		monitor.Method != "POST" &&
		monitor.Method != "DELETE" &&
		monitor.Method != "OPTIONS" &&
		monitor.Method != "HEAD":
		return fmt.Errorf("Unsupported check method: %v", monitor.Method)
	}

	if monitor.ComponentID == 0 && monitor.MetricID == 0 {
		return errors.New("component_id & metric_id are unset")
	}

	if monitor.Threshold <= 0 {
		monitor.Threshold = 100
	}

	return nil
}

121
readme.md
View File

@@ -1,78 +1,63 @@
Cachet Monitor plugin
=====================
This is a monitoring plugin for CachetHQ.
![screenshot](https://castawaylabs.github.io/cachet-monitor/screenshot.png)
Features
--------
- [x] Creates & Resolves Incidents
- [x] Posts monitor lag every second
- [x] Posts monitor lag to cachet graphs
- [x] Updates Component to Partial Outage
- [x] Updates Component to Major Outage if in Partial Outage
- [x] Updates Component to Major Outage if already in Partial Outage (works well with distributed monitoring)
- [x] Can be run on multiple servers and geo regions
Docker Quickstart
-----------------
1. Create a configuration json
2.
```
docker run -d \
--name cachet-monitor \
-h cachet-monitor \
-v `pwd`/config.json:/etc/cachet-monitor.config.json \
castawaylabs/cachet-monitor
```
Configuration
-------------
```
{
"api_url": "https://demo.cachethq.io/api/v1",
"api_token": "9yMHsdioQosnyVK4iCVR",
"monitors": [
{
"name": "nodegear frontend",
"url": "https://nodegear.io/ping",
"metric_id": 0,
"component_id": 0,
"threshold": 80,
"component_id": null,
"expected_status_code": 200,
"strict_tls": true
}
],
"insecure_api": false
// URL for the API. Note: Must end with /api/v1
"api_url": "https://<cachet domain>/api/v1",
// Your API token for Cachet
"api_token": "<cachet api token>",
// optional, false default, set if your certificate is self-signed/untrusted
"insecure_api": false,
"monitors": [{
// required, friendly name for your monitor
"name": "Name of your monitor",
// required, url to probe
"url": "Ping URL",
// optional, http method (defaults GET)
"method": "get",
// optional, set to false to skip TLS certificate verification (e.g. self-signed ssl certificate)
"strict_tls": true,
// seconds between checks
"interval": 10,
// post lag to cachet metric (graph)
// note: either metric ID or component ID is required
"metric_id": <metric id>,
// post incidents to this component
"component_id": <component id>,
// If % of downtime is over this threshold, open an incident
"threshold": 80,
// optional, expected status code (either status code or body must be supplied)
"expected_status_code": 200,
// optional, regular expression to match body content
"expected_body": "P.*NG"
}],
// optional, system name to identify bot (uses hostname by default)
"system_name": "",
// optional, defaults to stdout
"log_path": ""
}
```
*Notes:*
Installation
------------
- `metric_id` is optional
- `insecure_api` if true it will ignore HTTPS certificate errors (eg if self-signed)
- `strict_tls` if false (true is default) it will ignore HTTPS certificate errors (eg if monitor uses self-signed certificate)
- `component_id` is optional
- `threshold` is a percentage
- `expected_status_code` is a http response code
- GET request will be performed on the `url`
1. Download binary from [release page](https://github.com/CastawayLabs/cachet-monitor/releases)
2. Create your configuration ([example](https://raw.githubusercontent.com/CastawayLabs/cachet-monitor/master/example.config.json))
3. `cachet-monitor -c /etc/cachet-monitor.config.json`
How to run
----------
Example:
1. Set up [Go](https://golang.org)
2. `go install github.com/castawaylabs/cachet-monitor`
3. `cachet-monitor -c https://raw.githubusercontent.com/CastawayLabs/cachet-monitor/master/example.config.json`
Production:
1. Download the example config and save to `/etc/cachet-monitor.config.json`
2. Run in background: `nohup cachet-monitor > /var/log/cachet-monitor.log 2>&1 &`
pro tip: run in background using `nohup cachet-monitor > /var/log/cachet-monitor.log 2>&1 &`
```
Usage of cachet-monitor:
@@ -84,7 +69,23 @@ Usage of cachet-monitor:
Environment variables
---------------------
| Name | Example Value | Description |
| ------------ | --------------------------- | --------------------------- |
| CACHET_API | http://demo.cachethq.io/api | URL endpoint for cachet api |
| CACHET_TOKEN | randomvalue | API Authentication token |
| Name | Example Value | Description |
| ------------ | ------------------------------ | --------------------------- |
| CACHET_API | http://demo.cachethq.io/api/v1 | URL endpoint for cachet api |
| CACHET_TOKEN | APIToken123 | API Authentication token |
| CACHET_DEV | 1 | Strips logging |
Vision and goals
----------------
We made this tool because we felt the need to have our own monitoring software (built on top of Cachet).
The idea is a stateless program which collects data and pushes it to a central cachet instance.
This gives us power to have an army of geographically distributed loggers and reveal issues in both latency & downtime on client websites.
Package usage
-------------
When using `cachet-monitor` as a package in another program, you should follow what `cli/main.go` does. It is important to call `ValidateConfiguration` on `CachetMonitor` and all the monitors inside.
[API Documentation](https://godoc.org/github.com/CastawayLabs/cachet-monitor)

View File

@@ -1,24 +0,0 @@
package system
import (
"net"
"os"
)
// GetHostname returns an identifier for the current system.
//
// The OS host name is preferred. When it is unavailable or empty the first
// interface address is used, and "unknown" when there is none; the result is
// therefore never empty.
func GetHostname() string {
	hostname, err := os.Hostname()
	if err == nil && len(hostname) > 0 {
		return hostname
	}

	addrs, err := net.InterfaceAddrs()
	if err != nil || len(addrs) == 0 {
		return "unknown"
	}

	return addrs[0].String()
}