安装grafana
安装prometheus
安装node_exporter
安装alertmanager
配置微信消息应用(参考我之前写的文章)
只贴出必要的配置文件,不再赘述安装过程
prometheus的配置
/etc/prometheus/prometheus.yml
# my global config
global:
  scrape_interval: 15s  # By default, scrape targets every 15 seconds.
  evaluation_interval: 15s  # By default, evaluate rules every 15 seconds.
  # Labels added to time series and alerts when talking to external systems
  # (federation, remote storage, Alertmanager).
  external_labels:
    monitor: m8_mhxzx_manager

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
  - "/etc/prometheus/rules.d/*.rule"

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    scrape_interval: 60s  # overrides the global 15s for this job
    static_configs:
      - targets: ['localhost:9090']
  # pay attention to the newline below
  # Targets for this job are read from the file-based service discovery
  # file listed under file_sd_configs.
  - job_name: cluster-scrape-targets
    scrape_interval: 60s
    # Keep labels already present on scraped data when they conflict with
    # labels Prometheus would attach.
    honor_labels: true
    file_sd_configs:
      - files: ['/etc/prometheus/cluster.json']
cluster.json
[
  {
    "labels": {
      "app": "test",
      "cluster": "China",
      "name": "localhost",
      "region": "Beijing"
    },
    "targets": [
      "127.0.0.1:9100"
    ]
  }
]
grafana图形配置
{
  "alert": {
    "alertRuleTags": {},
    "conditions": [
      {
        "evaluator": {
          "params": [
            0.5
          ],
          "type": "gt"
        },
        "operator": {
          "type": "and"
        },
        "query": {
          "params": [
            "A",
            "5m",
            "now"
          ]
        },
        "reducer": {
          "params": [],
          "type": "max"
        },
        "type": "query"
      }
    ],
    "executionErrorState": "alerting",
    "for": "0",
    "frequency": "30s",
    "handler": 1,
    "message": "负载异常",
    "name": "load alert",
    "noDataState": "no_data",
    "notifications": []
  },
  "aliasColors": {},
  "bars": false,
  "dashLength": 10,
  "dashes": false,
  "fieldConfig": {
    "defaults": {
      "custom": {}
    },
    "overrides": []
  },
  "fill": 1,
  "fillGradient": 0,
  "gridPos": {
    "h": 9,
    "w": 12,
    "x": 0,
    "y": 0
  },
  "hiddenSeries": false,
  "id": 2,
  "legend": {
    "avg": false,
    "current": false,
    "max": false,
    "min": false,
    "show": true,
    "total": false,
    "values": false
  },
  "lines": true,
  "linewidth": 1,
  "nullPointMode": "null",
  "percentage": false,
  "pluginVersion": "7.1.1",
  "pointradius": 2,
  "points": false,
  "renderer": "flot",
  "seriesOverrides": [],
  "spaceLength": 10,
  "stack": false,
  "steppedLine": false,
  "targets": [
    {
      "expr": "node_load1",
      "interval": "",
      "legendFormat": "",
      "refId": "A"
    }
  ],
  "thresholds": [
    {
      "colorMode": "critical",
      "fill": true,
      "line": true,
      "op": "gt",
      "value": 0.5
    }
  ],
  "timeFrom": null,
  "timeRegions": [],
  "timeShift": null,
  "title": "load",
  "tooltip": {
    "shared": true,
    "sort": 0,
    "value_type": "individual"
  },
  "type": "graph",
  "xaxis": {
    "buckets": null,
    "mode": "time",
    "name": null,
    "show": true,
    "values": []
  },
  "yaxes": [
    {
      "format": "short",
      "label": null,
      "logBase": 1,
      "max": null,
      "min": null,
      "show": true
    },
    {
      "format": "short",
      "label": null,
      "logBase": 1,
      "max": null,
      "min": null,
      "show": true
    }
  ],
  "yaxis": {
    "align": false,
    "alignLevel": null
  },
  "datasource": null
}
消息就是负载异常
alert channel
alertmanager的配置
/etc/alertmanager/alertmanager.yml
global:
  # Time after which an alert is treated as resolved if it has not been
  # updated (used when the alert carries no end time).
  resolve_timeout: 5m

# Custom notification templates; this file defines the WeChat message body.
templates:
  - '/etc/alertmanager/wechat.tmpl'

route:
  group_by: ['alertname']
  group_wait: 5s        # initial wait before the first notification of a group
  group_interval: 1m    # wait before notifying about new alerts in a group
  repeat_interval: 12h  # wait before re-sending an already-sent notification
  receiver: 'wechat'

receivers:
  - name: 'wechat'
    wechat_configs:
      # corp_id / api_secret are placeholders; substitute real credentials
      # and keep them out of version control.
      - corp_id: 'xxxxxx'
        agent_id: '1000002'
        api_secret: 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        to_user: '@all'
        send_resolved: true  # also notify when alerts are resolved
/etc/alertmanager/wechat.tmpl
{{/* Firing-alert body: for every alert in .Alerts, prints the summary annotation, the cluster and name labels, the description annotation and the start time. The "2006-01-02 15:04:05" string is Go's reference-time layout (YYYY-MM-DD HH:MM:SS). */}}
{{ define "grafana_alert.default.message" }}
{{ range .Alerts }}
故障主题: {{ .Annotations.summary }}
故障大区: {{ .Labels.cluster }}
故障主机: {{ .Labels.name }}
故障详情: {{ .Annotations.description }}
报警时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
{{ end }}
{{/* Recovery body: for every alert in .Alerts, prints the summary annotation, the cluster and name labels and the end time. */}}
{{ define "grafana_recovery.default.message" }}
{{ range .Alerts }}
故障恢复: {{ .Annotations.summary }}
故障大区: {{ .Labels.cluster }}
故障主机: {{ .Labels.name }}
恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
{{ end }}
{{/* Entry template: renders the firing body when .Status is "firing" and the recovery body when it is "resolved". NOTE(review): presumably this name overrides the template used by Alertmanager's default wechat_config message; confirm against the Alertmanager version in use. */}}
{{ define "wechat.default.message" }}
{{ if eq .Status "firing"}}
[Warning]:
{{ template "grafana_alert.default.message" . }}
{{ end }}
{{ if eq .Status "resolved" }}
[Recovery]:
{{ template "grafana_recovery.default.message" . }}{{ end }}
{{ end }}
测试
增加负载
dd if=/dev/zero of=/dev/null
微信消息