一、下载相关部署环境安装包
go
prometheus
grafana-7.5.4-1
node_exporter
alertmanager
process_exporter
二、部署go环境
[root@lzy prom]# tar zxvf go1.10.3.linux-amd64.tar.gz
[root@lzy prom]# mv go /usr/local/go
[root@lzy prom]# vim /etc/profile
## 末尾添加
export GO_HOME=/usr/local/go
export PATH=$GO_HOME/bin:$PATH
[root@lzy prom]# source /etc/profile
[root@lzy prom]# go version
go version go1.10.3 linux/amd64
三、部署Prometheus
[root@lzy prom]# tar zxvf prometheus-2.5.0.linux-amd64.tar.gz
[root@lzy prom]# mv prometheus-2.5.0.linux-amd64 /usr/local/prometheus
[root@lzy prom]# cd /usr/local/prometheus/
设置开机自启动
[root@lzy prometheus]# vim /usr/lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml
[Install]
WantedBy=multi-user.target
[root@lzy prometheus]# systemctl daemon-reload
[root@lzy prometheus]# systemctl start prometheus.service
[root@lzy prometheus]# systemctl enable prometheus.service
浏览器访问:http://192.168.1.203:9090
四、部署node_exporter监控主机信息
[root@lzy prom]# mkdir /usr/local/prometheus/exporter  # 方便存放插件
[root@lzy prom]# tar zxvf node_exporter-0.17.0.linux-amd64.tar.gz
[root@lzy prom]# mv node_exporter-0.17.0.linux-amd64 /usr/local/prometheus/exporter/node_exporter
[root@lzy prom]# cd /usr/local/prometheus/exporter/node_exporter/
设置开机自启动
[root@lzy node_exporter]# vim /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/prometheus/exporter/node_exporter/node_exporter
[Install]
WantedBy=multi-user.target
[root@lzy node_exporter]# systemctl daemon-reload
[root@lzy node_exporter]# systemctl start node_exporter.service
[root@lzy node_exporter]# systemctl enable node_exporter.service
五、部署process_exporter监控服务进程
[root@lzy prom]# wget https://github.com/ncabatoff/process-exporter/releases/download/v0.7.10/process-exporter-0.7.10.linux-amd64.tar.gz
[root@lzy prom]# tar -xf process-exporter-0.7.10.linux-amd64.tar.gz
[root@lzy prom]# mv process-exporter-0.7.10.linux-amd64 /usr/local/prometheus/exporter/process_exporter
[root@lzy prom]# cd /usr/local/prometheus/exporter/process_exporter
修改配置文件
[root@lzy process_exporter]# vim process-conf.yaml
## 添加
process_names:
- name: "{{.Matches}}"
cmdline:
- 'prometheus'  # 进程名
- name: "{{.Matches}}"
cmdline:
- 'etcd'  # 进程名
- name: "{{.Matches}}"
cmdline:
- '.+'  # 监控所有进程
设置开机自启动
[root@lzy process_exporter]# vim /usr/lib/systemd/system/process_exporter.service
[Unit]
Description=Prometheus exporter for processors metrics, written in Go with pluggable metric collectors.
Documentation=https://github.com/ncabatoff/process-exporter
After=network.target
[Service]
Type=simple
WorkingDirectory=/usr/local/prometheus/exporter/process_exporter
ExecStart=/usr/local/prometheus/exporter/process_exporter/process-exporter -config.path=/usr/local/prometheus/exporter/process_exporter/process-conf.yaml
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@lzy process-exporter]# systemctl daemon-reload
[root@lzy process-exporter]# systemctl start process_exporter.service
[root@lzy process-exporter]# systemctl enable process_exporter.service
至此,node_exporter 和 process_exporter 均已启动并设置了开机自启,同时已配置对 etcd 进程的监控。
六、Prometheus配置文件添加exporter
[root@lzy prometheus]# vim prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- 127.0.0.1:9093
rule_files:
- "rules/*.yml"
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['192.168.1.203:9090']
- job_name: 'node-exporter'
static_configs:
- targets: ['192.168.1.203:9100']
- job_name: 'process-exporter'
static_configs:
- targets: ['192.168.1.203:9256']
labels:
instance: 192.168.1.203-process
重启Prometheus
[root@lzy prometheus]# systemctl restart prometheus.service
浏览器访问:http://192.168.1.203:9090/targets
七、部署Grafana
[root@lzy prom]# rpm -ivh grafana-7.5.4-1.x86_64.rpm
[root@lzy prom]# systemctl enable grafana-server.service
[root@lzy prom]# systemctl start grafana-server.service
浏览器访问:http://192.168.1.203:3000/login
默认用户名/密码:admin/admin,首次登录时会要求修改密码
八、添加数据源
流程
Configuration—》Add data source—》Prometheus—》(一般情况填写完Name和URL即可了)—》往下拉点击Save&Test
九、添加监控模板
1、主机基础监控
2、服务进程监控
流程
加号—》Import—》填写仪表盘模板ID(可以去Grafana官网挑选合适的模板)—》Load—》修改Name—》选择prometheus源—》Import
十、编写报警规则
[root@lzy ~]# cd /usr/local/prometheus/
[root@lzy prometheus]# mkdir rules
[root@lzy prometheus]# cd rules/
node_exporter规则
[root@lzy rules]# vim node_exporter.yml
groups:
- name: host-monitoring
rules:
- alert: hostsDown
expr: up == 0
for: 1m
annotations:
summary: "主机:{{ $labels.hostname }},{{ $labels.instance }} 关机"
- alert: 内存使用率
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 1m
labels:
severity: warning
annotations:
summary: "内存使用率>90%"
description: "主机:{{ $labels.hostname }},{{ $labels.instance }},当前值:{{ humanize $value }}"
- alert: CPU使用率
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 90
for: 1m
labels:
severity: warning
annotations:
summary: "CPU使用率>90%"
description: "主机:{{ $labels.hostname }},{{ $labels.instance }},当前值:{{ humanize $value }}"
- alert: 磁盘使用率
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 1m
labels:
severity: warning
annotations:
summary: "磁盘使用率>90%"
description: "主机:{{ $labels.hostname }},{{ $labels.instance }},当前值:{{ humanize $value }}"
process_exporter规则
[root@lzy rules]# vim process_exporter.yml
groups:
- name: Server-monitoring
rules:
- alert: etcd
expr: (namedprocess_namegroup_num_procs{groupname="map[:etcd]"}) == 0 ## map[:一定要写规定的进程名]
for: 30s
labels:
severity: error
annotations:
summary: "{{ $labels.instance }}: etcd进程服务挂了,已经超过30秒"
value: "{{ $value }}"
重启prometheus
[root@lzy prometheus]# systemctl restart prometheus.service
浏览器访问:http://192.168.1.203:9090/alerts
十一、部署alertmanager邮件报警
[root@lzy prom]# tar zxvf alertmanager-0.15.3.linux-amd64.tar.gz
[root@lzy prom]# mv alertmanager-0.15.3.linux-amd64 /usr/local/alertmanager
[root@lzy prom]# cd /usr/local/alertmanager/
[root@lzy alertmanager]# vim alertmanager.yml
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.163.com:465'
smtp_from: 'lzy@163.com'
smtp_auth_username: 'lzy@163.com'
smtp_auth_password: 'XXXXXXXXXXXXX'  # 授权码,需登录网页版163邮箱获取
smtp_require_tls: false
templates:
- 'template/*.tmpl'
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 1m
receiver: 'email'
receivers:
- name: 'email'
email_configs:
- to: 'lzy@163.com'
html: '{{ template "email.html" . }}'  # 自定义警报模板,名称要和模板文件第一行 define 的名称对应
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
[root@lzy alertmanager]# mkdir template
[root@lzy alertmanager]# cd template/
编写警报模板
[root@lzy template]# vim main.tmpl
{{ define "email.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
**********告警通知********** <br>
告警类型: {{ $alert.Labels.alertname }} <br>
告警级别: {{ $alert.Labels.severity }} <br>
{{- end }}
===================== <br>
告警主题: {{ $alert.Annotations.summary }} <br>
告警详情: {{ $alert.Annotations.description }} <br>
故障时间: {{ $alert.StartsAt.Local }} <br>
{{ if gt (len $alert.Labels.instance) 0 -}}
故障实例: {{ $alert.Labels.instance }} <br>
{{- end -}}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 -}}
**********恢复通知********** <br>
告警类型: {{ $alert.Labels.alertname }} <br>
告警级别: {{ $alert.Labels.severity }} <br>
{{- end }}
===================== <br>
告警主题: {{ $alert.Annotations.summary }} <br>
告警详情: {{ $alert.Annotations.description }} <br>
故障时间: {{ $alert.StartsAt.Local }} <br>
恢复时间: {{ $alert.EndsAt.Local }} <br>
{{ if gt (len $alert.Labels.instance) 0 -}}
故障实例: {{ $alert.Labels.instance }} <br>
{{- end -}}
{{- end }}
{{- end }}
{{- end }}
设置开机自启动
[root@lzy alertmanager]# vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@lzy alertmanager]# systemctl daemon-reload
[root@lzy alertmanager]# systemctl enable alertmanager.service
[root@lzy alertmanager]# systemctl start alertmanager.service
浏览器访问:http://192.168.1.203:9093/#/status
十二、测试邮件警报
将内存使用率的告警阈值临时调整为10%(即可用内存低于90%时触发)进行测试
[root@lzy prometheus]# vim rules/node_exporter.yml
- alert: 内存使用率
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 90
for: 1m
labels:
severity: warning
annotations:
summary: "内存使用率>10%"
description: "主机:{{ $labels.hostname }},{{ $labels.instance }},当前值:{{ humanize $value }}"
重启prometheus
[root@lzy prometheus]# systemctl restart prometheus.service
浏览器访问:http://192.168.1.203:9090/alerts
告警通知
恢复通知
将内存使用率告警阈值恢复为正常值(90%)
停掉etcd程序进行测试
服务挂掉通知
恢复运行通知
将服务正常启动