1.prometheus安装
1.1 prometheus安装
# Extract the Prometheus 2.35.0 release archive into ./app/.
# NOTE(review): "sotf" looks like a typo for "soft" — confirm the source directory name.
# NOTE(review): later steps reference /data/app/prometheus-2.35.0.linux-amd64, so this
# is presumably run from /data — confirm.
tar -xzvf sotf/prometheus-2.35.0.linux-amd64.tar.gz -C ./app/
1.2 prometheus.yml配置文件指标说明
global:全局配置
alerting:告警配置
rule_files:告警规则
scrape_configs:配置数据源,称为target,每个target用job_name命名。又分为静态配置和服务发现
官方文档: https://prometheus.io/docs/prometheus/latest/configuration/configuration/
Prometheus.yml 配置案例
# prometheus.yml — example configuration.
# No global overrides are set, so Prometheus' built-in defaults apply.
global: {}

# Alertmanager instance(s) that receive fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "127.0.0.1:9093"

# Alerting rule files.
rule_files:
  - "/data/app/prometheus-2.35.0.linux-amd64/conf/rule/*.yml"

scrape_configs:
  # Prometheus scrapes its own metrics.
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  # node_exporter host metrics; targets are read from file-based service discovery.
  - job_name: "node-export"
    file_sd_configs:
      - files:
          - "/data/app/prometheus-2.35.0.linux-amd64/conf/node-exporter/*.yml"

  # Black-box monitoring: TCP probes.
  - job_name: "blackbox_tcp"
    scrape_interval: 10s
    metrics_path: /probe
    params:
      module: [tcp_connect]
    file_sd_configs:
      - files:
          - "/data/app/prometheus-2.35.0.linux-amd64/conf/blackbox-exporter-tcp/*.yml"
    relabel_configs:
      # Hand the discovered target address to the exporter as the ?target= parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Scrape the blackbox_exporter itself instead of the probed target.
      - target_label: __address__
        replacement: "192.168.220.202:9115"
报警规则如下:
# Write the host alerting rules read by Prometheus via rule_files.
# The heredoc delimiter is quoted ('EOF') so the shell leaves $labels / $value
# untouched; with an unquoted delimiter they would be expanded to empty strings.
# NOTE(review): the middle of several description strings was lost in the source
# export and has been reconstructed — confirm the wording.
mkdir -p /data/app/prometheus-2.35.0.linux-amd64/conf/rule
cat >/data/app/prometheus-2.35.0.linux-amd64/conf/rule/node_exporter.yml <<'EOF'
groups:
  - name: 主机告警
    rules:
      # CPU: non-idle share per instance over the last 5 minutes.
      - alert: CPU使用率过高
        expr: round((1 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance, env, name, group)) * 100, 0.1) > 85
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          description: "{{ $labels.instance }} CPU使用大于85%(目前使用:{{ $value }}%)"

      # Memory: 1 - available/total, skipping series whose name label matches tidb.*.
      # Memory series carry no mountpoint label, so the instance label is reported.
      - alert: 内存使用率过高
        expr: round((1 - (node_memory_MemAvailable_bytes{name!~"tidb.*"} / (node_memory_MemTotal_bytes{name!~"tidb.*"}))) * 100, 0.1) > 85
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          description: "{{ $labels.instance }} 内存使用大于85%(目前使用:{{ $value }}%)"

      # Disk usage per ext4/xfs mount point.
      # NOTE(review): the 3% threshold and 3s hold time look like test values — confirm.
      - alert: 磁盘使用率过高
        expr: round(100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~"/etc/hostname|/etc/hosts|/etc/resolv.*"} / node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~"/etc/hostname|/etc/hosts|/etc/resolv.*"} * 100), 0.1) > 3
        for: 3s
        labels:
          severity: 'warning'
        annotations:
          description: "{{ $labels.mountpoint }} 磁盘分区使用大于3%(目前使用:{{ $value }}%)"

      # Disk growth: available space lost over the last 5 minutes as a share of the size.
      - alert: 磁盘使用率增长过快
        expr: round((node_filesystem_avail_bytes{fstype=~"ext4|xfs",mountpoint!~"/etc/hostname|/etc/hosts|/etc/resolv.*"} offset 5m - node_filesystem_avail_bytes{fstype=~"ext4|xfs",mountpoint!~"/etc/hostname|/etc/hosts|/etc/resolv.*"}) / node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~"/etc/hostname|/etc/hosts|/etc/resolv.*"} * 100, 0.1) > 10
        for: 1s
        labels:
          severity: 'warning'
        annotations:
          description: "{{ $labels.mountpoint }} 磁盘分区近5分钟磁盘增长率大于10%(目前:{{ $value }}%)"

      # Inbound traffic, virtual and loopback interfaces excluded. The result is
      # aggregated by (instance, env, name), so only those labels are available.
      - alert: 流入网络带宽过高
        expr: round(((sum(rate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance, env, name)) / 100) / 1024, 0.1) > 500
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          description: "{{ $labels.instance }}流入网络带宽持续5分钟高于500(目前:{{ $value }})"

      # Outbound traffic, same interface filter and aggregation as above.
      - alert: 流出网络带宽过高
        expr: round(((sum(rate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance, env, name)) / 100) / 1024, 0.1) > 500
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          description: "{{ $labels.instance }}流出网络带宽持续5分钟高于500(目前:{{ $value }})"

      # 5-minute load average for jobs whose name contains "tidb".
      - alert: CPU负载大于15
        expr: node_load5{job=~".*tidb.*"} > 15
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          description: "当前机器CPU负载过高,当前值:{{ $value }}"
EOF
监控对象如下:
# node_exporter scrape targets, picked up by the "node-export" job through
# file_sd_configs. Each entry is one target group with its own labels.
mkdir -p /data/app/prometheus-2.35.0.linux-amd64/conf/node-exporter
cat >/data/app/prometheus-2.35.0.linux-amd64/conf/node-exporter/node-exporter.yml <<EOF
- targets: ['192.168.220.202:19100']
  labels:
    env: 'fat'
    hostname: 'test3'
    instance: '192.168.220.202'
- targets: ['192.168.220.200:19100']
  labels:
    env: 'pro'
    hostname: 'test1'
    instance: '192.168.220.200'
EOF
# Black-box TCP probe target for the nginx port. The file must live in
# conf/blackbox-exporter-tcp/ so the "blackbox_tcp" job's file_sd glob finds it.
mkdir -p /data/app/prometheus-2.35.0.linux-amd64/conf/blackbox-exporter-tcp
cat >/data/app/prometheus-2.35.0.linux-amd64/conf/blackbox-exporter-tcp/nginx.yml <<EOF
- targets: ['192.168.220.200:80']
  labels:
    env: 'pro'
    name: 'nginx'
    hostname: 'test1'
    instance: '192.168.220.200:80'
    app: 'nginx'
EOF
1.3 将prometheus加入系统服务
# Create the systemd unit for Prometheus. The service runs as root, loads
# prometheus.yml from the install directory, and is restarted on failure.
cat > /usr/lib/systemd/system/prometheus.service<< EOF
[Unit]
Description=logging prometheus service
Documentation=https://prometheus.io
[Service]
Type=simple
User=root
Group=root
ExecStart=/data/app/prometheus-2.35.0.linux-amd64/prometheus --config.file=/data/app/prometheus-2.35.0.linux-amd64/prometheus.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
1.4启动服务
# Reload systemd so it picks up the newly written unit file, then start
# Prometheus and enable it at boot (the unit installs into multi-user.target).
systemctl daemon-reload
systemctl start prometheus
systemctl enable prometheus
2.node_exporter客户端安装
# Extract the node_exporter 1.3.1 release archive into /data/app.
tar -xzvf node_exporter-1.3.1.linux-amd64.tar.gz -C /data/app
将node_exporter加入系统服务
# Create the systemd unit for node_exporter, listening on port 19100
# (the port used by the node-exporter scrape targets).
cat >/usr/lib/systemd/system/node_exporter.service <<EOF
[Unit]
Description=node_exporter
# Documentation= accepts URIs only; free text is rejected by systemd with a warning.
Documentation=https://github.com/prometheus/node_exporter
After=network.target
[Service]
ExecStart=/data/app/node_exporter-1.3.1.linux-amd64/node_exporter --web.listen-address=:19100
# Restart on failure, consistent with the other units in this guide.
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
3.grafana安装
# Install the Grafana Enterprise 8.5.2 RPM package.
rpm -ivh grafana-enterprise-8.5.2-1.x86_64.rpm
4.Alertmanager安装
# Extract the Alertmanager 0.24.0 release archive into /data/app.
tar -xzvf alertmanager-0.24.0.linux-amd64.tar.gz -C /data/app
alertmanager.yml配置案例
# alertmanager.yml — example configuration.
global:
  resolve_timeout: 5m

# Default routing: group by alert name and the "name" label, deliver to robot1.
route:
  group_by: ['alertname', 'name']
  group_wait: 20s
  group_interval: 30s
  repeat_interval: 1h
  receiver: robot1
  routes:
    # Production alerts (env matches prod|pro) use a shorter group_wait.
    - receiver: 'robot1'
      continue: true
      group_wait: 5s
      match_re:
        env: "prod|pro"

receivers:
  - name: 'robot1'
    webhook_configs:
      # prometheus-webhook-dingtalk endpoint (target "webhook1" on port 8060).
      # Listed once: a second identical entry would send every notification twice.
      - url: http://127.0.0.1:8060/dingtalk/webhook1/send
        send_resolved: true

# Suppress warning-level alerts while a critical alert with the same
# alertname/dev/instance labels is firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
将alertmanager加入系统服务
# Create the systemd unit for Alertmanager. It loads alertmanager.yml from the
# install directory and is restarted on failure.
cat >/usr/lib/systemd/system/alertmanager.service <<EOF
[Unit]
Description=https://prometheus.io
[Service]
Restart=on-failure
ExecStart=/data/app/alertmanager-0.24.0.linux-amd64/alertmanager --config.file=/data/app/alertmanager-0.24.0.linux-amd64/alertmanager.yml
[Install]
WantedBy=multi-user.target
EOF
5.prometheus-webhook-dingtalk钉钉接口报警
# Extract the prometheus-webhook-dingtalk 2.1.0 release archive into /data/app.
tar -xzvf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz -C /data/app
config.yml 配置案例
## Request timeout
timeout: 5s

## Uncomment following line in order to write template from scratch (be careful!)
# no_builtin_template: true

## Customizable templates path: the custom DingTalk alert template
templates:
  - '/data/app/alertmanager-0.24.0.linux-amd64/template/ding_alert.tmpl'

## You can also override default template using `default_message`.
## The message body is rendered by the custom "ding.link.content" template;
## default_message must be active, otherwise title/text are orphaned top-level keys.
default_message:
  title: '{{ template "legacy.title" . }}'
  text: '{{ template "ding.link.content" . }}'

## Targets, previously was known as "profiles"
targets:
  webhook1:
    # Substitute your own robot access token; never publish a real one.
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # secret for signature
    secret: SEC000000000000000000000
  webhook2:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # Customize template content
    message:
      # Use legacy template
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      mobiles: ['156xxxx8827', '189xxxx8325']
钉钉自定义告警模板
# Open the custom DingTalk template file in vi; this is the path listed under
# "templates" in the prometheus-webhook-dingtalk config.yml.
vi /data/app/alertmanager-0.24.0.linux-amd64/template/ding_alert.tmpl
{{/* One entry per firing alert; invoked with .Alerts.Firing. */}}
{{ define "__text_alert_list" }}{{ range . }}
告警程序:prometheus_alert
告警级别:{{ .Labels.severity }}
告警类型:{{ .Labels.alertname }}
主机IP: {{ .Labels.instance }}
主机名: {{ .Labels.hostname }}
env: {{ .Labels.env }}
告警描叙: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{/* One entry per resolved alert; invoked with .Alerts.Resolved. */}}
{{ define "__text_resolve_list" }}{{ range . }}
恢复程序:prometheus_alert
恢复类型:{{ .Labels.alertname }}
主机IP: {{ .Labels.instance }}
主机名: {{ .Labels.hostname }}
env: {{ .Labels.env }}
恢复描叙: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{/* Message body: firing list and/or resolved list, each shown only when non-empty. */}}
{{ define "ding.link.content" }}
{{ if gt (len .Alerts.Firing) 0 -}}
告警列表:
{{ template "__text_alert_list" .Alerts.Firing }}
{{- end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
恢复列表:
{{ template "__text_resolve_list" .Alerts.Resolved }}
{{- end }}
{{- end }}
将prometheus-webhook-dingtalk加入系统服务
# Create the systemd unit for prometheus-webhook-dingtalk, written with a heredoc
# like the other units in this guide.
# NOTE(review): the unit file name is not given in the original — confirm.
cat >/usr/lib/systemd/system/prometheus-webhook-dingtalk.service <<EOF
[Unit]
Description=https://github.com/timonwong/prometheus-webhook-dingtalk/releases/
After=network-online.target
[Service]
Restart=on-failure
ExecStart=/data/app/prometheus-webhook-dingtalk-2.1.0.linux-amd64/prometheus-webhook-dingtalk --config.file=/data/app/prometheus-webhook-dingtalk-2.1.0.linux-amd64/config.yml
[Install]
WantedBy=multi-user.target
EOF
6.黑盒监控blackbox_exporter安装
# Extract the blackbox_exporter 0.21.0-rc.0 release archive into /data/app.
tar -xzvf blackbox_exporter-0.21.0-rc.0.linux-amd64.tar.gz -C /data/app
# Write the black-box probe alerting rule. The redirect (>) is required for the
# file to be written, and the quoted 'EOF' stops the shell from expanding $labels.
mkdir -p /data/app/prometheus-2.35.0.linux-amd64/conf/rule
cat >/data/app/prometheus-2.35.0.linux-amd64/conf/rule/blackbox.yml <<'EOF'
groups:
  - name: 黑盒探测
    rules:
      # Fires when a production probe has been failing for 120s. The matchers
      # are regex matches (=~) because the values are alternations.
      - alert: 站点连接异常
        expr: probe_success{app=~"java|nodejs|golang|nginx",env=~"pro|prod"} != 1
        for: 120s
        labels:
          severity: 'critical'
        annotations:
          # Probe series have no mountpoint label; report the probed instance.
          description: "{{ $labels.instance }} 站点连接异常"
EOF
将blackbox_exporter加入系统服务
# Create the systemd unit for blackbox_exporter. ExecStart must be a single line
# with an ASCII "--" flag prefix. Paths point at the directory that the release
# archive actually unpacks to under /data/app.
# NOTE(review): if the directory was renamed to /data/app/blackbox_exporter,
# adjust the three paths accordingly.
cat >/usr/lib/systemd/system/blackbox_exporter.service <<EOF
[Unit]
Description=blackbox_exporter
After=network.target
[Service]
WorkingDirectory=/data/app/blackbox_exporter-0.21.0-rc.0.linux-amd64
ExecStart=/data/app/blackbox_exporter-0.21.0-rc.0.linux-amd64/blackbox_exporter --config.file=/data/app/blackbox_exporter-0.21.0-rc.0.linux-amd64/blackbox.yml
[Install]
WantedBy=multi-user.target
EOF
7.钉钉机器人设置
1.设置允许外网IP
效果如下: