1: 配置 Prometheus 配置文件
vim /path/to/prometheus.yml
# my global config
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["localhost:9093"]
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
# Path(s) to the alerting rule files.
rule_files:
  - /rules_path/to/*.rules
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]

  # Basic host metrics exported by node_exporter.
  - job_name: "node_exporter"
    static_configs:
      - targets: ["localhost:9100"]
2: 修改规则文件,采集自己需要的内容
vim /rules_path/to/*.rules
groups:
  - name: hostStatsAlert  # rule group name
    rules:
      # Rule 1: CPU usage. Threshold is 0.3 (30%) only for testing; a typical
      # production threshold would be 0.85 (85%).
      - alert: hostCpuUsageAlert
        # PromQL: per-instance average non-idle CPU ratio over the last 5 minutes.
        expr: sum(avg without (cpu) (irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance) > 0.3
        for: 1m
        labels:  # custom labels; used later by Alertmanager for routing/grouping
          severity: page
          # label values must be strings, hence the quotes
          temes: "1"
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 30% (current value: {{ $value }})"

      # Rule 2: memory usage above 85%.
      - alert: hostMemUsageAlert
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes > 0.85
        for: 1m
        labels:
          severity: page
          temes: "2"
        annotations:
          summary: "Instance {{ $labels.instance }} MEM usage high"
          description: "{{ $labels.instance }} MEM usage above 85% (current value: {{ $value }})"

      # Rule 3: disk space usage above 85%.
      # Uses the byte-based metrics (size/avail); the original used
      # node_filesystem_files*, which measure inodes, not disk space.
      - alert: disk_usage
        expr: 1 - (node_filesystem_avail_bytes{instance=~'localhost:9100',fstype=~"ext.?|xfs"} / node_filesystem_size_bytes{instance=~'localhost:9100',fstype=~"ext.?|xfs"}) > 0.85
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "{{ $labels.mountpoint }} disk usage high"
          description: "{{ $labels.mountpoint }} disk usage above 85% (current value: {{ $value }})"

      # Rule 4: disk I/O utilization above 85%.
      # Alert names must match [a-zA-Z_:][a-zA-Z0-9_:]* — "I/O Utilization"
      # (space and slash) is not a valid name.
      - alert: hostDiskIOUsageAlert
        expr: irate(node_disk_io_time_seconds_total{instance="localhost:9100",device=~"sda"}[1m]) > 0.85
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} disk I/O usage high"
          description: "{{ $labels.instance }} disk I/O usage above 85% (current value: {{ $value }})"
3:修改alertmanager 配置文件
vim /alertmanager_install_path/to/alertmanager.yml
以下只是邮箱的配置
# Email (SMTP) settings only.
# NOTE(review): Alertmanager defaults to smtp_require_tls: true; QQ mail
# typically requires port 465/587 with TLS — verify port 25 works, or set
# smtp_require_tls accordingly.
global:
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '发送人邮箱'
  smtp_auth_username: '发送人名字'
  smtp_auth_password: '授权码'

# Top-level route
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'web.hook'
  # Child routes (must be nested under the top-level route)
  routes:
    - receiver: 'mail'   # receiver name
      group_wait: 10s    # wait time before sending the first notification
      match_re:          # regex match on alert labels
        temes: "2"       # custom label set in the alerting rules; must be a string
    - receiver: 'mail1'
      group_wait: 10s
      match_re:
        temes: "1"

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'
  - name: 'mail'         # receiver name referenced by the routes above
    email_configs:
      - to: '*****@qq.com'  # destination address
  - name: 'mail1'
    email_configs:
      - to: '*****@qq.com'

# Suppress 'warning' alerts when a matching 'critical' alert is already firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
4:配置以上信息,配置完成后重启相关服务。
systemctl restart prometheus
systemctl restart alertmanager
prometheus.service
# Install the Prometheus systemd unit (original misspelled "promethues"/"instll").
cat > /etc/systemd/system/prometheus.service <<EOF
[Unit]
Description=https://prometheus.io
After=network.target

[Service]
Restart=on-failure
ExecStart=/prometheus_install_path/to/prometheus --config.file=/prometheus_install_path/to/prometheus.yml

[Install]
WantedBy=multi-user.target
EOF
alertmanager.service
# Install the Alertmanager systemd unit.
# BUG FIX: the original wrote to promethues.service, which would overwrite
# the Prometheus unit — the target must be alertmanager.service.
cat > /etc/systemd/system/alertmanager.service <<EOF
[Unit]
Description=alertmanager exporter service
Documentation=https://prometheus.io
After=network.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/alertmanager_install_path/to/alertmanager --config.file=/alertmanager_install_path/to/alertmanager.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF