一、prometheus部署
官网下载安装包解压启动,嫌弃重启麻烦就自己配置自启动
[root@prometheus ~]# tar -zxvf prometheus-2.34.0.linux-amd64.tar.gz -C /opt/software/prometheus
#直接使用默认配置文件启动,先要将prometheus.yml文件配置好 #可以配置自启动
[root@prometheus ~]# nohup ./prometheus --config.file=prometheus.yml > ./prometheus.log 2>&1 &
#配置系统自启动,也可以这样
[root@prometheus ~]# systemctl start prometheus.service
#端口9090
[root@prometheus ~]# lsof -i:9090
prometheus.yml文件配置,跟我自己的情况配置alertmanager
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 10.5.2.15:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - /opt/software/prometheus/prometheus-2.34.0.linux-amd64/rules/*.yml
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  #------------------------------------------------------------------------------
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
  #这里配置的各个服务的节点安装的node_exporter,下面有介绍
  #大数据节点
  - job_name: "bigdata"
    static_configs:
      - targets: ['10.5.2.37:9101','10.5.2.38:9101','10.5.2.39:9101','10.5.2.40:9101','10.5.2.41:9101']
  #区块链节点
  - job_name: "blockchain"
    static_configs:
      - targets: ['10.5.2.42:9100','10.5.2.43:9100','10.5.2.44:9100','10.5.2.45:9100']
  #压测服务节点
  - job_name: "pressuretest"
    static_configs:
      - targets: ["10.5.2.17:9101"]
  #测试环境节点
  - job_name: "test"
    static_configs:
      - targets: ['10.5.2.14:9101','10.5.2.49:9100']
  #办公以及研发类管理服务节点
  - job_name: "office"
    static_configs:
      - targets: ["10.5.2.48:9101"]
  #------------------------------------------------------------------------------
  #生产环境监控
  #TDengine1
  - job_name: "TDengine1"
    static_configs:
      - targets: ["120.26.xx.xx:9101"]
  #大数据
  - job_name: "ProdBigData"
    static_configs:
      - targets: ['120.27.xx.xx:9101','121.196.xx.xx:9101','121.196.xx.xx:9101','47.114.xx.xx:9101','121.41.xx.xx:9101']
  #生产环境nginx服务器-域名映射
  - job_name: "ProdNginx"
    static_configs:
      - targets: ["121.196.xx.xx:9101"]
  #应用服务器1
  - job_name: "ProdApplication1"
    static_configs:
      - targets: ["47.99.xx.xx:9101"]
  #中间件服务器和堡垒机
  - job_name: "ProdMiddleWareFortress"
    static_configs:
      - targets: ["121.43.xx.xx:9101"]
  #------------------------------------------------------------------------------
  #灰度环境服务器
  #上海sungrow-lingtan 139服务器
  - job_name: "Gray"
    static_configs:
      - targets: ["139.224.xx.xx:9101"]
告警rule文件配置,可以配置多个,读取配置在上面的prometheus.yml里面
#服务宕机监控规则
groups:
- name: test #分组名称 唯一键
  rules:
  - alert: test-down #告警名称
    expr: up{job="test"} == 0 # PromQL表达式
    for: 5m #条件持续满足5分钟后才触发告警
    labels:
      severity: 100 # 告警程度
      team: test #team分组 Alertmanager对应值分组告警
    annotations:
      summary: " 告警机器:{{$labels.instance}}: job名称: {{$labels.job}} 宕机 "
下面是组合的rule文件 (因为要分job去监控服务器再进行分组邮件告警推送,所以下列第一个表达式规则就不适用了,需要把表达式直接写在监控中)
#告警获取的规则表达式
groups:
- name: node_rules
  # interval: 15s
  rules:
  - record: instance:node_cpu_usage
    expr: 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (nodename)*100
    labels:
      metric_type: cpu_monitor
  - record: instance:node_mem_usage
    expr: 100 - (node_memory_MemAvailable_bytes)/(node_memory_MemTotal_bytes) * 100
    labels:
      metric_type: Memory_monitor
#内存和cpu告警规则
groups:
#测试环境
- name: test_node_alerts
  rules:
  - alert: test_cpu_usage_over_threshold
    expr: 100 - avg by(nodename) (irate(node_cpu_seconds_total{job=~"test",mode="idle"}[1m])) * 100 > 80
    for: 1m # 持续1分钟超出阈值才触发
    labels:
      severity: warning
      team: test
    annotations:
      summary: 主机 {{ $labels.instance }} 的cpu使用率持续1分钟超出阈值,当前为 {{ $value }} %
  - alert: test_mem_usage_over_threshold
    expr: 100 - (node_memory_MemAvailable_bytes{job=~"test"})/(node_memory_MemTotal_bytes{job=~"test"}) * 100 > 80
    for: 1m # 持续1分钟超出阈值才触发
    labels:
      severity: warning
      team: test
    annotations:
      summary: 主机 {{ $labels.instance }} 的内存使用率持续1分钟超出阈值,当前为 {{ $value }} %
#灰度环境的cpu和内存监控规则
- name: gray_node_alerts
  rules:
  - alert: gray_cpu_usage_over_threshold
    expr: 100 - avg by(nodename) (irate(node_cpu_seconds_total{job=~"Gray",mode="idle"}[1m])) * 100 > 80
    for: 1m # 持续1分钟超出阈值才触发
    labels:
      severity: warning
      team: Gray
    annotations:
      summary: 主机 {{ $labels.instance }} 的cpu使用率持续1分钟超出阈值,当前为 {{ $value }} %
  - alert: gray_mem_usage_over_threshold
    expr: 100 - (node_memory_MemAvailable_bytes{job=~"Gray"})/(node_memory_MemTotal_bytes{job=~"Gray"}) * 100 > 80
    for: 1m # 持续1分钟超出阈值才触发
    labels:
      metric_type: Memory_monitor
      severity: warning
      team: Gray
    annotations:
      summary: 主机 {{ $labels.instance }} 的内存使用率持续1分钟超出阈值,当前为 {{ $value }} %
#大数据
磁盘告警规则
##服务器磁盘监控
groups:
#----------------------------------生产环境--------------------------------------------------
- name: ProdApplication1_disk_alerts
  rules:
  - alert: ProdApplication1_disk_usage_too_High
    # 简化写法: node_filesystem_avail_bytes{mountpoint="/"}/node_filesystem_size_bytes{mountpoint="/"}*100 < 10
    expr: (node_filesystem_size_bytes{job=~"ProdApplication1",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"}-node_filesystem_free_bytes{job=~"ProdApplication1",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"}) *100/(node_filesystem_avail_bytes {job=~"ProdApplication1",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"}+(node_filesystem_size_bytes{job=~"ProdApplication1",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"}-node_filesystem_free_bytes{job=~"ProdApplication1",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"})) > 90
    for: 5m
    labels:
      severity: warning
      team: ProdApplication1
    annotations:
      summary: "Disk usage is too high on {{ $labels.instance }}"
      description: "{{ $labels.instance }} has less than 10% disk space available."
#
##生产大数据
- name: ProdBigData_disk_alerts
  rules:
  - alert: ProdBigData_disk_usage_too_High
    expr: (node_filesystem_size_bytes{job=~"ProdBigData",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"}-node_filesystem_free_bytes{job=~"ProdBigData",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"}) *100/(node_filesystem_avail_bytes {job=~"ProdBigData",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"}+(node_filesystem_size_bytes{job=~"ProdBigData",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"}-node_filesystem_free_bytes{job=~"ProdBigData",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker).*"})) > 90
    for: 5m
    labels:
      severity: warning
      team: ProdBigData
    annotations:
      summary: "Disk usage is too high on {{ $labels.instance }}"
      description: "{{ $labels.instance }} has less than 10% disk space available."
二、alertmanager部署
也是安装包下载,解压,启动
#下载安装包解压到指定目录下面
tar -xf alertmanager-0.24.0.linux-amd64.tar.gz -C /opt/software/alertmanager
#解压后进入文件目录找到alertmanager.yml文件,根据自己需求修改配置文件
vim /opt/software/alertmanager/alertmanager-0.24.0.linux-amd64/alertmanager.yml
配置文件
route:
  #匹配的分组team,这里为了方便区别,每组team和job我使用相同的名字
  group_by: ['test','bigdata','blockchain','pressuretest','fileservice','algo','office','ProdBigData','ProdBlockChain','TDengine1','TestCarbonBrain','ProdNginx','ProdApplication1','ProdMiddleWareFortress']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'test_email' #默认接收邮箱
  #配置分组邮件接收
  routes:
  - receiver: test_email # 对应下面接收邮箱
    group_by: [test]
    matchers:
    - team = test
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  - receiver: bigdata_email
    group_by: [bigdata]
    matchers:
    - team = bigdata
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  - receiver: blockchain_email
    group_by: [blockchain]
    matchers:
    - team = blockchain
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  - receiver: algo_email
    group_by: [algo]
    matchers:
    - team = algo
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  #生产环境发送-----------------------------------------------
  #大数据
  - receiver: bigdata_email
    group_by: [ProdBigData]
    matchers:
    - team = ProdBigData
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  #应用服务器2_区块链
  - receiver: blockchain_email
    group_by: [ProdBlockChain]
    matchers:
    - team = ProdBlockChain
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  #TDengine1
  - receiver: test_email
    group_by: [TDengine1]
    matchers:
    - team = TDengine1
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  #慧碳大脑测试服务器
  - receiver: test_email
    group_by: [TestCarbonBrain]
    matchers:
    - team = TestCarbonBrain
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  #windows跳板机
  - receiver: test_email
    group_by: [ProdWinJump]
    matchers:
    - team = ProdWinJump
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  #生产环境nginx服务器-域名映射
  - receiver: test_email
    group_by: [ProdNginx]
    matchers:
    - team = ProdNginx
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  #应用服务器1
  - receiver: test_email
    group_by: [ProdApplication1]
    matchers:
    - team = ProdApplication1
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  #中间件服务器和堡垒机
  - receiver: test_email
    group_by: [ProdMiddleWareFortress]
    matchers:
    - team = ProdMiddleWareFortress
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
  #灰度环境服务器
  - receiver: test_email
    group_by: [Gray]
    matchers:
    - team = Gray
    group_interval: 10s
    group_wait: 30s
    repeat_interval: 60m
#接收邮箱配置
receivers:
- name: test_email #测试邮件收件箱
  email_configs:
  - to: '接收邮箱'
    html: '{{ template "email.to.html" . }}'
    send_resolved: true
- name: bigdata_email #大数据接收邮箱
  email_configs:
  - to: '接收邮箱'
    html: '{{ template "email.to.html" . }}'
    send_resolved: true
- name: blockchain_email #区块链接收邮箱
  email_configs:
  - to: '接收邮箱'
    html: '{{ template "email.to.html" . }}'
    send_resolved: true
- name: algo_email #算法接收邮箱
  email_configs:
  - to: '接收邮箱'
    html: '{{ template "email.to.html" . }}'
    send_resolved: true
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['test','bigdata','blockchain','pressuretest','fileservice','algo','office']
global:
  resolve_timeout: 5m
  smtp_from: xxxxx@163.com
  smtp_auth_username: xxxxx@163.com
  smtp_auth_password: xxxxxxxx
  smtp_require_tls: false
  smtp_smarthost: 'smtp.163.com:465'
#邮件模板
templates:
- '/opt/software/alertmanager/alertmanager-0.24.0.linux-amd64/email.tmpl'
邮件模板(根据自身需要去配置)
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts }}
<h2>@告警通知</h2>
告警程序: prometheus_alert <br>
告警级别: {{ .Labels.severity }} 级 <br>
告警类型: {{ .Labels.alertname }} <br>
故障主机: {{ .Labels.instance }} <br>
告警主题: {{ .Annotations.summary }} <br>
告警详情: {{ .Annotations.description }} <br>
触发时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }} <br>
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
<h2>@告警恢复</h2>
告警程序: prometheus_alert <br>
故障主机: {{ .Labels.instance }}<br>
故障主题: {{ .Annotations.summary }}<br>
告警详情: {{ .Annotations.description }}<br>
告警时间: {{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}<br>
恢复时间: {{ .EndsAt.Local.Format "2006-01-02 15:04:05" }}<br>
{{ end }}{{ end -}}
{{- end }}
启动alertmanager
这里是配置自启动配置的方式启动的
systemctl start alertmanager.service
访问ip:9093
三、node_exporter安装教程
1、在节点上执行命令下载
wget "https://github.com/prometheus/node_exporter/releases/download/v1.5.0/node_exporter-1.5.0.linux-amd64.tar.gz"
2、解压压缩包
[root@master node_exporter]# ls
node_exporter-1.5.0.linux-amd64.tar.gz
[root@master node_exporter]# tar -xvf node_exporter-1.5.0.linux-amd64.tar.gz
[root@master node_exporter]# ls
node_exporter-1.5.0.linux-amd64 node_exporter-1.5.0.linux-amd64.tar.gz
3、将node_exporter-1.5.0.linux-amd64目录下的 node_exporter二进制文件复制到 /usr/local/bin路径下
[root@master node_exporter]# cd node_exporter-1.5.0.linux-amd64
[root@master node_exporter-1.5.0.linux-amd64]# ls
LICENSE node_exporter NOTICE
[root@master node_exporter-1.5.0.linux-amd64]# cp node_exporter /usr/local/bin/
[root@master node_exporter-1.5.0.linux-amd64]# ls -l /usr/local/bin/node_exporter
-rwxr-xr-x 1 root root 19779640 Jan 30 10:57 /usr/local/bin/node_exporter
4、创建systemd service文件
根据实际情况修改其中的Environment配置值
如果上一步骤没有修改创建用户命令中的认证参数
则下文可不做变更
直接复制下文全部文字命令并在节点上执行即可
# Write the systemd unit file for node_exporter.
# NOTE: the heredoc delimiter is quoted ('EOF') so bash treats the body
# literally — with an unquoted delimiter, each trailing backslash-newline
# would be removed by the shell and the ExecStart flags would be fused into
# one broken token ("node_exporter--web.listen-address=...").
# Inside the unit file, systemd itself joins lines ending in "\" with a space.
cat <<'EOF' > /etc/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
After=network.target

[Service]
# node_exporter v1.x renamed --collector.systemd.unit-whitelist to
# --collector.systemd.unit-include; v1.5.0 rejects the old flag name.
ExecStart=/usr/local/bin/node_exporter \
  --web.listen-address=:9100 \
  --collector.systemd \
  --collector.systemd.unit-include=(sshd|nginx).service \
  --collector.processes \
  --collector.tcpstat

[Install]
WantedBy=multi-user.target
EOF
5、重载系统systemd配置
执行命令:
systemctl daemon-reload
6、启动服务并且设置服务自启
[root@master node_exporter]# systemctl enable --now node_exporter
Created symlink from /etc/systemd/system/multi-user.target.wants/node_exporter.service to /etc/systemd/system/node_exporter.service.
7、查看服务运行状态
执行命令
能夠得到类似下文的回显结果
主要看Active属性值
以及日志中提示:
systemctl status node_exporter
Active属性值为 active (running),且日志中提示 Listening on :9100
● node_exporter.service - node_exporter
Loaded: loaded (/etc/systemd/system/node_exporter.service; enabled; vendor preset: disabled)
Active: active (running) since Mon 2023-01-30 11:02:34 CST; 41s ago
Main PID: 77865 (node_exporter)
CGroup: /system.slice/node_exporter.service
└─77865 /usr/local/bin/node_exporter --web.listen-address=:9100 --collector.systemd --collector.systemd.unit-whitelist=(sshd|nginx).service --collector.processes --collector.tcpstat
8、测试接口
在节点上使用命令访问接口
能够得到类似下文结果:curl -s {{节点IP}}:9100/metrics
[root@master ~]#curl -s 10.82.42.193:9100/metrics
......
# HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served.
# TYPE promhttp_metric_handler_requests_in_flight gauge
promhttp_metric_handler_requests_in_flight 1
# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code.
# TYPE promhttp_metric_handler_requests_total counter
promhttp_metric_handler_requests_total{code="200"} 0
promhttp_metric_handler_requests_total{code="500"} 0
promhttp_metric_handler_requests_total{code="503"} 0
......
或直接使用浏览器访问 {{节点IP}}:9100/metrics,能够看到相关 node_exporter 实例指标信息则为正常
四、添加和删除监控的机器
1、编辑rule文件添加删除监控的机器
2、编辑prometheus.yml文件,添加删除监控的job(监控的机器)
3、重启alertmanager和prometheus服务,去prometheus服务中查看,是否生效,如果建新的接收邮件方,需要编辑alertmanager.yml文件编辑