prometheus监控安装

weixin_42871919

已于 2023-02-07 13:42:43 修改

阅读量142

点赞数

文章标签： prometheus

于 2023-02-07 13:22:17 首次发布

本文链接：https://blog.csdn.net/weixin_42871919/article/details/128913586

版权

4.安装node_exporter（linux服务器相关数据监控，如cpu，内存等）

4.1、安装

5.异常

1.下载资源

1、prometheus

2、alertmanager

3、node_exporter(服务器使用)

4、可离线安装：相关jar和配置点此下载

2.安装prometheus

2.1、安装

#解压prometheus
[root@node2 ~]# tar -xf prometheus-2.42.0.linux-amd64.tar.gz -C /usr/local/

#进入解压路径
[root@node2 ~]# cd /usr/local/

#修改prometheus名称
[root@node2 ~]# mv /usr/local/prometheus-2.42.0.linux-amd64/ /usr/local/prometheus

#进入prometheus目录
[root@node2 ~]# cd /usr/local/prometheus

#添加系统启动服务，指定了配置文件、数据库文件地址，指定了prometheus的端口
[root@node2 ~]# vim /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus Server
After=network-online.target

[Service]
Type=simple
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/usr/local/prometheus/data --web.listen-address=0.0.0.0:9090 --web.enable-lifecycle
Restart=on-failure

[Install]
WantedBy=multi-user.target


#重新加载systemd配置&&启动服务&&设置服务开机自启&&查看服务运行状态&&关闭服务
[root@node2 ~]# systemctl daemon-reload
[root@node2 ~]# systemctl start prometheus
[root@node2 ~]# systemctl enable prometheus
[root@node2 ~]# systemctl status prometheus
[root@node2 ~]# systemctl stop prometheus


#查看服务端口信息
[root@node2 ~]# ss -ntlp | grep prometheus
LISTEN   0    128  :::9090    :::*    users:(("prometheus",pid=58591,fd=8))

还可以通过url更新配置

第一步：首先要保证启动 Prometheus 的时候带上启动参数：--web.enable-lifecycle。
prometheus --config.file=/usr/local/etc/prometheus.yml --web.enable-lifecycle
第二步：去更新我们的 Prometheus 配置。
curl -v --request POST 'http://localhost:9090/-/reload'

2.2、配置文件

1、主配置文件为prometheus.yml，配置文件遵循的是YAML语法格式，本文用到的主要有四个模块：global、alerting、rule_files 和 scrape_configs

2、在/usr/local/prometheus创建rules、targets、data文件夹（相关rules，targets点此下载）

#进入prometheus目录
[root@node2 ~]# cd /usr/local/prometheus

#备份配置文件
[root@node2 ~]# cp prometheus.yml prometheus.yml.bak

#修改配置文件
[root@node2 ~]# vim prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093'] #收集信息发送到alertmanagers
           #- alertmanager: 9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - 'rules/*_rules.yml' #指定要加载的告警规则文件（相对于prometheus.yml所在目录）
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]

  #配置2(监听springboot服务存活实例)
  - job_name: 'spring_actuator_job' #job名称唯一
    metrics_path: '/actuator/prometheus'  #请求路径
    # static_configs: #静态配置，需要重启
    #  - targets: ["172.18.172.242:8016"]
    file_sd_configs: #服务发现，动态配置ip（不需要重启）
      - files: [./targets/springboot_actuator_job_*.yaml]
        refresh_interval: 1m #重新读取文件时间             
  
  #配置3(监听服务器cpu使用率,内存)
  - job_name: "base_all_task_job"
    file_sd_configs: 
      - files: [./targets/base_all_task_job_*.yaml]
        refresh_interval: 5m


#使用promtool检查配置文件是否正常
[root@node2 ~]# ./promtool check config prometheus.yml

#重启服务
[root@node2 ~]# systemctl restart prometheus
# 启动普罗米修斯：systemctl start prometheus
# 关闭普罗米修斯：systemctl stop prometheus

访问测试：

http://192.168.0.30:9090

3.安装Alertmanager

3.1、安装

#解压软件包
[root@node2 ~]# tar -xf alertmanager-0.23.0.linux-amd64.tar.gz -C /usr/local

#进入解压路径
[root@node2 ~]# cd /usr/local/

#修改alertmanager目录名称
[root@node2 ~]# mv /usr/local/alertmanager-0.23.0.linux-amd64/ /usr/local/alertmanager

#添加系统启动服务
[root@node2 ~]# vim /etc/systemd/system/alertmanager.service
[Unit]
Description=Alertmanager Server

[Service]
Restart=on-failure
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process

[Install]
WantedBy=multi-user.target



#重新加载systemd配置&&启动服务&&设置服务开机自启&&查看服务运行状态
[root@node2 ~]# systemctl daemon-reload
[root@node2 ~]# systemctl start alertmanager
[root@node2 ~]# systemctl enable alertmanager
[root@node2 ~]# systemctl status alertmanager

#查看服务端口信息
[root@prometheus-server alertmanager]# ss -ntlp | grep alertmanager
LISTEN     0  128  :::9093    :::* users:(("alertmanager",pid=8877,fd=8)) #alertmanager监听端口
LISTEN     0  128  :::9094    :::* users:(("alertmanager",pid=6765,fd=3)) #集群服务端口

访问测试：

http://192.168.0.30:9093

3.2、配置文件

1、Alertmanager的配置文件也是遵循YAML格式

2、在/usr/local/alertmanager下新建templates文件夹存放模板(模板点此下载)

3、配置使用企微接收通知

#进入alertmanager目录
[root@node2 ~]# cd /usr/local/alertmanager

#备份配置文件
[root@node2 ~]# cp alertmanager.yml alertmanager.yml.bak

#修改配置文件
[root@node2 ~]# vim alertmanager.yml
global:
  resolve_timeout: 3m #告警超过3分钟未再更新，则视为已恢复
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/' #调用企业微信api地址不用改
templates:     #告警模板
  - './templates/*.tmpl'
  
route: # 设置报警分发策略
  group_by: ['alertname','InstanceAliveAlarm','CpuHighAlarm','MemoryUsageRateAlarm','DiskUsageRateAlarm'] # 分组标签
  group_wait: 30s # 告警等待时间。告警产生后等待30s，如果有同组告警一起发出
  group_interval: 30s # 同一组已发送过通知后，组内出现新告警时再次发送的间隔时间
  repeat_interval: 1m # 重复告警的间隔时间，减少相同告警的发送频率 此处为测试设置为1分钟
  receiver: 'default' # 默认接收者
  routes:       # 子路由，子路由可以定义多个
    - receiver: 'wechat_prod'
      # match:      # 通过标签去匹配这次告警是否符合这个路由节点；也可以使用  match_re 进行正则匹配
      #   severity: warning #如果告警级别是严重危害，那么，接收人是微信
      match_re: # match_re 进行正则匹配,如果匹配不到则走默认接收者（receiver: 'default'）
        severity: warning|disaster #severity为warning或disaster时匹配（还需同时满足下面的env）
        env: prod

    - receiver: 'wechat_test'
      # match:      # 通过标签去匹配这次告警是否符合这个路由节点；也可以使用  match_re 进行正则匹配
      #   severity: warning #如果告警级别是严重危害，那么，接收人是微信
      match_re: # match_re 进行正则匹配,如果匹配不到则走默认接收者（receiver: 'default'）
        severity: warning|disaster #severity为warning或disaster时匹配（还需同时满足下面的env）
        env: test     


receivers:
  - name: 'default' #接收者1
    wechat_configs: #微信配置
    - send_resolved: true #开启故障恢复后通知
      agent_id: '1000077'     # 自建应用的agentId
      #to_party: '2'         # 企业微信中创建的接收告警的告警部门ID
      to_user: '17629949087'  # 接收告警消息的人员Id
      api_secret: '1' # 自建应用的secret
      corp_id: '2'  # 企业ID

  #正式
  - name: 'wechat_prod' #接收者2
    wechat_configs: #微信配置
    - send_resolved: true #开启故障恢复后通知
      agent_id: '1000077'     # 自建应用的agentId
      #to_party: '2'         # 企业微信中创建的接收告警的告警部门ID
      to_user: '17629949087'  # 接收告警消息的人员Id
      api_secret: '1' # 自建应用的secret
      corp_id: '2'  # 企业ID      
  #测试
  - name: 'wechat_test' #接收者3
    wechat_configs: #微信配置
    - send_resolved: true #开启故障恢复后通知
      agent_id: '1000074'     # 自建应用的agentId
      #to_party: '2'         # 企业微信中创建的接收告警的告警部门ID
      to_user: '17629949087'  # 接收告警消息的人员Id
      api_secret: '1' # 自建应用的secret
      corp_id: '2'  # 企业ID     

inhibit_rules:   # 告警抑制规则
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname','dev','instance']



#使用amtool检查配置文件是否正常
[root@node2 ~]# ./amtool check-config alertmanager.yml

#重启alertmanager生效
[root@node2 ~]# systemctl restart alertmanager
# systemctl start alertmanager
# systemctl stop alertmanager

4.安装node_exporter（linux服务器相关数据监控，如cpu，内存等）

4.1、安装

#解压node_exporter
[root@node2 ~]# tar -xzf node_exporter-1.5.0.linux-amd64.tar.gz -C /usr/local

#进入解压路径
[root@node2 ~]# cd /usr/local/

#修改node_exporter名称
[root@node2 ~]# mv /usr/local/node_exporter-1.5.0.linux-amd64/ /usr/local/node_exporter

#添加系统启动服务
[root@node2 ~]# vim /etc/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
After=network.target
#可以创建相应的用户和组来启动（注意：User=/Group= 需要写在下面的[Service]段内才会生效）
#User=prometheus
#Group=prometheus

[Service]
ExecStart=/usr/local/node_exporter/node_exporter --web.listen-address=:9100
[Install]
WantedBy=multi-user.target

#启动服务&&设置服务随机自启&&查看服务运行状态
[root@node2 ~]# systemctl daemon-reload
[root@node2 ~]# systemctl start node_exporter
[root@node2 ~]# systemctl status node_exporter
[root@node2 ~]# systemctl enable node_exporter

检查

curl http://localhost:9100/metrics

5.异常

如果开启服务的时候遇到Failed to start xxx.service: Unit is masked. 意思是这个服务被屏蔽（mask）了，可执行如下命令开启：

systemctl unmask xxx.service

systemctl start xxx.service

（可选）如需重新屏蔽该服务，可执行下面的命令；屏蔽后该服务将无法再被启动：

systemctl mask xxx.service