Prometheus+grafana+node_exporter+pushgateway+alertmanager企业微信告警

环境:ansible主机一台:10.1.234.11

node主机3台:node1      10.1.234.110      Prometheus+grafana+node_exporter+pushgateway+alertmanager

                         node2       10.1.234.111     node_exporter

                         node3        10.1.234.112     node_exporter

搭建全部在ansible主机上完成,其他主机不用登录:

[root@ansible-11 promethes]# tree
.
├── Alertmanager.yaml
├── Blackbox.yaml
├── conf
│   ├── alertmanager.back.yml
│   ├── alertmanager.service
│   ├── alertmanager.yml
│   ├── blackbox_exporter.service
│   ├── node_exporter.service
│   ├── prometheus.service
│   └── pushgateway.service
├── Grafana.yaml
├── hosts
├── inster_promethes.sh
├── node_exporter.yaml
├── pkg
│   ├── alertmanager-0.20.0.linux-amd64.tar.gz
│   ├── blackbox_exporter-0.16.0.linux-amd64.tar.gz
│   ├── grafana-6.1.3-1.x86_64.rpm
│   ├── node_exporter-1.0.0-rc.1.linux-amd64.tar.gz
│   ├── prometheus-2.8.1.linux-amd64.tar.gz
│   ├── pushgateway-0.4.0.linux-amd64
│   │   ├── LICENSE
│   │   ├── NOTICE
│   │   └── pushgateway
│   └── pushgateway-0.4.0.linux-amd64.tar.gz
├── Prometheus.yaml
├── Pushgateway.yaml
├── reload_Promethes
│   ├── hosts
│   ├── node.yml
│   ├── prometheus.yml
│   ├── prometheus.ymlback
│   └── reload_promethes.yaml
└── template_file
    ├── blackbox-exporter_rev1.json
    ├── MySQL_Overview-1589503416459.json
    ├── Node_Exporter_0.16_0.17_for_Prometheus-1589503429385.json
    └── Redis_Dashboard_for_Prometheus_Redis_Exporter_1.x-1589503441940.json
[root@ansible-11 promethes]# cat Alertmanager.yaml 
---
# Installs Alertmanager 0.20.0 on the "master" group under
# /usr/local/prometheus/alertmanager and runs it as a systemd service.
# The play can be re-run: the move step is skipped once the install
# directory exists, and the service is restarted exactly once, after the
# configuration file is in place.
- hosts: master
  remote_user: root
  gather_facts: false

  tasks:
    - name: 分发alertmanager二进制包
      unarchive: src=pkg/alertmanager-0.20.0.linux-amd64.tar.gz dest=/tmp

    - name: 创建文件夹
      file: dest=/usr/local/prometheus state=directory

    - name: 创建数据目录
      file: dest=/data/prometheus/alertmanager/data state=directory

    - name: 创建用户
      user: name=prometheus state=present

    # "creates" keeps a second run from moving the freshly unpacked tree
    # *inside* the already existing install directory.
    - name: 文件重命名
      command: mv /tmp/alertmanager-0.20.0.linux-amd64 /usr/local/prometheus/alertmanager
      args:
        creates: /usr/local/prometheus/alertmanager

    # Copied before the first start so the service is never started with
    # whatever configuration happens to be in the unpacked tree.
    - name: 拷贝配置文件到master
      copy: src=conf/alertmanager.yml dest=/usr/local/prometheus/alertmanager/alertmanager.yml

    - name: 把文件划到组
      command: chown -R prometheus:prometheus /usr/local/prometheus /data/prometheus

    - name: 拷贝alertmanager.service启动文件
      copy: src=conf/alertmanager.service dest=/usr/lib/systemd/system/alertmanager.service

    # daemon_reload makes systemd re-read the unit file when it was changed.
    - name: 启动服务并设置开机自启
      systemd: name=alertmanager state=restarted enabled=yes daemon_reload=yes

    # Give the daemon time to bind before checking, otherwise the grep below
    # can fail just because it ran too early.
    - name: 等待9093端口监听
      wait_for: port=9093 timeout=30

    - name: 查看状态并将结果注入到alertmanager变量
      shell: ss -nutlp | grep 9093
      register: alertmanager
      changed_when: false

    - name: 将结果输出到控制台
      debug: var=alertmanager.stdout_lines
[root@ansible-11 promethes]# cat Blackbox.yaml 
---
# Installs blackbox_exporter 0.16.0 on the "master" group under
# /usr/local/prometheus/blackbox_exporter and runs it as a systemd service.
# The play can be re-run: the move step is skipped once the install
# directory exists.
- hosts: master
  remote_user: root
  gather_facts: false

  tasks:
    - name: 分发blackbox_exporter二进制包
      unarchive: src=pkg/blackbox_exporter-0.16.0.linux-amd64.tar.gz dest=/tmp

    - name: 创建文件夹
      file: dest=/usr/local/prometheus state=directory

    - name: 创建用户
      user: name=prometheus state=present

    # "creates" keeps a second run from moving the freshly unpacked tree
    # *inside* the already existing install directory.
    - name: 文件重命名
      command: mv /tmp/blackbox_exporter-0.16.0.linux-amd64 /usr/local/prometheus/blackbox_exporter
      args:
        creates: /usr/local/prometheus/blackbox_exporter

    - name: 把文件划到组
      command: chown -R prometheus:prometheus /usr/local/prometheus

    - name: 拷贝blackbox_exporter.service启动文件
      copy: src=conf/blackbox_exporter.service dest=/usr/lib/systemd/system/blackbox_exporter.service

    # daemon_reload makes systemd re-read the unit file when it was changed.
    - name: 启动服务并设置开机自启
      systemd: name=blackbox_exporter state=restarted enabled=yes daemon_reload=yes

    # Give the exporter time to bind before checking the port.
    - name: 等待9115端口监听
      wait_for: port=9115 timeout=30

    - name: 查看状态并将结果注入到blackbox_exporter变量
      shell: ss -nutlp | grep 9115
      register: blackbox_exporter
      changed_when: false

    - name: 将结果输出到控制台
      debug: var=blackbox_exporter.stdout_lines
[root@ansible-11 promethes]# cat Grafana.yaml 
---
# Installs Grafana 6.1.3 from the bundled rpm on the "master" group, adds the
# pie chart panel plugin and starts grafana-server.
- hosts: master
  remote_user: root
  gather_facts: false

  tasks:
    - name: 拷贝grafana rpm文件
      copy: src=pkg/grafana-6.1.3-1.x86_64.rpm dest=/root/grafana-6.1.3-1.x86_64.rpm

    # Absolute path: a bare file name only resolves when the module happens
    # to run with /root as its working directory.
    - name: 安装grafana
      yum: name=/root/grafana-6.1.3-1.x86_64.rpm state=present

    # Installed before the service is started so one restart is enough.
    # "creates" skips the download when the plugin directory already exists.
    # NOTE(review): assumes the rpm's default plugin directory
    # /var/lib/grafana/plugins - adjust if grafana.ini overrides it.
    - name: 在grafana所在server安装饼图插件
      command: grafana-cli plugins install grafana-piechart-panel
      args:
        creates: /var/lib/grafana/plugins/grafana-piechart-panel

    - name: 启动服务并设置开机自启
      systemd: name=grafana-server state=restarted enabled=yes daemon_reload=yes

    # Give Grafana time to bind before checking the port.
    - name: 等待3000端口监听
      wait_for: port=3000 timeout=60

    - name: 查看状态并将结果注入到grafana变量
      shell: ss -nutlp | grep 3000
      register: grafana
      changed_when: false

    - name: 将结果输出到控制台
      debug: var=grafana.stdout_lines
[root@ansible-11 promethes]# cat hosts 
# Host targeted by the Prometheus, Grafana, Alertmanager, Pushgateway and
# Blackbox playbooks (they all use "hosts: master").
[master]
10.1.234.110
# Hosts targeted by node_exporter.yaml ("hosts: node"); the master is listed
# here as well, so it gets a node_exporter too.
[node]
10.1.234.110
10.1.234.111
10.1.234.112

 

[root@ansible-11 promethes]# cat node_exporter.yaml 
---
# Installs node_exporter 1.0.0-rc.1 on every host of the "node" group under
# /usr/local/prometheus/node_exporter and runs it as a systemd service.
# The play can be re-run: the move step is skipped once the install
# directory exists.
- hosts: node
  remote_user: root
  gather_facts: false

  tasks:
    - name: 分发node_exporter二进制包
      unarchive: src=pkg/node_exporter-1.0.0-rc.1.linux-amd64.tar.gz dest=/tmp

    - name: 创建文件夹
      file: dest=/usr/local/prometheus state=directory

    - name: 创建用户
      user: name=prometheus state=present

    # "creates" keeps a second run from moving the freshly unpacked tree
    # *inside* the already existing install directory.
    - name: 文件重命名
      command: mv /tmp/node_exporter-1.0.0-rc.1.linux-amd64 /usr/local/prometheus/node_exporter
      args:
        creates: /usr/local/prometheus/node_exporter

    - name: 把文件划到组
      command: chown -R prometheus:prometheus /usr/local/prometheus

    - name: 拷贝node_exporter.service启动文件
      copy: src=conf/node_exporter.service dest=/usr/lib/systemd/system/node_exporter.service

    # daemon_reload makes systemd re-read the unit file when it was changed.
    - name: 启动服务并设置开机自启
      systemd: name=node_exporter state=restarted enabled=yes daemon_reload=yes

    # Give the exporter time to bind before checking the port.
    - name: 等待9100端口监听
      wait_for: port=9100 timeout=30

    - name: 查看状态并将结果注入到node_exporter变量
      shell: ss -nutlp | grep 9100
      register: node_exporter
      changed_when: false

    - name: 将结果输出到控制台
      debug: var=node_exporter.stdout_lines
[root@ansible-11 promethes]# cat Prometheus.yaml 
---
# Installs Prometheus 2.8.1 on the "master" group under
# /usr/local/prometheus/prometheus, with data in /data/prometheus/data, and
# runs it as a systemd service.
# The play can be re-run: the move step is skipped once the install
# directory exists.
- hosts: master
  remote_user: root
  gather_facts: false

  tasks:
  - name: 分发prometheus二进制包
    unarchive: src=pkg/prometheus-2.8.1.linux-amd64.tar.gz dest=/tmp

  - name: 创建文件夹
    file: dest=/usr/local/prometheus state=directory

  - name: 创建数据目录
    file: dest=/data/prometheus/data state=directory

  # "creates" keeps a second run from moving the freshly unpacked tree
  # *inside* the already existing install directory.
  - name: 文件重命名
    command: mv /tmp/prometheus-2.8.1.linux-amd64 /usr/local/prometheus/prometheus
    args:
      creates: /usr/local/prometheus/prometheus

  - name: 创建用户
    user: name=prometheus state=present

  - name: 把文件划到组
    command: chown -R prometheus:prometheus /usr/local/prometheus /data/prometheus

  - name: 拷贝prometheus.service 启动文件
    copy: src=conf/prometheus.service dest=/usr/lib/systemd/system/prometheus.service

  # daemon_reload makes systemd re-read the unit file when it was changed.
  - name: 启动服务并设置开机自启
    systemd: name=prometheus state=restarted enabled=yes daemon_reload=yes

  # Give Prometheus time to bind before checking the port.
  - name: 等待9090端口监听
    wait_for: port=9090 timeout=60

  - name: 查看状态并将结果注入到prometheus变量
    shell: ss -nutlp | grep 9090
    register: prometheus
    changed_when: false

  - name: 将结果输出到控制台
    debug: var=prometheus.stdout_lines
[root@ansible-11 promethes]# cat Pushgateway.yaml 
---
# Installs Pushgateway 0.4.0 on the "master" group under
# /usr/local/prometheus/pushgateway and runs it as a systemd service.
# The play can be re-run: the move step is skipped once the install
# directory exists.
- hosts: master
  remote_user: root
  gather_facts: false

  tasks:
  - name: 分发pushgateway二进制包
    unarchive: src=pkg/pushgateway-0.4.0.linux-amd64.tar.gz dest=/tmp

  - name: 创建文件夹
    file: dest=/usr/local/prometheus state=directory

  # "creates" keeps a second run from moving the freshly unpacked tree
  # *inside* the already existing install directory.
  - name: 文件重命名
    command: mv /tmp/pushgateway-0.4.0.linux-amd64 /usr/local/prometheus/pushgateway
    args:
      creates: /usr/local/prometheus/pushgateway

  - name: 创建用户
    user: name=prometheus state=present

  - name: 把文件划到组
    command: chown -R prometheus:prometheus /usr/local/prometheus

  - name: 拷贝pushgateway.service 启动文件
    copy: src=conf/pushgateway.service dest=/usr/lib/systemd/system/pushgateway.service

  # daemon_reload makes systemd re-read the unit file when it was changed.
  - name: 启动服务并设置开机自启
    systemd: name=pushgateway state=restarted enabled=yes daemon_reload=yes

  # Give the gateway time to bind before checking the port.
  - name: 等待9091端口监听
    wait_for: port=9091 timeout=30

  - name: 查看状态并将结果注入到pushgateway变量
    shell: ss -nutlp | grep 9091
    register: pushgateway
    changed_when: false

  - name: 将结果输出到控制台
    debug: var=pushgateway.stdout_lines
[root@ansible-11 conf]# ls
alertmanager.back.yml  alertmanager.service  alertmanager.yml  blackbox_exporter.service  node_exporter.service  prometheus.service  pushgateway.service
[root@ansible-11 reload_Promethes]# cat reload_promethes.yaml 
---
# Pushes prometheus.yml and the alert rule file to the master and restarts
# Prometheus. The new configuration is checked with promtool first, so a
# broken file stops the play instead of taking the running server down.
- hosts: master
  gather_facts: no
  tasks:
  - name: 更新配置文件
    copy:
      src: prometheus.yml
      dest: /usr/local/prometheus/prometheus/prometheus.yml

  - name: 创建规则目录
    file: dest=/usr/local/prometheus/prometheus/rules state=directory

  - name: 拷贝规则文件
    copy: src=node.yml dest=/usr/local/prometheus/prometheus/rules/node.yml

  - name: 把文件划到组
    command: chown -R prometheus:prometheus /usr/local/prometheus

  # Runs after the rules are copied because prometheus.yml references
  # rules/node.yml relative to its own directory.
  # NOTE(review): assumes promtool sits next to the prometheus binary, as in
  # the upstream release tarball - confirm for repackaged builds.
  - name: 校验配置文件和规则
    command: /usr/local/prometheus/prometheus/promtool check config /usr/local/prometheus/prometheus/prometheus.yml
    changed_when: false

  - name: 重启服务
    systemd: name=prometheus state=restarted

  # Give Prometheus time to bind before checking the port.
  - name: 等待9090端口监听
    wait_for: port=9090 timeout=60

  - name: 查看状态并将结果注入到prometheus变量
    shell: ss -nutlp | grep 9090
    register: prometheus
    changed_when: false

  - name: 将结果输出到控制台
    debug: var=prometheus.stdout_lines
[root@ansible-11 reload_Promethes]# cat prometheus.yml
# Prometheus server configuration: global intervals, the Alertmanager to
# notify, the rule files to load and the targets to scrape.
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 127.0.0.1:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
# Relative paths are resolved against the directory of this file.
rule_files:
  - "rules/node.yml"
#   - "second_rules.yml"

# Scrape configurations: Prometheus itself, one job per node_exporter host
# and the Pushgateway.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['localhost:9090']

  - job_name: 'node-1'
    static_configs:
    - targets: ['10.1.234.110:9100']

  - job_name: 'node-2'
    static_configs:
    - targets: ['10.1.234.111:9100']

  - job_name: 'node-3'
    static_configs:
    - targets: ['10.1.234.112:9100']

  - job_name: 'pushgateway'
    # Keep the job/instance labels that clients pushed. Without this the
    # pushed "instance" (the reporting host) is renamed to "exported_instance"
    # and every series carries the Pushgateway's own address instead.
    honor_labels: true
    static_configs:
    - targets: ['10.1.234.110:9091']
[root@ansible-11 reload_Promethes]# cat node.yml 
# groups: list of alerting rule groups
groups:
# name: name of this rule group
- name: general.rules
  # rules: the alerting rules that belong to the group
  rules:
  # alert: name of the alert. This one reports high usage of the root filesystem.
  - alert: NodeFilesystemUsage_disk
    # expr: used percentage = 100 - free / size * 100; the alert condition is "> 80".
    # Only mountpoint "/" on ext4/xfs is selected on the left-hand side.
    expr: 100 - (node_filesystem_free_bytes{mountpoint="/",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
    # for: how long the expression must stay true before the alert fires (here 1 minute)
    for: 1m
    # labels: extra labels attached to the alert
    labels:
      # severity: alert level
      severity: warning
    # annotations: text sent with the notification
    annotations:
      # The {{ $labels.* }} / {{ $value }} templates are filled in from the firing series
      summary: "Instance {{ $labels.instance  }} :{{ $labels.mountpoint }} 分区使用率过高" # short summary
      description: "{{ $labels.instance  }} : {{ $labels.job  }} :{{ $labels.mountpoint  }} 这个分区使用大于百分之80% (当前值:{{ $value }})" # detailed description
[root@ansible-11 promethes]# cat inster_promethes.sh 
#!/bin/bash
#
# Interactive installer for the Prometheus stack.
#
# Asks for the addresses of the master and of the two additional nodes,
# replaces the sample addresses in every text file below this directory,
# runs the install playbooks and finally the config/rule reload playbook.
# The script stops at the first failing step.
#
# Usage: bash inster_promethes.sh
#
# Note: this script is part of the tree it rewrites, so the three address
# constants below are replaced as well. That is what lets a later run find
# the addresses that were chosen on the previous run.

set -euo pipefail

readonly OLD_MASTER_IP='10.1.234.110'
readonly OLD_NODE1_IP='10.1.234.111'
readonly OLD_NODE2_IP='10.1.234.112'

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Abort unless $1 looks like a dotted-quad IPv4 address. The value is later
# spliced into a sed program, so anything else must be rejected.
require_ipv4() {
  [[ $1 =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]] || die "invalid IPv4 address: '$1'"
}

#######################################
# Replace the three sample addresses in all text files below ".".
# Arguments: $1 - new master address
#            $2 - new node1 address
#            $3 - new node2 address
#######################################
replace_addresses() {
  local new_master=$1 new_node1=$2 new_node2=$3
  # Control characters used as intermediate markers. Going old -> marker ->
  # new means a new address that equals another *old* address is not
  # replaced a second time.
  local -r mark_master=$'\001' mark_node1=$'\002' mark_node2=$'\003'
  local -a files=()
  local file

  # -l: file names only, -I: skip binaries (the tarballs under pkg/),
  # -F: the dots are literal, -Z: NUL-terminated names.
  while IFS= read -r -d '' file; do
    files+=("$file")
  done < <(grep -rlIFZ -e "$OLD_MASTER_IP" -e "$OLD_NODE1_IP" -e "$OLD_NODE2_IP" .)

  (( ${#files[@]} )) || return 0

  sed -i \
    -e "s#${OLD_MASTER_IP//./\\.}#${mark_master}#g" \
    -e "s#${OLD_NODE1_IP//./\\.}#${mark_node1}#g" \
    -e "s#${OLD_NODE2_IP//./\\.}#${mark_node2}#g" \
    -e "s#${mark_master}#${new_master}#g" \
    -e "s#${mark_node1}#${new_node1}#g" \
    -e "s#${mark_node2}#${new_node2}#g" \
    -- "${files[@]}"
}

main() {
  local x y z playbook

  # Everything below works on relative paths, so run from the script's
  # own directory no matter where it was started from.
  cd -- "$(dirname -- "${BASH_SOURCE[0]}")" || die "cannot enter script directory"

  echo '--------我只用了3台搭建,增加node节点请自己加---------'
  read -r -p '请输入promethesmaster的ip地址:' x
  read -r -p '请输入promethes node1的ip地址:' y
  read -r -p '请输入promethes node2的ip地址:' z
  require_ipv4 "$x"
  require_ipv4 "$y"
  require_ipv4 "$z"

  replace_addresses "$x" "$y" "$z"

  for playbook in Prometheus.yaml node_exporter.yaml Grafana.yaml \
    Alertmanager.yaml Pushgateway.yaml; do
    ansible-playbook -i hosts "$playbook"
  done
  echo 'promethes安装完成'

  cd reload_Promethes || die "cannot enter reload_Promethes"
  ansible-playbook -i hosts reload_promethes.yaml
  echo '规则添加完成,请导入模板'

  echo '-----------修改报警的微信不要用我的--------------'
}

main "$@"

 ##########################################################################

在 promethes 目录下直接执行 bash inster_promethes.sh(这是一个shell脚本,不能用 ansible-playbook 执行),大概4-5分钟安装完成,测试报警完成

看起来有点繁琐,后面再改进

###################################################################################

企业微信告警配置

 

 

[root@ansible-11 ~]# cat ansible/prometheus/conf/alertmanager.yml 
# Alertmanager configuration: SMTP defaults, the routing tree and the
# receivers (e-mail and WeChat Work).
global:
  # Time after which an alert that carries no end time and has not been
  # updated is declared resolved.
  resolve_timeout: 2m
  # SMTP settings used by email receivers
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: '18802676921@163.com'
  smtp_auth_username: '18802676921@163.com'
  # Never publish or commit the real password; fill it in on the target host.
  smtp_auth_password: 'CHANGE_ME_SMTP_PASSWORD'
# Directory or files with custom notification templates.
#templates:
#  - '/usr/local/prometheus/alertmanager/template/wechat.tmpl'
# Root of the routing tree; every incoming alert enters here.
route:

  # Alerts that share the values of these labels are batched into one group,
  # e.g. all alerts with alertname=LatencyHigh. The label must exist on the
  # alerts: grouping by a label no alert carries puts everything into a
  # single group.
  group_by: ['alertname']

  # How long to wait before sending the first notification for a new group,
  # so that alerts arriving shortly after each other are sent together.
  group_wait: 10s

  # How long to wait before notifying about new alerts added to a group that
  # has already been notified.
  group_interval: 10s

  # Default receiver
  receiver: 'wechat'

  # How long to wait before re-sending a notification that was already sent
  # successfully.
  repeat_interval: 1h
receivers:
# E-mail receiver (not referenced by the route above)
- name: 'email'
  email_configs:
  - to: '582167559@qq.com'
    send_resolved: true
- name: 'wechat'
  wechat_configs:
  - corp_id: 'wwab37c47350318435'
    to_party: '2'
    agent_id: '1000002'
    # Never publish or commit the real secret; use the one of your own
    # WeChat Work application.
    api_secret: 'CHANGE_ME_WECHAT_API_SECRET'
    send_resolved: true

####################################################

放入周期性计划任务

[root@test_dc_rpdns_com ~]# crontab -l
# Shell and command search path used for the jobs below
SHELL=/bin/sh
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin


# Disabled job (schedule: minute 0 of every hour)
#0 */1 * * *  /win/sh/rm_laji_log_data.sh
# Every 5 minutes (script content not shown in this document)
*/5 * * * * /root/node_exporter_shell.sh
# Every 5 minutes: push the port-listening metrics to the Pushgateway
*/5 * * * * /root/port.sh
[root@test_dc_rpdns_com ~]# cat port.sh 
#!/bin/bash
#
# Reports to the Pushgateway how many sockets are listening on a fixed set
# of ports. One metric per port, named node_port_<port>; the value is the
# number of listening sockets, so 0 means nothing is listening.
# The metrics are pushed under job "pushgateway" with the short host name
# as the instance.
#
# Usage: run from cron, e.g. */5 * * * * /root/port.sh

set -euo pipefail

readonly PUSHGATEWAY_URL='http://10.1.234.110:9091'
readonly PORTS=(8500 9201 5601)

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Count listening TCP/UDP sockets whose local port is exactly $1.
# Only the local address column is compared, so PIDs or other ports that
# merely contain the same digits are not counted.
# Arguments: $1 - port number
# Outputs:   the count on stdout
#######################################
count_listeners() {
  local port=$1
  ss -ntul | awk -v suffix=":${port}" '
    NR > 1 && substr($5, length($5) - length(suffix) + 1) == suffix { n++ }
    END { print n + 0 }'
}

main() {
  local fqdn instance_name port payload=''

  # Short host name, used as the "instance" grouping label.
  fqdn=$(hostname -f) || die "cannot determine host name"
  instance_name=${fqdn%%.*}
  [[ -n "$instance_name" ]] || die "Hostname must not be empty"
  # With "localhost" nobody could tell which machine the data came from.
  if [[ "$instance_name" == "localhost" ]]; then
    die "Hostname must not localhost"
  fi

  for port in "${PORTS[@]}"; do
    payload+="node_port_${port} $(count_listeners "$port")"$'\n'
  done

  # One request for all metrics; --fail turns an HTTP error into a non-zero
  # exit status so cron can report it.
  printf '%s' "$payload" \
    | curl --silent --show-error --fail --max-time 10 --data-binary @- \
        "${PUSHGATEWAY_URL}/metrics/job/pushgateway/instance/${instance_name}"
}

main "$@"

 

  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
一、prometheus简介 Prometheus是一个开源的系统监控和告警系统,现在已经加入到CNCF基金会,成为继k8s之后第二个在CNCF维护管理的项目,在kubernetes容器管理系统中,通常会搭配prometheus进行监控,prometheus支持多种exporter采集数据,还支持通过pushgateway进行数据上报,Prometheus在性能上可支撑上万台规模的集群。 二、prometheus架构图 三、prometheus组件介绍 1.Prometheus Server: 用于收集和存储时间序列数据。 2.Client Library: 客户端库,检测应用程序代码,当Prometheus抓取实例的HTTP端点时,客户端库会将所有跟踪的metrics指标的当前状态发送到prometheus server端。 3.Exporters: prometheus支持多种exporter,通过exporter可以采集metrics数据,然后发送到prometheus server端。 4.Alertmanager: 从 Prometheus server 端接收到 alerts 后,会进行去重,分组,并路由到相应的接收方,发出报警,常见的接收方式有:电子邮件,微信,钉钉, slack等。 5.Grafana:监控仪表盘 6.pushgateway: 各个目标主机可上报数据到pushgateway,然后prometheus server统一从pushgateway拉取数据。 四、课程亮点 五、效果图展示 六、讲师简介 先超(lucky):高级运维工程师、资深DevOps工程师,在互联网上市公司拥有多年一线运维经验,主导过亿级pv项目的架构设计和运维工作 主要研究方向: 1.云计算方向:容器 (kubernetes、docker),虚拟化(kvm、Vmware vSphere),微服务(istio),PaaS(openshift),IaaS(openstack)等2.系统/运维方向:linux系统下的常用组件(nginx,tomcat,elasticsearch,zookeeper,kafka等),DevOps(Jenkins+gitlab+sonarqube+nexus+k8s),CI/CD,监控(zabbix、prometheus、falcon)等 七、课程大纲
好的,以下是一个基于Docker Compose搭建Prometheus、Grafana、Alertmanager、Blackbox Exporter和Node Exporter的示例:

```yaml
version: "3"
services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.enable-admin-api'
    restart: always
  grafana:
    image: grafana/grafana
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - ./grafana:/var/lib/grafana
    restart: always
  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/config.yml:/etc/alertmanager/config.yml
    command:
      - '--config.file=/etc/alertmanager/config.yml'
    restart: always
  blackbox-exporter:
    image: prom/blackbox-exporter
    container_name: blackbox-exporter
    ports:
      - "9115:9115"
    volumes:
      - ./blackbox-exporter/config.yml:/etc/blackbox-exporter/config.yml
    command:
      - '--config.file=/etc/blackbox-exporter/config.yml'
    restart: always
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    ports:
      - "9100:9100"
    restart: always
```

需要注意的是,这里的配置文件都需要自己创建并且挂载到对应的容器中。例如,prometheus.yml、config.yml和config.yml分别对应Prometheus、Alertmanager和Blackbox Exporter的配置文件。同时,Grafana的数据目录也需要挂载到主机上以便数据持久化。 另外,需要注意的是这只是一个示例,具体的配置文件需要根据实际情况进行修改。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值