一、安装docker和docker-compose
root@tjsc:~# apt install docker.io docker-compose -y
完成后可查看docker及docker-compose 版本
root@tjsc:~# docker -v
Docker version 24.0.5, build 24.0.5-0ubuntu1~20.04.1
root@tjsc:~# docker-compose -v
docker-compose version 1.25.0, build unknown
二、容器编排
创建docker-compose.yml文件
root@tjsc:~# touch docker-compose.yml
将下列内容复制到该文件中
# Monitoring stack: node-exporter, Prometheus, cAdvisor, Grafana,
# Alertmanager and blackbox-exporter. Every service publishes its port on
# the host and restarts automatically.
version: '2'
services:
  # Host-level metrics exporter.
  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    # Point the exporter at the host filesystems bind-mounted below; without
    # these flags it reads the container's own /proc, /sys and / instead and
    # the mounts are unused.
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    restart: always

  # Prometheus server; ./prometheus/conf/ on the host becomes /etc/prometheus/
  # in the container, so the main config and the rule file go there.
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
      - ./prometheus/conf/:/etc/prometheus/
    ports:
      - '9090:9090'

  # Per-container resource metrics.
  # NOTE(review): upstream now publishes cAdvisor as gcr.io/cadvisor/cadvisor
  # and the Docker Hub image is no longer updated; consider switching if that
  # registry is reachable from this host.
  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - '8080:8080'

  # Dashboards; data is persisted in ./grafana/grafana-storage.
  # The UI is published on host port 13000 (container port 3000).
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    hostname: grafana
    restart: always
    volumes:
      - ./grafana/grafana-storage:/var/lib/grafana
    ports:
      - '13000:3000'

  # Alert routing; the config file and the notification templates are
  # mounted from the host.
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - ./prometheus/conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./prometheus/template:/etc/alertmanager/template
    ports:
      - '9093:9093'

  # Probe exporter; no config is mounted, so the configuration shipped in
  # the image is used.
  blackbox-exporter:
    image: prom/blackbox-exporter:latest
    container_name: blackbox-exporter
    hostname: blackbox-exporter
    restart: always
    ports:
      - '9115:9115'
在 docker-compose.yml 所在的目录下,使用 docker-compose up -d 在后台启动容器
root@tjsc:~# docker-compose up -d
停止并删除容器的方法
root@tjsc:~# docker-compose down
二、初始配置
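- 目录结构如下(可自行创建)。如 docker-compose up -d 命令运行失败或报错,请检查目录权限(例如 ./grafana/grafana-storage 必须对容器内运行 Grafana 的用户可写):
  .
  ├── docker-compose.yml
  ├── grafana
  │   └── grafana-storage
  └── prometheus
      ├── conf
      │   ├── alertmanager.yml
      │   ├── prometheus.yml
      │   └── rules.yml
      └── template
          └── wechat.tmpl
- 初始配置,文件位置见图片(与上面的目录结构一致)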
2.1 prometheus.yml (该文件配置要监控的主机、指定规则文件等;文件名区分大小写,必须为全小写的 prometheus.yml)
# Prometheus main configuration: scrape targets, rule files and the
# Alertmanager endpoint.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:  # address of the Alertmanager instance that receives alerts
  alertmanagers:
    - static_configs:
        - targets: ['192.168.136.100:9093']

rule_files:  # alerting rule files to load
  - "rules.yml"

scrape_configs:
  # Host metrics (port 9100) and container metrics (port 8080).
  - job_name: 'nodehost'
    static_configs:
      - targets: ['192.168.136.100:9100', '192.168.136.100:8080']
        labels:
          appname: 'Node1'

  # Prometheus scraping itself.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['192.168.136.100:9090']
        labels:
          appname: 'prometheus'

  # TCP port check performed through blackbox-exporter.
  - job_name: 'prometheus_port_status'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      # The address listed here is the endpoint to be probed, not the
      # address that Prometheus scrapes.
      - targets: ['192.168.136.100:12345']
        labels:
          # NOTE(review): this value is replaced by the second relabel rule
          # below, which sets `instance` to the probed address.
          instance: 'port_status'
          group: 'tcp'
    relabel_configs:  # blackbox-exporter wiring
      # Pass the listed target to the exporter as the `target` URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed address visible as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to blackbox-exporter.
      - target_label: __address__
        replacement: 192.168.136.100:9115
2.2 rules.yml (该文件配置报警规则)
# Alerting rules loaded by Prometheus.
groups:
  - name: example  # rule group name
    rules:
      - alert: InstanceDown  # alert name
        # PromQL condition that fires the alert: the probe failed.
        expr: probe_success == 0
        # The condition must hold this long before the alert fires.
        for: 30s
        labels:  # severity and affected host attached to the alert
          # Templated so the label carries the failing target; the original
          # literal string "instance" was identical for every alert.
          name: "{{ $labels.instance }}"
          severity: Critical
        annotations:
          # Alert summary taken from the series' `appname` label.
          # NOTE(review): the probe job in prometheus.yml sets only `instance`
          # and `group`, so this may render empty - confirm the intended label.
          summary: " {{ $labels.appname }}"
          description: " 服务停止运行 "  # alert message text
          # Current value of the expression. probe_success is 0 or 1, not a
          # percentage, so no "%" suffix is appended.
          value: "{{ $value }}"
2.3 alertmanager.yml(该文件配置报警方式)
# Alertmanager configuration: every alert is routed to a single WeChat Work
# (enterprise WeChat) receiver.
global:
  resolve_timeout: 5m
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'

# Notification templates, read from the path inside the container.
templates:
  - '/etc/alertmanager/template/wechat.tmpl'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 5m
  receiver: 'wechat'

receivers:
  - name: 'wechat'
    wechat_configs:
      - corp_id: <企业ID>       # WeChat Work corporation ID (placeholder, replace)
        to_party: '1'          # WeChat Work department ID
        agent_id: '1000002'    # AgentId of the WeChat Work application
        api_secret: <secret>   # secret of the WeChat Work application (placeholder, replace)
        send_resolved: true    # also notify when the alert resolves