一、部署架构
服务器IP | 共同角色 | 单独角色 |
---|---|---|
192.168.11.193 | prometheus1,sidecar1,alertmanager1,query1,storer1 | minio,traefik,grafana |
192.168.11.194 | prometheus2,sidecar2,alertmanager2,query2,storer2 | compactor,domain_exporter,bucket,ruler |
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ Thanos │───────────┬────▶ │ Thanos Store │ │ Thanos │
│ Query │ │ │ Gateway │ │ Compactor │
└──────────────┘ │ └──────────────┘ └──────────────┘
push │ │ │
┌──────────────┐ alerts ┌──────────────┐ │ │ storages │ Downsample &
│ Alertmanager │ ◀──────────│ Thanos │ ◀────┤ │ query metrics │ compact blocks
│ (*) │ │ Ruler │ │ │ │
└──────────────┘ └──────────────┘ │ ▼ │
▲ │ │ ┌────────────────┐ │
│ push alerts └──────────────│────▶ │ MinIO® (*) │ ◀─────────┘
│ │ │ │
┌ ── ── ── ── ── ── ── ── ── ──┐ │ └────────────────┘
│┌────────────┐ ┌────────────┐│ │ ▲
││ Prometheus │─▶│ Thanos ││ ◀────────────────┘ │
││ (*) │◀─│ Sidecar (*)││ query │ inspect
│└────────────┘ └────────────┘│ metrics │ blocks
└ ── ── ── ── ── ── ── ── ── ──┘ │
┌──────────────┐
│ Thanos │
│ Bucket Web │
└──────────────┘
二、 193节点上部署
- 2.1 minio是thanos需要的obj存储,必须先安装
mkdir -p /data/minio/{data,config}
cat > /data/minio/start.sh << 'EOF'
docker run -d \
-p 9000:9000 \
-p 9001:9001 \
--name minio \
--restart=always \
-e "MINIO_ROOT_USER=admin" \
-e "MINIO_ROOT_PASSWORD=admin123456" \
-e "MINIO_PROMETHEUS_AUTH_TYPE=public" \
-v /data/minio/data:/data/minio/data \
-v /data/minio/config:/root/.minio \
-v /etc/localtime:/etc/localtime \
minio/minio \
server /data/minio/data \
--console-address ":9001"
EOF
bash /data/minio/start.sh
- 访问minio
http://192.168.11.193:9001
帐号 : admin
密码 : admin123456
- 创建存储桶(名称为thanos,需与bucket_config.yaml中的bucket一致)
三、193,194节点上部署(prometheus,sidecar,alertmanager,query,storer)
- 3.1、 prometheus
prometheus集群安装时,各节点的external_labels必须不同(例如194节点把replica改为B)
#创建prometheus工作目录
mkdir /data/prometheus/{data,conf,conf/rules,conf/sd_config} -p
chown -R 65534:65534 /data/prometheus/data
#prometheus配置文件
cat > /data/prometheus/conf/prometheus.yml << 'EOF'
global:
scrape_interval: 30s
evaluation_interval: 30s
scrape_timeout: 10s
#以下external_labels,不同的prometheus有不同的值,不然会导致安装失败
external_labels:
region: GuangZhou
replica: A
#加载警报规则(注意:这里只匹配.rules后缀,下文创建的alert.yml不会被加载;如需加载请改为*.yml或将文件命名为alert.rules)
rule_files:
- "/etc/prometheus/rules/*.rules"
#集成alertmanager高可用
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.11.193:9093
- 192.168.11.194:9093
timeout: 10s
scrape_configs:
#prometheus自身的监控
- job_name: prometheus
metrics_path: '/metrics' #默认
scheme: 'http' #默认
scrape_interval: 30s #覆盖全局
static_configs:
- targets: ['localhost:9090']
labels:
instance: prometheus
- job_name: grafana
metrics_path: /metrics
static_configs:
- targets:
- 192.168.11.193:3000
- job_name: domain
metrics_path: /probe
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: domain
- target_label: __address__
replacement: 192.168.11.194:9222
static_configs:
- targets:
- www.21cn.com
- www.163.com
- www.ifeng.com
- www.qq.com
- www.sina.com.cn
- job_name: thanos_sidecar
static_configs:
- targets:
- 192.168.11.193:19191
- 192.168.11.194:19191
- 192.168.11.194:29191
EOF
cat >/data/prometheus/conf/rules/alert.yml<< 'EOF'
groups:
- name: prometheus
rules:
- alert: prometheus节点UP状态
expr: up{job="prometheus"} == 0
for: 1m
labels:
severity: 严重
team: node-prometheus
annotations:
summary: "{{ $labels.job }} 已停止运行超过 1分钟!"
description: "{{ $labels.instance }} 异常停止,请尽快处理!"
value: '{{ $value }}'
EOF
#启动脚本
cat > /data/prometheus/start.sh << 'EOF'
docker run -d \
--name prometheus \
--restart=always \
-p 9090:9090 \
-v /data/prometheus/conf/prometheus.yml:/etc/prometheus/prometheus.yml \
-v /data/prometheus/conf/rules:/etc/prometheus/rules \
-v /data/prometheus/conf/sd_config:/etc/prometheus/sd_config \
-v /data/prometheus/data:/data/prometheus/data \
-v /etc/localtime:/etc/localtime:ro \
prom/prometheus:v2.28.0 \
--web.read-timeout=5m \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/data/prometheus/data \
--web.max-connections=512 \
--storage.tsdb.retention=30d \
--query.timeout=2m \
--web.enable-lifecycle \
--web.listen-address=:9090 \
--web.enable-admin-api \
--web.console.libraries=/usr/share/prometheus/console_libraries \
--web.console.templates=/usr/share/prometheus/consoles \
--storage.tsdb.min-block-duration=2h \
--storage.tsdb.max-block-duration=2h
EOF
bash /data/prometheus/start.sh
- 3.2、alertmanager高可用
#创建alertmanager工作目录
mkdir /data/alertmanager/{conf,template,data} -p
chown -R 65534:65534 /data/alertmanager/data
#alertmanager配置文件
cat > /data/alertmanager/conf/alertmanager.yml << 'EOF'
global:
resolve_timeout: 1m
smtp_from: 'xxxxxxxx@qq.com'
smtp_smarthost: 'smtp.qq.com:465'
smtp_auth_username: 'xxxxxx@qq.com'
smtp_auth_password: 'XXXXXX'
smtp_require_tls: false
smtp_hello: 'qq.com'
templates:
- '/etc/alertmanager/email.tmpl' #邮件模板文件,容器内的路径
route:
receiver: 'wechat.webhook'
#按alertname等进行分组
group_by: ['alertname']
#周期内有同一组的报警到来则一起发送
group_wait: 1m
#报警发送周期
group_interval: 10m
#与上次相同的报警延迟30m才发送,这里应该是(10+30)m左右
repeat_interval: 30m
routes:
#可以使用match_re正则匹配
- match:
severity: 严重
#匹配上则发给下面的name=wechat.webhook的receivers
receiver: wechat.webhook
receivers:
#企微机器人(方法2)
- name: 'wechat.webhook'
webhook_configs:
- url: 'http://192.168.11.221:18089/alert0'
send_resolved: false
- name: 'web.hook'
webhook_configs:
- url: 'http://172.31.23.2:8080'
- name: 'email'
email_configs:
- to: 'xxxxxxxx@qq.com'
html: '{{ template "email.jwolf.html" . }}'
send_resolved: true
#抑制规则,(如果是critical时,抑制warning警报)
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance']
EOF
cat > /data/alertmanager/conf/email.tmpl<< 'EOF'
{{ define "email.jwolf.html" }}
{{ range $i ,$alert := .Alerts }}
=========start==========<br>
告警级别: {{ $alert.Labels.severity }} <br>
告警类型: {{ $alert.Labels.alertname }} <br>
故障主机: {{ $alert.Labels.instance }} <br>
告警主题: {{ $alert.Annotations.summary }} <br>
告警详情: {{ $alert.Annotations.description }} <br>
告警阈值: {{ $alert.Annotations.value }} <br>
触发时间: {{ $alert.StartsAt }} <br>
=========end==========<br>
{{ end }}
{{ end }}
EOF
#启动脚本
cat > /data/alertmanager/start.sh << 'EOF'
docker run -d \
--name alertmanager \
--restart=always \
-p 9093:9093 \
-p 9094:9094 \
-v /data/alertmanager/conf/:/etc/alertmanager/ \
-v /data/alertmanager/data:/alertmanager \
-v /etc/localtime:/etc/localtime:ro \
prom/alertmanager:v0.22.2 \
--config.file="/etc/alertmanager/alertmanager.yml" \
--cluster.listen-address="0.0.0.0:9094" \
--cluster.peer=192.168.11.193:9094 \
--cluster.peer=192.168.11.194:9094
EOF
bash /data/alertmanager/start.sh
- 3.3、thanos_sidecar
mkdir -p /data/thanos/conf
cat > /data/thanos/conf/bucket_config.yaml << 'EOF'
type: S3
config:
bucket: "thanos"
endpoint: "192.168.11.193:9000"
access_key: "admin"
secret_key: "admin123456"
insecure: true
EOF
cat > /data/thanos/conf/query.yaml << 'EOF'
#所有querier节点
- targets:
- 192.168.11.193:19192
- 192.168.11.194:19192
EOF
cat > /data/thanos/conf/store.yaml << 'EOF'
#所有sidecar节点和store节点
- targets:
#sidecar节点
- 192.168.11.193:19090
- 192.168.11.194:19090
#store节点
- 192.168.11.193:29090
- 192.168.11.194:29090
EOF
cat > /data/thanos/thanos_sidecar.sh << 'EOF'
docker run -d \
--name thanos_sidecar \
--restart=always \
--network host \
-v /data/prometheus/data:/data/prometheus/data \
-v /etc/localtime:/etc/localtime:ro \
-v /data/thanos/conf/bucket_config.yaml:/bucket_config.yaml \
improbable/thanos:v0.6.0 \
sidecar \
--tsdb.path=/data/prometheus/data \
--prometheus.url=http://127.0.0.1:9090 \
--objstore.config-file=/bucket_config.yaml \
--http-address=0.0.0.0:19191 \
--grpc-address=0.0.0.0:19090
#-p 19090:19090 \
#-p 19091:19091 \
EOF
bash /data/thanos/thanos_sidecar.sh
- 3.4、thanos_querier
mkdir -p /data/thanos/conf
cat > /data/thanos/thanos_querier.sh << 'EOF'
docker run -d \
--name thanos_querier \
--restart=always \
-l "traefik.enable=true" \
-l "traefik.port=19192" \
-l "traefik.frontend.rule=PathPrefix:/" \
-p 19192:19192 \
-v /data/thanos/conf/store.yaml:/store.yaml \
-v /etc/localtime:/etc/localtime:ro \
improbable/thanos:v0.6.0 \
query --http-address=0.0.0.0:19192 \
--store.sd-files=/store.yaml \
--query.replica-label=replica
#--store=192.168.11.193:19090 \
#--store=192.168.11.194:19090 \
#--store=192.168.11.194:29090 \
EOF
bash /data/thanos/thanos_querier.sh
访问querier: http://192.168.11.193:19192/graph
- 3.5、thanos_storer
cat > /data/thanos/thanos_storer.sh << 'EOF'
docker run -d \
--name thanos_storer \
--restart=always \
-p 29191:19191 \
-p 29090:19090 \
-v /etc/localtime:/etc/localtime:ro \
-v /data/thanos/conf/bucket_config.yaml:/bucket_config.yaml \
improbable/thanos:v0.6.0 \
store \
--data-dir=/var/thanos/store \
--objstore.config-file=/bucket_config.yaml \
--http-address=0.0.0.0:19191 \
--grpc-address=0.0.0.0:19090 \
--index-cache-size=1GB \
--chunk-pool-size=8GB
EOF
bash /data/thanos/thanos_storer.sh
四、193节点上安装
- 4.1、 grafana
#创建grafana工作目录
mkdir /data/grafana/data -p
chown 472 /data/grafana/data
#启动脚本
cat > /data/grafana/start.sh << 'EOF'
docker run -d \
--name grafana \
--restart=always \
--user 472 \
-p 3000:3000 \
-e TZ="Asia/Shanghai" \
-e "GF_SECURITY_ADMIN_PASSWORD=Grafana@2O21" \
-e "GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel" \
-v /data/grafana/data:/var/lib/grafana \
-v /etc/localtime:/etc/localtime:ro \
grafana/grafana:7.5.9
EOF
bash /data/grafana/start.sh
http://192.168.11.193:3000
用户:admin
密码:Grafana@2O21
#traefik在单机部署下可用,集群部署不可用
- 4.2、 traefik
mkdir /data/traefik/ -p
cat > /data/traefik/traefik.toml << 'EOF'
[global]
checkNewVersion = false
sendAnonymousUsage = false
[accessLog]
[log]
level = "INFO"
[ping]
[api]
[retry]
[docker]
endpoint = "unix:///var/run/docker.sock"
watch = true
exposedByDefault = false
EOF
cat > /data/traefik/start.sh << 'EOF'
docker run -d \
--name traefik \
--restart=always \
-p 80:80 \
-p 8080:8080 \
-v /etc/localtime:/etc/localtime:ro \
-v /data/traefik/traefik.toml:/traefik.toml \
-v /var/run/docker.sock:/var/run/docker.sock:ro \
traefik:v1.7
EOF
bash /data/traefik/start.sh
访问: http://192.168.11.193:8080
五、 194节点上部署
- 5.1、 domain_exporter
mkdir /data/domain_exporter/ -p
cat > /data/domain_exporter/start.sh << 'EOF'
docker run -d \
--name domain_exporter \
--restart=always \
-p 9222:9222 \
-v /etc/localtime:/etc/localtime:ro \
caarlos0/domain_exporter:v1
EOF
bash /data/domain_exporter/start.sh
- 5.2、thanos_compactor
cat > /data/thanos/thanos_compactor.sh << 'EOF'
docker run -d \
--name thanos_compactor \
--restart=always \
-v /data/thanos/conf/bucket_config.yaml:/bucket_config.yaml \
-v /data/thanos/compact_data:/var/thanos/compact \
-v /etc/localtime:/etc/localtime:ro \
improbable/thanos:v0.6.0 \
compact \
--data-dir=/var/thanos/compact \
--objstore.config-file=/bucket_config.yaml \
--http-address=0.0.0.0:19191 \
--wait
EOF
bash /data/thanos/thanos_compactor.sh
- 5.3、thanos_bucket
cat > /data/thanos/thanos_bucket.sh << 'EOF'
docker run -d \
--name thanos_bucket \
--restart=always \
-p 19194:19194 \
-v /etc/localtime:/etc/localtime:ro \
-v /data/thanos/conf/bucket_config.yaml:/bucket_config.yaml \
improbable/thanos:v0.6.0 \
bucket web \
--objstore.config-file=/bucket_config.yaml \
--listen="0.0.0.0:19194"
EOF
bash /data/thanos/thanos_bucket.sh
访问bucket
http://192.168.11.194:19194
命令行
docker exec -it thanos_bucket thanos bucket --objstore.config-file=/bucket_config.yaml inspect
- 5.4、thanos_rule
mkdir -p /data/thanos/rule_data/{data,rules,conf}
cat > /data/thanos/thanos_rule.sh << 'EOF'
docker run -d \
--restart=always \
--name thanos_rule \
-v /etc/localtime:/etc/localtime:ro \
-v /data/thanos/rule_data/data:/data \
-v /data/thanos/rule_data/rules:/rules \
-v /data/thanos/conf/bucket_config.yaml:/bucket_config.yaml \
-v /data/thanos/conf/query.yaml:/conf/query.yaml \
-p 10905:10905 \
-p 19193:19193 \
improbable/thanos:v0.6.0 \
rule \
--grpc-address=0.0.0.0:10905 \
--http-address=0.0.0.0:19193 \
--data-dir=/data \
--eval-interval=30s \
--rule-file=/rules/*.yml \
--query.sd-files=/conf/query.yaml \
--objstore.config-file=/bucket_config.yaml \
--alertmanagers.url=http://192.168.11.193:9093 \
--alertmanagers.url=http://192.168.11.194:9093 \
--label 'region="GuangZhou"' \
--label 'replica="A"' \
--alert.label-drop=replica
EOF
bash /data/thanos/thanos_rule.sh
六、其它
- 6.1、grafana配置
grafanaid: 14605