一、Kafka_exporter 监控 Kafka
1、kafka的安装方法参考:https://blog.csdn.net/u010533742/article/details/136616708
2、kafka_exporter部署(指定版本为1.7)
# Create the directory that holds the exporter's launch script.
mkdir -p /data/kafka_exporter
# Write the launch script. The quoted 'EOF' delimiter keeps $0 and $(...)
# literal, so they are expanded when start.sh runs, not while it is written.
cat > /data/kafka_exporter/start.sh << 'EOF'
#!/bin/bash
# (Re)create the kafka_exporter container; metrics are served on :9308/metrics.

# Remove any previous container so the fixed container name can be reused.
docker rm -f kafka_exporter
# Run from the script's own directory; quoted so paths with spaces work.
cd "$(dirname "$0")" || exit 1
# NOTE(review): the SASL username/password are passed as plain command-line
# flags and are visible in `docker inspect` / process listings.
docker run -d \
  --name kafka_exporter \
  --restart=always \
  --network host \
  -v /etc/localtime:/etc/localtime:ro \
  danielqsj/kafka-exporter:v1.7.0 \
  --web.listen-address :9308 \
  --sasl.enabled \
  --sasl.mechanism plain \
  --kafka.server 192.168.11.192:9092 \
  --sasl.username=alice \
  --sasl.password=alice \
  --log.level info \
  --web.telemetry-path /metrics
EOF
kafka_exporter metrics的访问方法
http://192.168.11.192:9308/metrics
3、配置prometheus
#添加自动发现脚本
cat >> /data/prometheus/conf/prometheus.yml << 'EOF'
  # Kafka (kafka_exporter) targets, discovered from a file_sd target file.
  # NOTE(review): this entry is appended to the end of prometheus.yml, so it
  # assumes `scrape_configs:` is the last top-level section and that its
  # existing entries use a 2-space indent - adjust if the file differs.
  - job_name: 'kafka'
    file_sd_configs:
      - files:
          - /etc/prometheus/sd_config/kafka.yaml
        refresh_interval: 5s
    relabel_configs:
      # Keep the discovered host:port as the instance label.
      - source_labels: [__address__]
        regex: '(.*)'
        target_label: instance
        replacement: '$1'
      # Expose the host part alone as the ip label.
      - source_labels: [__address__]
        regex: '(.*):(.*)'
        target_label: ip
        replacement: '$1'
      # Scrape the exporter port (9308) rather than the discovered port.
      - source_labels: [__address__]
        regex: '(.*):(.*)'
        target_label: __address__
        replacement: '$1:9308'
EOF
#自动发现配置
cat > /data/prometheus/conf/sd_config/kafka.yaml << 'EOF'
# Kafka targets for file-based service discovery.
# Every target in the group receives the label type=kafka.
- labels:
    type: kafka
  targets:
    - '192.168.11.192:9092'
    - '192.168.11.193:9092'
    - '192.168.11.194:9092'
EOF
访问 http://192.168.11.221:9090
4、配置grafana
grafanaid: 7589
自定义图表,请参考:https://blog.csdn.net/qq_34864753/article/details/103953385
5、警报规则
cat > /data/prometheus/conf/rules/kafka.rules << 'EOF'
# Alerting rules evaluated on kafka_exporter metrics.
groups:
  - name: kafka-监控告警
    rules:
      # Fires when the in-sync replica count, summed per topic, is below 3.
      # NOTE(review): the sum covers every partition of a topic, so a topic
      # with several partitions only alerts once nearly all replicas are out
      # of sync - confirm this threshold is what is intended.
      - alert: 告警!Kafka Topics 副本数少于3
        expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
        for: 0m
        labels:
          severity: 严重告警
        annotations:
          # The aggregation keeps only the topic label, so reference it here;
          # $labels.instance would render as an empty string.
          summary: "{{ $labels.topic }} Kafka topics 副本数少于3"
          description: "Kafka topic 分区不同步\n 当前值 = {{ $value }}"
      # Fires when a consumer group's total lag stays above 50 for 1 minute.
      - alert: 告警!KafkaConsumersGroup
        expr: sum(kafka_consumergroup_lag) by (consumergroup) > 50
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          # Only the consumergroup label survives the aggregation.
          summary: "{{ $labels.consumergroup }} Kafka consumers group"
          description: "Kafka consumers group\n 当前值 = {{ $value }}"
EOF
二、 jmx_exporter 监控kafka_kraft
1、下载kafka_kraft的jmx配置
# Fetch the jmx_exporter example config for Kafka (KRaft) into /data/kafka/etc.
mkdir -p /data/kafka/etc
cd /data/kafka/etc
wget https://raw.githubusercontent.com/prometheus/jmx_exporter/main/example_configs/kafka-kraft-3_0_0.yml
2、下载jmx_exporter插件
# Download the jmx_prometheus_javaagent 0.20.0 jar into /data/kafka.
cd /data/kafka
wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.20.0/jmx_prometheus_javaagent-0.20.0.jar
3、启动kafka,并加载jmx_exporter插件
# Ensure the directory holding the bind-mounted config files exists.
mkdir -p /data/kafka/etc
# Write the launch script. The quoted 'EOF' delimiter keeps $0 and $(...)
# literal, so they are expanded when start.sh runs, not while it is written.
cat > /data/kafka/start.sh << 'EOF'
#!/bin/bash
# (Re)create the Kafka container (KRaft, broker+controller roles, SASL/PLAIN)
# with the jmx_exporter Java agent serving metrics on port 9999.

# The relative bind mounts below resolve against the script's directory;
# quoted so paths with spaces work, and abort if the directory is unreachable.
cd "$(dirname "$0")" || exit 1
# Remove any previous container so the fixed container name can be reused.
docker rm -f kafka
# NOTE(review): etc/sasl_config.properties is not created anywhere in these
# notes. Docker creates a missing bind-mount source as a directory, so make
# sure the file exists before running this script.
docker run -d \
  --name kafka \
  --restart=always \
  --net host \
  -e KAFKA_NODE_ID=0 \
  -e KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://192.168.11.100:9092 \
  -e KAFKA_HEAP_OPTS="-Xmx512m -Xms512m" \
  -e KAFKA_OPTS="-javaagent:/opt/jmx_prometheus_javaagent-0.20.0.jar=9999:/opt/kafka-kraft-3_0_0.yml" \
  -e KAFKA_CFG_PROCESS_ROLES=broker,controller \
  -e KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER \
  -e KAFKA_CFG_SASL_MECHANISM_CONTROLLER_PROTOCOL=PLAIN \
  -e KAFKA_CONTROLLER_USER=contr0ller \
  -e KAFKA_CONTROLLER_PASSWORD=Contr0ller#XXXX \
  -e KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:SASL_PLAINTEXT,CONTROLLER:SASL_PLAINTEXT \
  -e KAFKA_CFG_LISTENERS=PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093 \
  -e KAFKA_ENABLE_KRAFT=yes \
  -e KAFKA_KRAFT_CLUSTER_ID="Aqvf7RVETX-DInZbNUXXXX" \
  -e KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@192.168.11.100:9093 \
  -e ALLOW_PLAINTEXT_LISTENER=yes \
  -e KAFKA_TLS_CLIENT_AUTH=none \
  -e KAFKA_CFG_SASL_ENABLED_MECHANISMS=PLAIN \
  -e KAFKA_CLIENT_LISTENER_NAME=PLAINTEXT \
  -e KAFKA_CLIENT_USERS=gohangout \
  -e KAFKA_CLIENT_PASSWORDS=Gohangout#XXXX \
  -e KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true \
  -v "$(pwd)/etc/sasl_config.properties:/opt/bitnami/kafka/config/sasl_config.properties" \
  -v "$(pwd)/etc/kafka-kraft-3_0_0.yml:/opt/kafka-kraft-3_0_0.yml" \
  -v "$(pwd)/jmx_prometheus_javaagent-0.20.0.jar:/opt/jmx_prometheus_javaagent-0.20.0.jar" \
  -v "$(pwd)/data:/bitnami/kafka/" \
  -v /etc/localtime:/etc/localtime:ro \
  bitnami/kafka:3.7.0
EOF
bash /data/kafka/start.sh
4、测试插件是否可用
curl http://192.168.11.100:9999/metrics
5、配置prometheus
prometheus.yml
# Scrape job for the jmx_exporter agent loaded into each Kafka broker.
# Place this entry under the top-level `scrape_configs:` key of prometheus.yml,
# matching the indent of the entries already there.
  - job_name: 'kafka-jmx'
    file_sd_configs:
      - files:
          - /etc/prometheus/sd_config/kafka-jmx.yaml
        refresh_interval: 5s
    relabel_configs:
      # Keep the discovered host:port as the instance label.
      - source_labels: [__address__]
        regex: '(.*)'
        target_label: instance
        replacement: '$1'
      # Expose the host part alone as the ip label.
      - source_labels: [__address__]
        regex: '(.*):(.*)'
        target_label: ip
        replacement: '$1'
      # Scrape the JMX agent port (9999) rather than the discovered port.
      - source_labels: [__address__]
        regex: '(.*):(.*)'
        target_label: __address__
        replacement: '$1:9999'
kafka-jmx.yaml
# Kafka broker targets for the kafka-jmx job (file-based service discovery).
# Every target in the group receives the label type=kafka-jmx.
- labels:
    type: kafka-jmx
  targets:
    - '192.168.11.100:9092'
    - '192.168.11.101:9092'
    - '192.168.11.102:9092'
6、下载grafana_dashboard
id: 11962
id: 18276
7、rules(jmx_exporter)
cat >kafka_jmx.rules <<'EOF'
# Alerting rules evaluated on jmx_exporter metrics from the Kafka brokers.
groups:
  - name: kafka-jmx-监控告警
    rules:
      # A broker has reported under-replicated partitions for 1 minute.
      - alert: 告警!Kafka 复制分区不足
        expr: kafka_server_replicamanager_underreplicatedpartitions > 0
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          description: "{{ $labels.instance }} Kafka 复制分区不足,当前值 = {{ $value }}"
      # More than one active controller across the job.
      # The per-broker gauge is presumably 0 or 1 (confirm against the
      # exporter's output), so it must be summed across brokers (by job)
      # rather than per instance; a per-instance sum can never exceed 1 and
      # the alert would never fire.
      - alert: 告警!Kafka Controller 大于 1
        expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (job) > 1
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          # Only the job label survives the aggregation.
          description: "{{ $labels.job }} Kafka Controller数量,当前值 = {{ $value }}"
      # The controller has reported offline partitions for 1 minute.
      - alert: 告警!Kafka 离线分区数
        expr: kafka_controller_kafkacontroller_offlinepartitionscount > 0
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          description: "{{ $labels.instance }} Kafka 离线分区数,当前值 = {{ $value }}"
EOF