1/方案
通过Micrometer+actuator暴露供Prometheus拉取metric的端点,在grafana中导入优秀的现成模板进行展示;此外增加了自定义指标,并在dashboard中追加自定义panel进行展示。
2/依赖
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
3/配置
暴露监控端点后,访问 http://localhost:8990/actuator/prometheus 查看是否已有metric输出
# Spring Boot monitoring configuration for Prometheus
management:
  endpoints:
    web:
      exposure:
        include: 'prometheus' # expose /actuator/prometheus
  metrics:
    tags:
      application: ${spring.application.name} # add an "application" label to every exported metric
4/grafana官网挑选模板,推荐选用12856,Prometheus追加配置 并重启
因为Prometheus目前只支持基于consul、file等的服务发现,暂时只能使用静态配置static_configs
# Scrape configuration
scrape_configs:
  - job_name: 'prometheus'
    metrics_path: '/metrics' # default
    scheme: 'http' # default
    # scrape_interval: 10s # overrides the global setting
    # static_configs:
    #   - targets: ['localhost:9090']
    #     labels:
    #       instance: xxx
  # for installation, see the node-exporter notes later on
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['localhost:9111']
  - job_name: "vehicle-payment"
    metrics_path: "/actuator/prometheus"
    static_configs:
      - targets: ["localhost:8880"]
  - job_name: "vehicle-lottery"
    metrics_path: "/actuator/prometheus"
    # NOTE(review): no static_configs/targets are listed for this job here; add them before use
5/默认监控指标
查看指标:curl http://localhost:8880/actuator/prometheus,这些指标会在客户端cache一定时间待Prometheus拉取
禁止默认指标:通过yml配置项禁止指标已deprecated,需要在启动类禁止@SpringBootApplication(exclude = {LogbackMetricsAutoConfiguration.class}),下面便是禁止logback日志指标后的监控项,主要还有http接口请求耗时,hikaricp连接池,jvm,tomcat,jdbc,系统cpu等
[root@iz2vcbxdfdrfac7remhtorz monitor]# curl http://localhost:8880/actuator/prometheus
# HELP tomcat_sessions_alive_max_seconds
# TYPE tomcat_sessions_alive_max_seconds gauge
tomcat_sessions_alive_max_seconds{application="payment",} 0.0
# HELP http_server_requests_seconds
# TYPE http_server_requests_seconds summary
http_server_requests_seconds_count{application="payment",exception="None",method="GET",outcome="SUCCESS",status="200",uri="/service-order/count-by-status",} 4.0
http_server_requests_seconds_sum{application="payment",exception="None",method="GET",outcome="SUCCESS",status="200",uri="/service-order/count-by-status",} 0.104456964
http_server_requests_seconds_count{application="payment",exception="None",method="DELETE",outcome="SUCCESS",status="200",uri="/service-order/remove",} 4.0
http_server_requests_seconds_sum{application="payment",exception="None",method="DELETE",outcome="SUCCESS",status="200",uri="/service-order/remove",} 0.115734939
http_server_requests_seconds_count{application="payment",exception="None",method="GET",outcome="SUCCESS",status="200",uri="/actuator/prometheus",} 99.0
http_server_requests_seconds_sum{application="payment",exception="None",method="GET",outcome="SUCCESS",status="200",uri="/actuator/prometheus",} 0.64680158
http_server_requests_seconds_count{application="payment",exception="None",method="GET",outcome="SUCCESS",status="200",uri="/service-order/page",} 15.0
http_server_requests_seconds_sum{application="payment",exception="None",method="GET",outcome="SUCCESS",status="200",uri="/service-order/page",} 11.66210826
# HELP http_server_requests_seconds_max
# TYPE http_server_requests_seconds_max gauge
http_server_requests_seconds_max{application="payment",exception="None",method="GET",outcome="SUCCESS",status="200",uri="/service-order/count-by-status",} 0.0
http_server_requests_seconds_max{application="payment",exception="None",method="DELETE",outcome="SUCCESS",status="200",uri="/service-order/remove",} 0.0
http_server_requests_seconds_max{application="payment",exception="None",method="GET",outcome="SUCCESS",status="200",uri="/actuator/prometheus",} 0.008541819
http_server_requests_seconds_max{application="payment",exception="None",method="GET",outcome="SUCCESS",status="200",uri="/service-order/page",} 0.0
# HELP tomcat_sessions_rejected_sessions_total
# TYPE tomcat_sessions_rejected_sessions_total counter
tomcat_sessions_rejected_sessions_total{application="payment",} 0.0
# HELP jvm_gc_live_data_size_bytes Size of old generation memory pool after a full GC
# TYPE jvm_gc_live_data_size_bytes gauge
jvm_gc_live_data_size_bytes{application="payment",} 0.0
# HELP hikaricp_connections_max Max connections
# TYPE hikaricp_connections_max gauge
hikaricp_connections_max{application="payment",pool="DatebookHikariCP",} 20.0
# HELP hikaricp_connections_idle Idle connections
# TYPE hikaricp_connections_idle gauge
hikaricp_connections_idle{application="payment",pool="DatebookHikariCP",} 5.0
# HELP system_cpu_usage The "recent cpu usage" for the whole system
# TYPE system_cpu_usage gauge
system_cpu_usage{application="payment",} 0.03063776573572322
# HELP hikaricp_connections_usage_seconds Connection usage time
# TYPE hikaricp_connections_usage_seconds summary
hikaricp_connections_usage_seconds_count{application="payment",pool="DatebookHikariCP",} 39.0
hikaricp_connections_usage_seconds_sum{application="payment",pool="DatebookHikariCP",} 0.425
# HELP hikaricp_connections_usage_seconds_max Connection usage time
# TYPE hikaricp_connections_usage_seconds_max gauge
hikaricp_connections_usage_seconds_max{application="payment",pool="DatebookHikariCP",} 0.0
# HELP jvm_threads_live_threads The current number of live threads including both daemon and non-daemon threads
# TYPE jvm_threads_live_threads gauge
jvm_threads_live_threads{application="payment",} 67.0
# HELP jdbc_connections_active
# TYPE jdbc_connections_active gauge
jdbc_connections_active{application="payment",name="dataSource",} 0.0
# HELP jvm_gc_memory_promoted_bytes_total Count of positive increases in the size of the old generation memory pool before GC to after GC
# TYPE jvm_gc_memory_promoted_bytes_total counter
jvm_gc_memory_promoted_bytes_total{application="payment",} 2.771068E7
# HELP jvm_gc_max_data_size_bytes Max size of old generation memory pool
# TYPE jvm_gc_max_data_size_bytes gauge
jvm_gc_max_data_size_bytes{application="payment",} 0.0
# HELP api_cost_summary 请求耗时summary
# TYPE api_cost_summary summary
api_cost_summary{uri="/service-order/remove",accessType="1",code="0",quantile="0.5",} NaN
api_cost_summary{uri="/service-order/remove",accessType="1",code="0",quantile="0.9",} NaN
api_cost_summary_count{uri="/service-order/remove",accessType="1",code="0",} 4.0
api_cost_summary_sum{uri="/service-order/remove",accessType="1",code="0",} 97.0
api_cost_summary{uri="/service-order/page",accessType="1",code="0",quantile="0.5",} 19.0
api_cost_summary{uri="/service-order/page",accessType="1",code="0",quantile="0.9",} 19.0
api_cost_summary_count{uri="/service-order/page",accessType="1",code="0",} 15.0
api_cost_summary_sum{uri="/service-order/page",accessType="1",code="0",} 11383.0
# HELP jvm_memory_committed_bytes The amount of memory in bytes that is committed for the Java virtual machine to use
# TYPE jvm_memory_committed_bytes gauge
jvm_memory_committed_bytes{application="payment",area="nonheap",id="Metaspace",} 8.3361792E7
jvm_memory_committed_bytes{application="payment",area="heap",id="Par Eden Space",} 2.097152E8
jvm_memory_committed_bytes{application="payment",area="nonheap",id="Code Cache",} 3.1260672E7
jvm_memory_committed_bytes{application="payment",area="heap",id="CMS Old Gen",} 5.767168E8
jvm_memory_committed_bytes{application="payment",area="nonheap",id="Compressed Class Space",} 1.048576E7
jvm_memory_committed_bytes{application="payment",area="heap",id="Par Survivor Space",} 2.62144E7
# HELP process_files_max_files The maximum file descriptor count
# TYPE process_files_max_files gauge
process_files_max_files{application="payment",} 1048576.0
# HELP hikaricp_connections_creation_seconds_max Connection creation time
# TYPE hikaricp_connections_creation_seconds_max gauge
hikaricp_connections_creation_seconds_max{application="payment",pool="DatebookHikariCP",} 0.0
# HELP hikaricp_connections_creation_seconds Connection creation time
# TYPE hikaricp_connections_creation_seconds summary
hikaricp_connections_creation_seconds_count{application="payment",pool="DatebookHikariCP",} 0.0
hikaricp_connections_creation_seconds_sum{application="payment",pool="DatebookHikariCP",} 0.0
# HELP api_cost_histogram 请求耗时histogram
# TYPE api_cost_histogram histogram
api_cost_histogram_bucket{uri="/service-order/remove",accessType="1",code="0",le="100.0",} 4.0
api_cost_histogram_bucket{uri="/service-order/remove",accessType="1",code="0",le="500.0",} 4.0
api_cost_histogram_bucket{uri="/service-order/remove",accessType="1",code="0",le="1000.0",} 4.0
api_cost_histogram_bucket{uri="/service-order/remove",accessType="1",code="0",le="3000.0",} 4.0
api_cost_histogram_bucket{uri="/service-order/remove",accessType="1",code="0",le="+Inf",} 4.0
api_cost_histogram_count{uri="/service-order/remove",accessType="1",code="0",} 4.0
api_cost_histogram_sum{uri="/service-order/remove",accessType="1",code="0",} 97.0
api_cost_histogram_bucket{uri="/service-order/page",accessType="1",code="0",le="100.0",} 11.0
api_cost_histogram_bucket{uri="/service-order/page",accessType="1",code="0",le="500.0",} 11.0
api_cost_histogram_bucket{uri="/service-order/page",accessType="1",code="0",le="1000.0",} 12.0
api_cost_histogram_bucket{uri="/service-order/page",accessType="1",code="0",le="3000.0",} 14.0
api_cost_histogram_bucket{uri="/service-order/page",accessType="1",code="0",le="+Inf",} 15.0
api_cost_histogram_count{uri="/service-order/page",accessType="1",code="0",} 15.0
api_cost_histogram_sum{uri="/service-order/page",accessType="1",code="0",} 11383.0
# HELP system_cpu_count The number of processors available to the Java virtual machine
# TYPE system_cpu_count gauge
system_cpu_count{application="payment",} 8.0
# HELP jvm_threads_peak_threads The peak live thread count since the Java virtual machine started or peak was reset
# TYPE jvm_threads_peak_threads gauge
jvm_threads_peak_threads{application="payment",} 71.0
# HELP jvm_classes_unloaded_classes_total The total number of classes unloaded since the Java virtual machine has started execution
# TYPE jvm_classes_unloaded_classes_total counter
jvm_classes_unloaded_classes_total{application="payment",} 0.0
# HELP jvm_buffer_count_buffers An estimate of the number of buffers in the pool
# TYPE jvm_buffer_count_buffers gauge
jvm_buffer_count_buffers{application="payment",id="mapped",} 0.0
jvm_buffer_count_buffers{application="payment",id="direct",} 11.0
# HELP jvm_buffer_total_capacity_bytes An estimate of the total capacity of the buffers in this pool
# TYPE jvm_buffer_total_capacity_bytes gauge
jvm_buffer_total_capacity_bytes{application="payment",id="mapped",} 0.0
jvm_buffer_total_capacity_bytes{application="payment",id="direct",} 81920.0
# HELP tomcat_sessions_active_current_sessions
# TYPE tomcat_sessions_active_current_sessions gauge
tomcat_sessions_active_current_sessions{application="payment",} 0.0
# HELP tomcat_sessions_active_max_sessions
# TYPE tomcat_sessions_active_max_sessions gauge
tomcat_sessions_active_max_sessions{application="payment",} 0.0
# HELP hikaricp_connections_timeout_total Connection timeout total count
# TYPE hikaricp_connections_timeout_total counter
hikaricp_connections_timeout_total{application="payment",pool="DatebookHikariCP",} 0.0
# HELP jvm_memory_used_bytes The amount of used memory
# TYPE jvm_memory_used_bytes gauge
jvm_memory_used_bytes{application="payment",area="nonheap",id="Metaspace",} 7.8662264E7
jvm_memory_used_bytes{application="payment",area="heap",id="Par Eden Space",} 8.3274296E7
jvm_memory_used_bytes{application="payment",area="nonheap",id="Code Cache",} 3.1095872E7
jvm_memory_used_bytes{application="payment",area="heap",id="CMS Old Gen",} 3.2599328E7
jvm_memory_used_bytes{application="payment",area="nonheap",id="Compressed Class Space",} 9694744.0
jvm_memory_used_bytes{application="payment",area="heap",id="Par Survivor Space",} 1.7527368E7
# HELP jdbc_connections_min
# TYPE jdbc_connections_min gauge
jdbc_connections_min{application="payment",name="dataSource",} 5.0
# HELP hikaricp_connections_acquire_seconds Connection acquire time
# TYPE hikaricp_connections_acquire_seconds summary
hikaricp_connections_acquire_seconds_count{application="payment",pool="DatebookHikariCP",} 39.0
hikaricp_connections_acquire_seconds_sum{application="payment",pool="DatebookHikariCP",} 0.026086932
# HELP hikaricp_connections_acquire_seconds_max Connection acquire time
# TYPE hikaricp_connections_acquire_seconds_max gauge
hikaricp_connections_acquire_seconds_max{application="payment",pool="DatebookHikariCP",} 0.0
# HELP jdbc_connections_idle
# TYPE jdbc_connections_idle gauge
jdbc_connections_idle{application="payment",name="dataSource",} 5.0
# HELP jvm_buffer_memory_used_bytes An estimate of the memory that the Java virtual machine is using for this buffer pool
# TYPE jvm_buffer_memory_used_bytes gauge
jvm_buffer_memory_used_bytes{application="payment",id="mapped",} 0.0
jvm_buffer_memory_used_bytes{application="payment",id="direct",} 81921.0
# HELP jvm_threads_states_threads The current number of threads having NEW state
# TYPE jvm_threads_states_threads gauge
jvm_threads_states_threads{application="payment",state="new",} 0.0
jvm_threads_states_threads{application="payment",state="runnable",} 9.0
jvm_threads_states_threads{application="payment",state="blocked",} 0.0
jvm_threads_states_threads{application="payment",state="terminated",} 0.0
jvm_threads_states_threads{application="payment",state="waiting",} 36.0
jvm_threads_states_threads{application="payment",state="timed-waiting",} 22.0
# HELP jvm_gc_memory_allocated_bytes_total Incremented for an increase in the size of the young generation memory pool after one GC to before the next
# TYPE jvm_gc_memory_allocated_bytes_total counter
jvm_gc_memory_allocated_bytes_total{application="payment",} 1.6777216E9
# HELP api_cost_timer_seconds_max
# TYPE api_cost_timer_seconds_max gauge
api_cost_timer_seconds_max{accessType="1",application="payment",code="0",uri="/service-order/page",} 0.0
api_cost_timer_seconds_max{accessType="1",application="payment",code="0",uri="/service-order/remove",} 0.0
# HELP api_cost_timer_seconds
# TYPE api_cost_timer_seconds summary
api_cost_timer_seconds_count{accessType="1",application="payment",code="0",uri="/service-order/page",} 15.0
api_cost_timer_seconds_sum{accessType="1",application="payment",code="0",uri="/service-order/page",} 11.383
api_cost_timer_seconds_count{accessType="1",application="payment",code="0",uri="/service-order/remove",} 4.0
api_cost_timer_seconds_sum{accessType="1",application="payment",code="0",uri="/service-order/remove",} 0.097
# HELP process_start_time_seconds Start time of the process since unix epoch.
# TYPE process_start_time_seconds gauge
process_start_time_seconds{application="payment",} 1.605842727489E9
# HELP tomcat_sessions_expired_sessions_total
# TYPE tomcat_sessions_expired_sessions_total counter
tomcat_sessions_expired_sessions_total{application="payment",} 0.0
# HELP hikaricp_connections Total connections
# TYPE hikaricp_connections gauge
hikaricp_connections{application="payment",pool="DatebookHikariCP",} 5.0
# HELP process_cpu_usage The "recent cpu usage" for the Java Virtual Machine process
# TYPE process_cpu_usage gauge
process_cpu_usage{application="payment",} 0.0034385745545482964
# HELP jvm_threads_daemon_threads The current number of live daemon threads
# TYPE jvm_threads_daemon_threads gauge
jvm_threads_daemon_threads{application="payment",} 62.0
# HELP jvm_classes_loaded_classes The number of classes that are currently loaded in the Java virtual machine
# TYPE jvm_classes_loaded_classes gauge
jvm_classes_loaded_classes{application="payment",} 14454.0
# HELP system_load_average_1m The sum of the number of runnable entities queued to available processors and the number of runnable entities running on the available processors averaged over a period of time
# TYPE system_load_average_1m gauge
system_load_average_1m{application="payment",} 1.95
# HELP process_files_open_files The open file descriptor count
# TYPE process_files_open_files gauge
process_files_open_files{application="payment",} 75.0
# HELP hikaricp_connections_min Min connections
# TYPE hikaricp_connections_min gauge
hikaricp_connections_min{application="payment",pool="DatebookHikariCP",} 5.0
# HELP jdbc_connections_max
# TYPE jdbc_connections_max gauge
jdbc_connections_max{application="payment",name="dataSource",} 20.0
# HELP hikaricp_connections_pending Pending threads
# TYPE hikaricp_connections_pending gauge
hikaricp_connections_pending{application="payment",pool="DatebookHikariCP",} 0.0
# HELP jvm_gc_pause_seconds Time spent in GC pause
# TYPE jvm_gc_pause_seconds summary
jvm_gc_pause_seconds_count{action="end of minor GC",application="payment",cause="Allocation Failure",} 8.0
jvm_gc_pause_seconds_sum{action="end of minor GC",application="payment",cause="Allocation Failure",} 0.304
# HELP jvm_gc_pause_seconds_max Time spent in GC pause
# TYPE jvm_gc_pause_seconds_max gauge
jvm_gc_pause_seconds_max{action="end of minor GC",application="payment",cause="Allocation Failure",} 0.0
# HELP hikaricp_connections_active Active connections
# TYPE hikaricp_connections_active gauge
hikaricp_connections_active{application="payment",pool="DatebookHikariCP",} 0.0
# HELP tomcat_sessions_created_sessions_total
# TYPE tomcat_sessions_created_sessions_total counter
tomcat_sessions_created_sessions_total{application="payment",} 0.0
# HELP process_uptime_seconds The uptime of the Java virtual machine
# TYPE process_uptime_seconds gauge
process_uptime_seconds{application="payment",} 1185.341
# HELP jvm_memory_max_bytes The maximum amount of memory in bytes that can be used for memory management
# TYPE jvm_memory_max_bytes gauge
jvm_memory_max_bytes{application="payment",area="nonheap",id="Metaspace",} 2.097152E8
jvm_memory_max_bytes{application="payment",area="heap",id="Par Eden Space",} 2.097152E8
jvm_memory_max_bytes{application="payment",area="nonheap",id="Code Cache",} 2.5165824E8
jvm_memory_max_bytes{application="payment",area="heap",id="CMS Old Gen",} 5.767168E8
jvm_memory_max_bytes{application="payment",area="nonheap",id="Compressed Class Space",} 1.073741824E9
jvm_memory_max_bytes{application="payment",area="heap",id="Par Survivor Space",} 2.62144E7
6/采集自定义指标,如请求耗时
默认指标已有请求耗时,但不便自定义扩展,考虑自己采集,自定义指标类型有
Counter:只增不减的计数器
Gauge: 可增可减的计数器
Histogram:自带buckets区间分布统计,如图请求耗时(毫秒)区间:100,500,1000,3000,+Inf(正无穷);不指定则使用默认区间0.005,0.01,0.025,0.05,0.075,...,7.5,10,+Inf
Summary: 客户端收集中位数、90分位数等分位数指标,不指定则不统计分位数(timer也属于summary,默认单位second,并统计max值)
↓↓↓
配置收集指标的bean,非必须,如本案例只需要自动装配的registry,通过registry获取timer收集数据
@Configuration
public class PrometheusMetricsConfig {
@Autowired
private PrometheusMeterRegistry registry;
@Bean
Histogram getHistogram(){
return Histogram.build().labelNames("uri", "accessType", "code")
.name("api_cost_histogram").help("请求耗时histogram")
.buckets(100,500,1000,3000).register(registry.getPrometheusRegistry());
}
@Bean
Summary getSummary(){
return Summary.build().labelNames("uri", "accessType", "code")
.name("api_cost_summary").help("请求耗时summary")
.quantile(0.5, 0.05)
.quantile(0.9, 0.01)
.register(registry.getPrometheusRegistry());
}
}
采集数据,一般在aop采集
。。。。。
// Micrometer Prometheus registry; used below to obtain the "api_cost_timer" timer.
@Autowired
private PrometheusMeterRegistry registry;
// Injected Histogram collector that receives the request cost observations.
@Autowired
Histogram histogram;
// Injected Summary collector that receives the request cost observations.
@Autowired
Summary summary;
。。。。。
/**
 * Records the cost of one request into the three custom metrics: the
 * {@code api_cost_timer} timer, the summary and the histogram.
 *
 * <p>The timer is fed the cost as milliseconds; the summary and histogram
 * observe the raw cost value. All three are labelled with the request's
 * uri, access type and code.
 *
 * @param logBuilder entity wrapping the important request and response data
 */
private void collectPrometheusMetric(LogBuilder logBuilder) {
    String uri = logBuilder.getUri();
    String code = logBuilder.getCode();
    String accessType = logBuilder.getAccessType();

    // Key/value varargs form: tag keys and values alternate.
    registry.timer("api_cost_timer", "uri", uri, "code", code, "accessType", accessType)
            .record(logBuilder.getCost(), TimeUnit.MILLISECONDS);

    // Label values must follow the collectors' declared label order: uri, accessType, code.
    summary.labels(uri, accessType, code).observe(logBuilder.getCost());
    histogram.labels(uri, accessType, code).observe(logBuilder.getCost());
}
输出指标解读:
summary:请求共6次,6次总耗时802ms,中位数23ms,90分位数29ms
histogram:请求共6次,6次总耗时802ms,100ms以下5次,500ms以下共5次,1000ms以下共6次,3000ms以下共6次,+Inf(即全部请求)共6次
timer:请求共6次,6次总耗时0.802s,最大值0.693s,注意单位是s
7/grafana自定义panel
grafana如果没饼图,需要使用bin目录下的grafana-cli命令安装插件:./grafana-cli plugins install grafana-piechart-panel。关于promQL语法参考https://yunlzheng.gitbook.io/prometheus-book/part-iii-prometheus-shi-zhan/readmd
效果图
注意:query表达式如引用了模板变量,则不能直接配置Alert报警(解决方案见下节)
8/grafana添加alert
默认带模板参数的query添加alert会报错,解决方案:
方案1:客户端收集数据就判断一次,如接口超时或特殊状态码直接钉钉报警
方案2:使用Prometheus alertmanager配置报警
方案3:曲线救国(本例使用)
复制一个query B,去掉模板变量,并勾选instant关闭(为了不展示query B的图像),然后配置alert即可。如图,满足条件为:在(now-5m,now)时间范围内,每10s评估一次,若query B的值大于0.5秒且持续1分钟未解除,alert由pending->alerting并发出报警
9/较全面的监控系统总结
业务方面监控:通过本案例自定义指标进行监控+监控工具类监控 ,grafana模板推荐:12856+自定义控制
应用本身监控:本案例默认指标监控,jvm,gc,logback错误日志(alert)
容器监控:cadvisor,或 shell +docker stats,grafana模板推荐:893/395
进程监控:shell+cron+dingding
系统监控:node-exporter,grafana模板推荐:8919