基于SpringBoot2.0+ Actuator metrics的监控(基于Oracle JDK9,G1)
引言
SpringBoot2在spring-boot-actuator中引入了micrometer,对1.x的metrics进行了重构,另外支持对接的监控系统也更加丰富(Atlas、Datadog、Ganglia、Graphite、Influx、JMX、NewRelic、Prometheus、SignalFx、StatsD、Wavefront)。本文以Prometheus为例阐述SpringBoot2.0的监控。
示例:Prometheus
Maven坐标
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
Prometheus地址(SpringBoot2.0默认不通过HTTP暴露该端点,需先配置 management.endpoints.web.exposure.include=prometheus)
http://ip:port/actuator/prometheus
-
# HELP jvm_memory_committed_bytes The amount of memory in bytes that is committed for the Java virtual machine to use
-
# TYPE jvm_memory_committed_bytes gauge
-
jvm_memory_committed_bytes{area="nonheap",id="Code Cache",} 1.671168E7
-
jvm_memory_committed_bytes{area="nonheap",id="Metaspace",} 4.5481984E7
-
jvm_memory_committed_bytes{area="nonheap",id="Compressed Class Space",} 6160384.0
-
jvm_memory_committed_bytes{area="heap",id="PS Eden Space",} 1.3631488E8
-
jvm_memory_committed_bytes{area="heap",id="PS Survivor Space",} 1.4155776E7
-
jvm_memory_committed_bytes{area="heap",id="PS Old Gen",} 1.08527616E8
-
# HELP tomcat_cache_access_total
-
# TYPE tomcat_cache_access_total counter
-
tomcat_cache_access_total 0.0
-
# HELP jvm_gc_pause_seconds Time spent in GC pause
-
# TYPE jvm_gc_pause_seconds summary
-
jvm_gc_pause_seconds_count{action="end of minor GC",cause="Metadata GC Threshold",} 1.0
-
jvm_gc_pause_seconds_sum{action="end of minor GC",cause="Metadata GC Threshold",} 0.012
-
jvm_gc_pause_seconds_count{action="end of major GC",cause="Metadata GC Threshold",} 1.0
-
jvm_gc_pause_seconds_sum{action="end of major GC",cause="Metadata GC Threshold",} 0.061
-
jvm_gc_pause_seconds_count{action="end of minor GC",cause="Allocation Failure",} 1.0
-
jvm_gc_pause_seconds_sum{action="end of minor GC",cause="Allocation Failure",} 0.006
-
# HELP jvm_gc_pause_seconds_max Time spent in GC pause
-
# TYPE jvm_gc_pause_seconds_max gauge
-
jvm_gc_pause_seconds_max{action="end of minor GC",cause="Metadata GC Threshold",} 0.012
-
jvm_gc_pause_seconds_max{action="end of major GC",cause="Metadata GC Threshold",} 0.061
-
jvm_gc_pause_seconds_max{action="end of minor GC",cause="Allocation Failure",} 0.006
-
# HELP jvm_gc_memory_promoted_bytes_total Count of positive increases in the size of the old generation memory pool before GC to after GC
-
# TYPE jvm_gc_memory_promoted_bytes_total counter
-
jvm_gc_memory_promoted_bytes_total 1.3522328E7
-
# HELP jvm_threads_live The current number of live threads including both daemon and non-daemon threads
-
# TYPE jvm_threads_live gauge
-
jvm_threads_live 26.0
-
# HELP jvm_memory_used_bytes The amount of used memory
-
# TYPE jvm_memory_used_bytes gauge
-
jvm_memory_used_bytes{area="nonheap",id="Code Cache",} 1.4794432E7
-
jvm_memory_used_bytes{area="nonheap",id="Metaspace",} 4.2796624E7
-
jvm_memory_used_bytes{area="nonheap",id="Compressed Class Space",} 5667112.0
-
jvm_memory_used_bytes{area="heap",id="PS Eden Space",} 1.25566672E8
-
jvm_memory_used_bytes{area="heap",id="PS Survivor Space",} 6019288.0
-
jvm_memory_used_bytes{area="heap",id="PS Old Gen",} 2.1067888E7
-
# HELP tomcat_sessions_rejected_total
-
# TYPE tomcat_sessions_rejected_total counter
-
tomcat_sessions_rejected_total 0.0
-
# HELP jvm_classes_unloaded_total The total number of classes unloaded since the Java virtual machine has started execution
-
# TYPE jvm_classes_unloaded_total counter
-
jvm_classes_unloaded_total 0.0
-
# HELP tomcat_global_request_seconds
-
# TYPE tomcat_global_request_seconds summary
-
tomcat_global_request_seconds_count{name="http-nio-8080",} 4.0
-
tomcat_global_request_seconds_sum{name="http-nio-8080",} 2.004
-
# HELP tomcat_sessions_expired_total
-
# TYPE tomcat_sessions_expired_total counter
-
tomcat_sessions_expired_total 0.0
-
# HELP tomcat_global_sent_bytes_total
-
# TYPE tomcat_global_sent_bytes_total counter
-
tomcat_global_sent_bytes_total{name="http-nio-8080",} 10530.0
-
# HELP process_cpu_usage The "recent cpu usage" for the Java Virtual Machine process
-
# TYPE process_cpu_usage gauge
-
process_cpu_usage 0.0012014831697775476
-
# HELP tomcat_global_request_max_seconds
-
# TYPE tomcat_global_request_max_seconds gauge
-
tomcat_global_request_max_seconds{name="http-nio-8080",} 1.766
-
# HELP tomcat_sessions_active_max
-
# TYPE tomcat_sessions_active_max gauge
-
tomcat_sessions_active_max 0.0
-
# HELP process_start_time_seconds Start time of the process since unix epoch.
-
# TYPE process_start_time_seconds gauge
-
process_start_time_seconds 1.536892441333E9
-
# HELP tomcat_threads_busy
-
# TYPE tomcat_threads_busy gauge
-
tomcat_threads_busy{name="http-nio-8080",} 1.0
-
# HELP jvm_threads_peak The peak live thread count since the Java virtual machine started or peak was reset
-
# TYPE jvm_threads_peak gauge
-
jvm_threads_peak 30.0
-
# HELP http_server_requests_seconds
-
# TYPE http_server_requests_seconds summary
-
http_server_requests_seconds_count{exception="None",method="GET",status="404",uri="/**",} 2.0
-
http_server_requests_seconds_sum{exception="None",method="GET",status="404",uri="/**",} 0.025440234
-
http_server_requests_seconds_count{exception="None",method="GET",status="200",uri="/actuator/prometheus",} 1.0
-
http_server_requests_seconds_sum{exception="None",method="GET",status="200",uri="/actuator/prometheus",} 1.764922568
-
http_server_requests_seconds_count{exception="None",method="GET",status="200",uri="/actuator/metrics",} 1.0
-
http_server_requests_seconds_sum{exception="None",method="GET",status="200",uri="/actuator/metrics",} 0.05432998
-
# HELP http_server_requests_seconds_max
-
# TYPE http_server_requests_seconds_max gauge
-
http_server_requests_seconds_max{exception="None",method="GET",status="404",uri="/**",} 0.021481453
-
http_server_requests_seconds_max{exception="None",method="GET",status="200",uri="/actuator/prometheus",} 1.764922568
-
http_server_requests_seconds_max{exception="None",method="GET",status="200",uri="/actuator/metrics",} 0.05432998
-
# HELP process_uptime_seconds The uptime of the Java virtual machine
-
# TYPE process_uptime_seconds gauge
-
process_uptime_seconds 159.907
-
# HELP tomcat_cache_hit_total
-
# TYPE tomcat_cache_hit_total counter
-
tomcat_cache_hit_total 0.0
-
# HELP tomcat_threads_current
-
# TYPE tomcat_threads_current gauge
-
tomcat_threads_current{name="http-nio-8080",} 10.0
-
# HELP logback_events_total Number of error level events that made it to the logs
-
# TYPE logback_events_total counter
-
logback_events_total{level="error",} 0.0
-
logback_events_total{level="warn",} 0.0
-
logback_events_total{level="info",} 42.0
-
logback_events_total{level="debug",} 0.0
-
logback_events_total{level="trace",} 0.0
-
# HELP jvm_buffer_memory_used_bytes An estimate of the memory that the Java virtual machine is using for this buffer pool
-
# TYPE jvm_buffer_memory_used_bytes gauge
-
jvm_buffer_memory_used_bytes{id="direct",} 57344.0
-
jvm_buffer_memory_used_bytes{id="mapped",} 0.0
-
# HELP jvm_classes_loaded The number of classes that are currently loaded in the Java virtual machine
-
# TYPE jvm_classes_loaded gauge
-
jvm_classes_loaded 8460.0
-
# HELP tomcat_threads_config_max
-
# TYPE tomcat_threads_config_max gauge
-
tomcat_threads_config_max{name="http-nio-8080",} 200.0
-
# HELP jvm_buffer_total_capacity_bytes An estimate of the total capacity of the buffers in this pool
-
# TYPE jvm_buffer_total_capacity_bytes gauge
-
jvm_buffer_total_capacity_bytes{id="direct",} 57344.0
-
jvm_buffer_total_capacity_bytes{id="mapped",} 0.0
-
# HELP tomcat_servlet_error_total
-
# TYPE tomcat_servlet_error_total counter
-
tomcat_servlet_error_total{name="default",} 0.0
-
# HELP system_cpu_usage The "recent cpu usage" for the whole system
-
# TYPE system_cpu_usage gauge
-
system_cpu_usage 0.16353571056981642
-
# HELP jvm_gc_memory_allocated_bytes_total Incremented for an increase in the size of the young generation memory pool after one GC to before the next
-
# TYPE jvm_gc_memory_allocated_bytes_total counter
-
jvm_gc_memory_allocated_bytes_total 1.9545368E8
-
# HELP tomcat_global_received_bytes_total
-
# TYPE tomcat_global_received_bytes_total counter
-
tomcat_global_received_bytes_total{name="http-nio-8080",} 0.0
-
# HELP tomcat_sessions_active_current
-
# TYPE tomcat_sessions_active_current gauge
-
tomcat_sessions_active_current 0.0
-
# HELP system_cpu_count The number of processors available to the Java virtual machine
-
# TYPE system_cpu_count gauge
-
system_cpu_count 4.0
-
# HELP tomcat_sessions_created_total
-
# TYPE tomcat_sessions_created_total counter
-
tomcat_sessions_created_total 0.0
-
# HELP jvm_buffer_count An estimate of the number of buffers in the pool
-
# TYPE jvm_buffer_count gauge
-
jvm_buffer_count{id="direct",} 7.0
-
jvm_buffer_count{id="mapped",} 0.0
-
# HELP jvm_memory_max_bytes The maximum amount of memory in bytes that can be used for memory management
-
# TYPE jvm_memory_max_bytes gauge
-
jvm_memory_max_bytes{area="nonheap",id="Code Cache",} 2.5165824E8
-
jvm_memory_max_bytes{area="nonheap",id="Metaspace",} -1.0
-
jvm_memory_max_bytes{area="nonheap",id="Compressed Class Space",} 1.073741824E9
-
jvm_memory_max_bytes{area="heap",id="PS Eden Space",} 6.80525824E8
-
jvm_memory_max_bytes{area="heap",id="PS Survivor Space",} 1.4155776E7
-
jvm_memory_max_bytes{area="heap",id="PS Old Gen",} 1.417674752E9
-
# HELP jvm_gc_live_data_size_bytes Size of old generation memory pool after a full GC
-
# TYPE jvm_gc_live_data_size_bytes gauge
-
jvm_gc_live_data_size_bytes 2.1063808E7
-
# HELP jvm_threads_daemon The current number of live daemon threads
-
# TYPE jvm_threads_daemon gauge
-
jvm_threads_daemon 24.0
-
# HELP tomcat_servlet_request_seconds
-
# TYPE tomcat_servlet_request_seconds summary
-
tomcat_servlet_request_seconds_count{name="default",} 0.0
-
tomcat_servlet_request_seconds_sum{name="default",} 0.0
-
# HELP tomcat_servlet_request_max_seconds
-
# TYPE tomcat_servlet_request_max_seconds gauge
-
tomcat_servlet_request_max_seconds{name="default",} 0.0
-
# HELP tomcat_global_error_total
-
# TYPE tomcat_global_error_total counter
-
tomcat_global_error_total{name="http-nio-8080",} 2.0
-
# HELP jvm_gc_max_data_size_bytes Max size of old generation memory pool
-
# TYPE jvm_gc_max_data_size_bytes gauge
-
jvm_gc_max_data_size_bytes 1.417674752E9
-
# HELP tomcat_sessions_alive_max_seconds
-
# TYPE tomcat_sessions_alive_max_seconds gauge
-
tomcat_sessions_alive_max_seconds 0.0
SpringBoot2.0 Actuator监控指标分析
序号 | 参数 | 参数说明 | 是否监控 | 监控手段 | 重要度 |
---|---|---|---|---|---|
--- | JVM | --- | |||
1 | jvm.memory.max | JVM最大内存 | |||
2 | jvm.memory.committed | JVM已提交(保证可供JVM使用)的内存 | 是 | 展示并监控堆内存和Metaspace | 重要 |
3 | jvm.memory.used | JVM已用内存 | 是 | 展示并监控堆内存和Metaspace | 重要 |
4 | jvm.buffer.memory.used | JVM缓冲区已用内存 | |||
5 | jvm.buffer.count | 当前缓冲区数 | |||
6 | jvm.threads.daemon | JVM守护线程数 | 是 | 显示在监控页面 | |
7 | jvm.threads.live | JVM当前活跃线程数 | 是 | 显示在监控页面;监控达到阈值时报警 | 重要 |
8 | jvm.threads.peak | JVM峰值线程数 | 是 | 显示在监控页面 | |
9 | jvm.classes.loaded | 加载classes数 | |||
10 | jvm.classes.unloaded | JVM启动以来已卸载的classes总数 | |||
11 | jvm.gc.memory.allocated | 两次GC之间,年轻代内存池增长(新分配)的内存大小 | |||
12 | jvm.gc.memory.promoted | GC前后,老年代内存池增长(即晋升到老年代)的内存大小 | |||
13 | jvm.gc.max.data.size | GC时,老年代的最大内存空间 | |||
14 | jvm.gc.live.data.size | FullGC后,老年代内存池的大小 | |||
15 | jvm.gc.pause | GC耗时 | 是 | 显示在监控页面 | |
--- | TOMCAT | --- | |||
16 | tomcat.sessions.created | tomcat已创建session数 | |||
17 | tomcat.sessions.expired | tomcat已过期session数 | |||
18 | tomcat.sessions.active.current | tomcat活跃session数 | |||
19 | tomcat.sessions.active.max | tomcat最多活跃session数 | 是 | 显示在监控页面,超过阈值可报警或者进行动态扩容 | 重要 |
20 | tomcat.sessions.alive.max | tomcat session最长存活时间(秒) | |||
21 | tomcat.sessions.rejected | 超过session最大配置后,拒绝的session个数 | 是 | 显示在监控页面,方便分析问题 | |
22 | tomcat.global.error | 错误总数 | 是 | 显示在监控页面,方便分析问题 | |
23 | tomcat.global.sent | 发送的字节数 | |||
24 | tomcat.global.request.max | request最长时间 | |||
25 | tomcat.global.request | 全局request次数和时间 | |||
26 | tomcat.global.received | 接收的字节数 | |||
27 | tomcat.servlet.request | servlet的请求次数和时间 | |||
28 | tomcat.servlet.error | servlet发生错误总数 | |||
29 | tomcat.servlet.request.max | servlet请求最长时间 | |||
30 | tomcat.threads.busy | tomcat繁忙线程 | 是 | 显示在监控页面,据此检查是否有线程夯住 | |
31 | tomcat.threads.current | tomcat当前线程数(包括守护线程) | 是 | 显示在监控页面 | 重要 |
32 | tomcat.threads.config.max | tomcat配置的线程最大数 | 是 | 显示在监控页面 | 重要 |
33 | tomcat.cache.access | tomcat读取缓存次数 | |||
34 | tomcat.cache.hit | tomcat缓存命中次数 | |||
--- | CPU... | --- | |||
35 | system.cpu.count | CPU数量 | |||
36 | system.load.average.1m | load average | 是 | 超过阈值报警 | 重要 |
37 | system.cpu.usage | 系统CPU使用率 | |||
38 | process.cpu.usage | 当前进程CPU使用率 | 是 | 超过阈值报警 | |
39 | http.server.requests | http请求调用情况 | 是 | 显示10个请求量最大,耗时最长的URL;统计非200的请求量 | 重要 |
40 | process.uptime | 应用已运行时间 | 是 | 显示在监控页面 | |
41 | process.files.max | 允许最大句柄数 | 是 | 配合当前打开句柄数使用 | |
42 | process.start.time | 应用启动时间点 | 是 | 显示在监控页面 | |
43 | process.files.open | 当前打开句柄数 | 是 | 监控文件句柄使用率,超过阈值后报警 | 重要 |
附录
SpringBoot1.x的metrics信息
-
{
-
"mem":568029,
-
"mem.free":164376,
-
"processors":16,
-
"instance.uptime":223643888,
-
"uptime":223835542,
-
"systemload.average":0.18,
-
"heap.committed":415232,
-
"heap.init":419840,
-
"heap.used":250855,
-
"heap":415232,
-
"nonheap.committed":156480,
-
"nonheap.init":2496,
-
"nonheap.used":152797,
-
"nonheap":0,
-
"threads.peak":114,
-
"threads.daemon":99,
-
"threads.totalStarted":234,
-
"threads":113,
-
"classes":15474,
-
"classes.loaded":15474,
-
"classes.unloaded":0,
-
"gc.ps_scavenge.count":1808,
-
"gc.ps_scavenge.time":99224,
-
"gc.ps_marksweep.count":3,
-
"gc.ps_marksweep.time":5314
-
}
SpringBoot2.0的metrics信息
-
{
-
"names":[
-
"jvm.memory.committed",
-
"http.server.requests",
-
"jvm.buffer.memory.used",
-
"jvm.gc.memory.allocated",
-
"tomcat.sessions.created",
-
"tomcat.sessions.expired",
-
"jvm.memory.used",
-
"tomcat.global.error",
-
"jvm.gc.max.data.size",
-
"system.cpu.count",
-
"jvm.memory.max",
-
"tomcat.global.sent",
-
"jvm.buffer.total.capacity",
-
"jvm.buffer.count",
-
"process.files.max",
-
"jvm.threads.daemon",
-
"process.start.time",
-
"tomcat.sessions.active.max",
-
"tomcat.global.request.max",
-
"jvm.gc.live.data.size",
-
"process.files.open",
-
"process.cpu.usage",
-
"tomcat.threads.current",
-
"tomcat.servlet.request",
-
"jvm.gc.pause",
-
"process.uptime",
-
"tomcat.threads.busy",
-
"system.load.average.1m",
-
"tomcat.cache.hit",
-
"tomcat.global.request",
-
"tomcat.servlet.error",
-
"tomcat.servlet.request.max",
-
"tomcat.cache.access",
-
"tomcat.sessions.active.current",
-
"system.cpu.usage",
-
"jvm.threads.live",
-
"jvm.classes.loaded",
-
"jvm.classes.unloaded",
-
"jvm.threads.peak",
-
"tomcat.threads.config.max",
-
"jvm.gc.memory.promoted",
-
"tomcat.sessions.rejected",
-
"tomcat.global.received",
-
"tomcat.sessions.alive.max"
-
]
-
}