Flink metrics
最近根据 Flink UI 的接口追踪了一下 Flink metric 信息的查询过程,在这里记录一下。代码基于 release-1.9 分支。
入口
首先,这里所说的入口指的是 Flink UI 中的 /jobs/:jobid 这个 REST 接口。
它的具体处理逻辑在 JobDetailsHandler 中。JobDetailsHandler 的注册以及与 path 的绑定可以参考 WebMonitorEndpoint 部分,这一块不是本文的重点。查询逻辑的入口为 JobDetailsHandler.handleRequest:
/**
 * Builds the job details response for the given execution graph.
 *
 * <p>The work is delegated to {@code createJobDetailsInfo}, which receives the
 * execution graph together with this handler's {@code metricFetcher}. The
 * {@code request} argument is not read by this method.
 *
 * @param request the incoming handler request (unused here)
 * @param executionGraph the execution graph of the job being described
 * @return the job details produced by {@code createJobDetailsInfo}
 * @throws RestHandlerException declared by the handler contract
 */
protected JobDetailsInfo handleRequest(
        HandlerRequest<EmptyRequestBody, JobMessageParameters> request,
        AccessExecutionGraph executionGraph) throws RestHandlerException {
    // metricFetcher is handed over so the per-vertex metrics can be filled in.
    final JobDetailsInfo jobDetails = createJobDetailsInfo(executionGraph, metricFetcher);
    return jobDetails;
}
注意其中的 metricFetcher 成员变量,它是查询 metric 的核心。
我们根据返回的 JobDetailsInfo 对象的具体字段,看一下 metrics 是如何查询的。
首先看一下rest接口的响应
{
"jid": "c0f3aa66449fe0ce19b4e03323fee5ae",
"name": "antc4blink788000260",
"isStoppable": false,
"state": "RUNNING",
"start-time": 1591704258551,
"end-time": -1,
"duration": 68423742,
"now": 1591772682293,
...
"vertices": [
{
"id": "717c7b8afebbfb7137f6f0f99beb2a94",
"topology-id": 0,
"name": "Source: GSlsTableSource-dws_behavior_ri_source-Stream -> SourceConversion(table:[builtin, default, _DataStreamTable_0, source: [GSlsTableSource-dws_behavior_ri_source]], fields:(f0)) -> correlate: table(SlsParser_dws_behavior_ri_source0($cor0.f0)), select: user_id,event_time,item_id,biz_type -> Calc(select: (user_id AS userid, event_time AS eventtime, item_id AS itemid, biz_type, ((MD5(user_id) SUBSTR 1 SUBSTR 4) CONCAT '#' CONCAT user_id) AS rowkey_user_id, (CAST((event_time SUBSTR 1 SUBSTR 10)) FROM_UNIXTIME 'yyyyMMdd') AS day_format, ((MD5((CAST((event_time SUBSTR 1 SUBSTR 10)) FROM_UNIXTIME 'yyyyMMdd')) SUBSTR 1 SUBSTR 4) CONCAT '#' CONCAT (CAST((event_time SUBSTR 1 SUBSTR 10)) FROM_UNIXTIME 'yyyyMMdd')) AS rowkey_day_format, (CAST((event_time SUBSTR 1 SUBSTR 10)) FROM_UNIXTIME 'HH') AS hour_format, ((MD5(item_id) SUBSTR 1 SUBSTR 4) CONCAT '_' CONCAT item_id) AS rowkey_item_id)) -> AsyncJoinTable(table: (AliHBase: [myddsczssd_time_sematic]), joinType: LeftOuterJoin, join: (userid, ...",
"parallelism": 128,
"status": "RUNNING",
"start-time": 1591704339321,
"end-time": -1,
"duration": 68342972,
"tasks": {
"RUNNING": 128,
"CANCELED": 0,
"CANCELING": 0,
"FAILED": 0,
"FINISHED": 0,
"CREATED": 0,
"RECONCILING": 0,
"SCHEDULED": 0,
"DEPLOYING": 0
},
// metric信息在这
"metrics": {
"read-bytes": 0,
"read-bytes-complete": true,
"write-bytes": 0,
"write-bytes-complete": true,
"read-records": 0,
"read-records-complete": true,
"write-records": 0,
"write-records-complete": true,
"buffers-in-pool-usage-max": 0.0,
"buffers-in-pool-usage-max-complete": true,
"buffers-out-pool-usage-max": 0.0,
"buffers-out-pool-usage-max-complete": true,
"tps": 0.21666666666666667,
"tps-complete": true,
"delay": 1383,
"delay-complete": true
}
}
],
...
}
接下来找一下 metrics 在对应 model 中的具体字段:
public class JobDetailsInfo implements ResponseBody {
@JsonProperty(FIELD_NAME_JOB_ID)
@JsonSerialize(using = JobIDSerializer.class)
private final JobID jobId;
@JsonProperty(FIELD_NAME_JOB_NAME)
private final String name;
@JsonProperty(FIELD_NAME_IS_STOPPABLE)
private final boolean isStoppable;
@JsonProperty(FIELD_NAME_JOB_STATUS)
private final JobStatus jobStatus;
@JsonProperty(FIELD_NAME_START_TIME)
private final long startTime;
@JsonProperty(FIELD_NAME_END_TIME)
private final long endTime;
@JsonProperty(FIELD_NAME_DURATION)
private final long duration;
@JsonProperty(FIELD_NAME_NOW)
private final long now;
@JsonProperty(FIELD_NAME_TIMESTAMPS)
private final Map<JobStatus, Long> timestamps;
// metrics字段在这个类的字段里面
@JsonProperty(FIELD_NAME_JOB_VERTEX_INFOS)
private final Collection<JobVertexDetailsInfo> jobVertexInfos;
@JsonProperty(FIELD_NAME_JOB_VERTICES_PER_STATE)
private final Map<ExecutionState, Integer> jobVerticesPerState;
@JsonProperty(FIELD_NAME_JSON_PLAN)
@JsonRawValue
private