业务场景1:埋点数据落hive表,且埋点数据中带有数据产生时的时间字段
业务流程:kafka->flume->hdfs->hive
问题:晚到的埋点数据会落到哪个分区中?9 点产生的埋点数据,由于数据上报或 flume sink 的延迟,还会落到 9 点的分区中么?答案是:不会。
需求:flume消费kafka数据按照server_time时间字段,决定数据落的分区
pom.xml
<dependencies>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-core</artifactId>
        <version>1.9.0</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <!-- 1.2.47 is affected by a well-known autoType deserialization RCE
             bypass; 1.2.83 is the patched release of the 1.2.x line. -->
        <version>1.2.83</version>
    </dependency>
</dependencies>
新建: TimeInterceptor.java
package zm.develop;
import com.alibaba.fastjson.JSON;
import org.apache.commons.compress.utils.Charsets;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Map;
/**
 * Flume interceptor that reads the "server_time" field from a JSON event body
 * and writes its epoch-millis value into a Flume header (default "timestamp"),
 * so the HDFS sink partitions by event time instead of arrival time.
 */
public class TimeInterceptor implements Interceptor {
    private static final Logger logger = LoggerFactory.getLogger(TimeInterceptor.class);

    /** server_time format emitted by the tracking SDK, e.g. 2020-03-20T04:46:42.926+0800. */
    private static final String TIME_PATTERN = "yyyy-MM-dd'T'HH:mm:ss.SSSZ";

    // Mirror the built-in TimestampInterceptor's options so flume.conf can tune them.
    private final boolean preserveExisting;
    private final String header;

    private TimeInterceptor(boolean preserveExisting, String header) {
        this.preserveExisting = preserveExisting;
        this.header = header;
    }

    public void initialize() { }

    /**
     * Stamps the configured header with the event's server_time in epoch millis.
     * Falls back to the current time when the field is missing, empty, or the
     * body is not parseable, so every event leaving this interceptor carries a
     * timestamp header (previously a parse failure skipped the header entirely,
     * which breaks %Y/%m/%d escaping in the HDFS sink path).
     */
    public Event intercept(Event event) {
        Map<String, String> headers = event.getHeaders();
        // Honor preserveExisting: keep an upstream-provided timestamp untouched.
        if (preserveExisting && headers.containsKey(header)) {
            return event;
        }
        long timeStamp;
        try {
            String line = new String(event.getBody(), Charsets.UTF_8);
            String serverTime = JSON.parseObject(line).getString("server_time");
            if (serverTime == null || serverTime.isEmpty()) {
                timeStamp = System.currentTimeMillis();
            } else {
                // SimpleDateFormat is not thread-safe; build one per call instead
                // of caching it in a shared field.
                timeStamp = new SimpleDateFormat(TIME_PATTERN).parse(serverTime).getTime();
            }
        } catch (Exception e) {
            // Malformed JSON or an unparseable date: log with the stack trace and
            // fall back to arrival time rather than dropping the header.
            logger.error("Failed to extract server_time from event body", e);
            timeStamp = System.currentTimeMillis();
        }
        // Use the configured header name (was hardcoded to "timestamp" before,
        // silently ignoring the headerName setting).
        headers.put(header, Long.toString(timeStamp));
        return event;
    }

    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    public void close() { }

    /** Wired in flume.conf via ...interceptors.i1.type = zm.develop.TimeInterceptor$Builder */
    public static class Builder implements Interceptor.Builder {
        private boolean preserveExisting = false;
        private String header = "timestamp";

        @Override
        public Interceptor build() {
            return new TimeInterceptor(preserveExisting, header);
        }

        @Override
        public void configure(Context context) {
            preserveExisting = context.getBoolean("preserveExisting", false);
            header = context.getString("headerName", "timestamp");
        }
    }
}
需求2:解析单条数据,输出多条数据
场景:将一条json数组的数据,拆分成多个单条数据
package com.zhangmen;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.compress.utils.Charsets;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
/**
 * Flume interceptor that:
 *  1. splits a JSON-array event body into one JSON object per line,
 *  2. drops unparseable data instead of aborting the batch, and
 *  3. stamps the configured header (default "timestamp") with the event's
 *     server_time so the HDFS sink partitions by event time.
 */
public class TimeInterceptor implements Interceptor {
    private static final Logger logger = LoggerFactory.getLogger(TimeInterceptor.class);

    /** server_time format, e.g. 2020-03-20T04:46:42.926+0800. */
    private static final String TIME_PATTERN = "yyyy-MM-dd'T'HH:mm:ss.SSSZ";

    private final boolean preserveExisting;
    private final String header;

    private TimeInterceptor(boolean preserveExisting, String header) {
        this.preserveExisting = preserveExisting;
        this.header = header;
    }

    public void initialize() {
    }

    public Event intercept(Event event) {
        Map<String, String> headers = event.getHeaders();
        String jsonStr = new String(event.getBody(), Charsets.UTF_8);
        try {
            List<String> jsonList = new ArrayList<>(2);
            long timeStamp = System.currentTimeMillis();
            String trimmed = jsonStr.trim();
            if (!trimmed.isEmpty()) {
                if (trimmed.startsWith("[")) {
                    // JSON array: emit one object per line. Use the first
                    // element's server_time for the timestamp header — the array
                    // branch previously never set the header at all, so array
                    // events were not partitioned by event time.
                    JSONArray objects = JSON.parseArray(trimmed);
                    for (int i = 0; i < objects.size(); i++) {
                        JSONObject jsonObject = objects.getJSONObject(i);
                        if (i == 0) {
                            timeStamp = extractTimestamp(jsonObject);
                        }
                        jsonList.add(jsonObject.toString());
                    }
                } else {
                    // Single JSON object: pass the original string through unchanged.
                    timeStamp = extractTimestamp(JSON.parseObject(trimmed));
                    jsonList.add(jsonStr);
                }
            }
            // Honor preserveExisting and the configured header name (both were
            // previously accepted in configure() but ignored here).
            if (!(preserveExisting && headers.containsKey(header))) {
                headers.put(header, Long.toString(timeStamp));
            }
            // Explicit UTF-8: getBytes() without a charset depends on the JVM's
            // platform default.
            event.setBody(String.join("\n", jsonList).getBytes(Charsets.UTF_8));
        } catch (Exception e) {
            // Bad data: set an EMPTY body rather than null — a null body makes
            // downstream new String(event.getBody()) throw NPE.
            event.setBody(new byte[0]);
            logger.error(e.getMessage(), e);
        }
        return event;
    }

    /**
     * Returns server_time as epoch millis, or the current time when the field
     * is absent, empty, or unparseable. A return value replaces the old shared
     * mutable timeStamp field.
     */
    private long extractTimestamp(JSONObject jsonObject) {
        String serverTime = jsonObject.getString("server_time");
        if (serverTime == null || serverTime.isEmpty()) {
            return System.currentTimeMillis();
        }
        try {
            // SimpleDateFormat is not thread-safe; create per call.
            return new SimpleDateFormat(TIME_PATTERN).parse(serverTime).getTime();
        } catch (ParseException e) {
            // Pass the exception itself (the old code passed e.getStackTrace(),
            // which SLF4J cannot render as a throwable).
            logger.error("解析数据异常", e);
            return System.currentTimeMillis();
        }
    }

    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    public void close() {
    }

    public static class Builder implements Interceptor.Builder {
        private boolean preserveExisting = false;
        private String header = "timestamp";

        @Override
        public Interceptor build() {
            return new TimeInterceptor(preserveExisting, header);
        }

        @Override
        public void configure(Context context) {
            preserveExisting = context.getBoolean("preserveExisting", false);
            header = context.getString("headerName", "timestamp");
        }
    }
}
需求3:从外部传入字段,根据传入字段来获取对应的数据
其中flume.conf 配置如下:
# Kafka source: consume tracking events from topic zsq_test.
mirana_whiteboard0601.sources.s1.type = org.apache.flume.source.kafka.KafkaSource
mirana_whiteboard0601.sources.s1.kafka.bootstrap.servers = xxxx:9092
mirana_whiteboard0601.sources.s1.kafka.topics = zsq_test
mirana_whiteboard0601.sources.s1.kafka.consumer.group.id = flume_mirana_hdfs
mirana_whiteboard0601.sources.s1.channels = c1
# Custom interceptor replaces the built-in "timestamp" interceptor (kept below,
# commented out): it stamps the header from the event's server_time field
# instead of the arrival time.
mirana_whiteboard0601.sources.s1.interceptors = i1
#mirana_whiteboard0601.sources.s1.interceptors.i1.type = timestamp
mirana_whiteboard0601.sources.s1.interceptors.i1.type = com.zhangmen.TimeInterceptor$Builder
# Ordered column list the interceptor extracts from each JSON event; joined
# with \001 for the Hive table.
mirana_whiteboard0601.sources.s1.interceptors.i1.extractorTableColumn = userid,device_id,appid,version,event_para,start_time,server_time
mirana_whiteboard0601.sources.s1.kafka.consumer.timeout.ms = 7000
mirana_whiteboard0601.sources.s1.batchSize=10000
# 10 MB per-partition fetch, to keep up with large bursts.
mirana_whiteboard0601.sources.s1.kafka.consumer.max.partition.fetch.bytes = 10485760
# Enable only for a first run / replay; default offset reset is "latest".
#mirana_whiteboard0601.sources.s1.kafka.consumer.auto.offset.reset = earliest
拦截器代码如下:
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.parser.Feature;
import org.apache.commons.compress.utils.Charsets;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Column-extraction interceptor:
 *  1. When "extractorTableColumn" is configured, emits the matching field
 *     values joined by \001, in the configured column order.
 *  2. Otherwise emits every field value in the JSON string's own order.
 * Also stamps the configured header (default "timestamp") with the event's
 * server_time for event-time partitioning.
 */
public class TimeInterceptorbak implements Interceptor {
    private static final Logger logger = LoggerFactory.getLogger(TimeInterceptorbak.class);

    /** server_time format, e.g. 2020-03-20T04:46:42.926+0800. */
    private static final String TIME_PATTERN = "yyyy-MM-dd'T'HH:mm:ss.SSSZ";

    private final boolean preserveExisting;
    private final String header;
    private final String extractorTableColumn;

    // Instance field (was static): independently configured interceptor
    // instances must not share one column list.
    private List<String> columnList = null;

    private TimeInterceptorbak(boolean preserveExisting, String header, String extractorTableColumn) {
        this.preserveExisting = preserveExisting;
        this.header = header;
        this.extractorTableColumn = extractorTableColumn;
    }

    /**
     * Parses the configured column list. split(",") handles both "a,b,c" and a
     * single "a" — the old single-column branch called add() on a null list and
     * threw NPE. The null check now comes BEFORE length() (the old order could
     * itself NPE on a null setting).
     */
    public void initialize() {
        logger.info("=======> 初始化传入表字段列表为:{}", this.extractorTableColumn);
        if (Objects.nonNull(extractorTableColumn) && !extractorTableColumn.trim().isEmpty()) {
            columnList = Arrays.stream(extractorTableColumn.trim().split(","))
                    .collect(Collectors.toList());
        }
    }

    public Event intercept(Event event) {
        Map<String, String> headers = event.getHeaders();
        String jsonStr = new String(event.getBody(), Charsets.UTF_8);
        try {
            JSONObject jsonObject = JSON.parseObject(jsonStr);
            long timeStamp = extractTimestamp(jsonObject);
            // Honor preserveExisting and the configured header name.
            if (!(preserveExisting && headers.containsKey(header))) {
                headers.put(header, Long.toString(timeStamp));
            }
            event.setBody(joinColumns(jsonStr).getBytes(Charsets.UTF_8));
        } catch (Exception e) {
            // Malformed JSON previously escaped uncaught and could abort the
            // whole Flume batch; log and forward the event unmodified instead.
            logger.error("解析数据异常", e);
        }
        return event;
    }

    /**
     * Returns server_time as epoch millis, or the current time when the field
     * is absent, empty, or unparseable. Replaces the old shared mutable
     * timeStamp field with a return value.
     */
    private long extractTimestamp(JSONObject jsonObject) {
        String serverTime = jsonObject.getString("server_time");
        if (serverTime == null || serverTime.isEmpty()) {
            return System.currentTimeMillis();
        }
        try {
            // SimpleDateFormat is not thread-safe; create per call.
            return new SimpleDateFormat(TIME_PATTERN).parse(serverTime).getTime();
        } catch (ParseException e) {
            // Pass the exception itself (e.getStackTrace() is not rendered by SLF4J).
            logger.error("解析数据异常", e);
            return System.currentTimeMillis();
        }
    }

    /**
     * Joins field values with \001. With a configured column list the values
     * follow that order; otherwise all values follow the JSON's own field
     * order. Null/missing values render as "null" in BOTH branches — the old
     * no-column branch called toString() on a null value and threw NPE.
     */
    private String joinColumns(String json) {
        List<String> values = new ArrayList<>();
        if (Objects.nonNull(columnList)) {
            Map<String, Object> map = JSON.parseObject(json, Map.class);
            for (String column : columnList) {
                Object value = map.get(column);
                if (Objects.nonNull(value)) {
                    // Escape raw newlines and strip any embedded \001 so the
                    // Hive row/field delimiters stay unambiguous.
                    values.add(value.toString().replaceAll("\n", "\\\\n")
                            .replaceAll("\\001", ""));
                } else {
                    values.add(null); // String.join renders a null element as "null"
                }
            }
        } else {
            // LinkedHashMap + OrderedField preserves the JSON's original field order.
            Map<String, Object> map = JSON.parseObject(json, LinkedHashMap.class, Feature.OrderedField);
            for (Map.Entry<String, Object> entry : map.entrySet()) {
                values.add(String.valueOf(entry.getValue()));
            }
        }
        return String.join("\001", values);
    }

    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    public void close() {
    }

    public static class Builder implements Interceptor.Builder {
        private boolean preserveExisting = false;
        private String header = "timestamp";
        private String extractorTableColumn = "";

        @Override
        public Interceptor build() {
            return new TimeInterceptorbak(preserveExisting, header, extractorTableColumn);
        }

        @Override
        public void configure(Context context) {
            preserveExisting = context.getBoolean("preserveExisting", false);
            header = context.getString("headerName", "timestamp");
            extractorTableColumn = context.getString("extractorTableColumn", "");
        }
    }
}