基于Flink1.9,统计某个日志中每小时的数据量,并输出到MySQL。
主函数
public static void main(String[] args) throws Exception {
    // Flink job: count user-auth log events per hour from Kafka and sink the totals to MySQL.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); // windows are driven by the event's own timestamp
    env.enableCheckpointing(500); // checkpoint every 500 ms
    Properties properties = Property.getKafkaProperties(Constants.GET_USER_AUTH_NAME);
    FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<String>(Constants.TOPIC_USER_AUTH, new SimpleStringSchema(), properties);
    consumer.setStartFromGroupOffsets();
    DataStream<String> edits = env.addSource(consumer).name("user_auth_edit");
    DataStream<UserAuth> calStream = edits
            .filter(new FilterFunction<String>() {
                // Compiled once per task instance instead of once per record.
                // Kept transient + lazily initialized because java.util.regex.Pattern is
                // not Serializable and Flink serializes this function when submitting the job.
                private transient Pattern tsPattern;

                /**
                 * Drops records whose "ts" is malformed or lies in the future, so
                 * abnormal timestamps cannot stall the event-time windows.
                 */
                @Override
                public boolean filter(String s) throws Exception {
                    try {
                        JSONObject jsonObject = JSONObject.parseObject(s);
                        String ts = jsonObject.getString("ts");
                        if (tsPattern == null) {
                            tsPattern = Pattern.compile("\\d{10}"); // epoch seconds: exactly 10 digits
                        }
                        if (!tsPattern.matcher(ts).matches()) {
                            return false;
                        }
                        // Tolerate up to 30 minutes of clock skew between producers and this server;
                        // anything further in the future is rejected.
                        long now = System.currentTimeMillis() / 1000L + 30 * 60;
                        long time = Long.parseLong(ts); // long avoids the 2038 int-overflow on epoch seconds
                        return now >= time;
                    } catch (Exception e) {
                        System.out.println("Filter failed");
                        System.err.println(e + " " + e.getMessage());
                        return false;
                    }
                }
            })
            .assignTimestampsAndWatermarks(new TaskTimestamp(Time.hours(0))) // watermark trails event time by 0 h
            .windowAll(TumblingEventTimeWindows.of(Time.hours(1))) // one global (non-keyed) window per hour
            .allowedLateness(Time.hours(2)) // late records re-fire the window for up to 2 h
            .aggregate(new UserAuthCount());
    calStream.addSink(new SinkToMySQL());
    env.execute("executed user auth"); // blocks until the streaming job terminates
    System.out.println("executed"); // only reached after the job finishes/cancels
}
UserAuthCount.java(窗口聚合函数,累加器与输出类型均为 UserAuth)
/**
 * Window aggregate: counts records per hourly window.
 * IN = raw JSON log line, ACC = OUT = UserAuth (hour bucket + running count).
 */
public class UserAuthCount implements AggregateFunction<String, UserAuth, UserAuth> {

    @Override
    public UserAuth createAccumulator() {
        // numbers == 0 doubles as the "no record seen yet" marker used in add().
        return new UserAuth(0L);
    }

    @Override
    public UserAuth add(String s, UserAuth userAuth) {
        if (userAuth.getNumbers() == 0L) {
            // First record of the window: derive the hour bucket (yyyyMMddHH) from the
            // event's epoch-second "ts". SimpleDateFormat is created locally because it
            // is not thread-safe; this only happens once per window, so the cost is negligible.
            JSONObject jsonObject = JSONObject.parseObject(s);
            SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHH");
            String time_hour = sdf.format(new Date(Long.parseLong(jsonObject.getString("ts")) * 1000L));
            userAuth.setTime_hour(time_hour);
            userAuth.setNumbers(1L);
        } else {
            userAuth.setNumbers(userAuth.getNumbers() + 1L);
        }
        return userAuth;
    }

    @Override
    public UserAuth getResult(UserAuth userAuth) {
        System.out.println("result is " + userAuth);
        return userAuth;
    }

    @Override
    public UserAuth merge(UserAuth userAuth, UserAuth acc1) {
        // FIX: previously returned null, which violates the AggregateFunction contract
        // and would NPE if Flink ever merged partial accumulators (e.g. session windows).
        userAuth.setNumbers(userAuth.getNumbers() + acc1.getNumbers());
        if (userAuth.getTime_hour() == null) {
            userAuth.setTime_hour(acc1.getTime_hour());
        }
        return userAuth;
    }
}
解读:Window窗口AggregateFunction 实现消息事件的次数和累计值
@param <IN> The type of the values that are aggregated (input values) 可以理解为输入流数据类型,本例中为 String(Kafka 中的一条 JSON 日志)
@param <ACC> The type of the accumulator (intermediate aggregate state). accumulator 累加器的类型,本例中为 UserAuth,包含 time_hour(小时桶)与 numbers(该小时内的事件数)
@param <OUT> The type of the aggregated result 聚合结果类型,本例中同样为 UserAuth
public interface AggregateFunction<IN, ACC, OUT> extends Function, Serializable
SinkToMySQL.java
// NOTE(review): `implements SinkFunction<UserAuth>` is redundant — RichSinkFunction already implements it.
public class SinkToMySQL extends RichSinkFunction<UserAuth> implements SinkFunction<UserAuth> {
// NOTE(review): static JDBC handles are shared by every sink subtask running in the same JVM;
// with parallelism > 1 each open() overwrites the previous subtask's connection/statements.
// These should be non-static (and transient) instance fields.
private static PreparedStatement ps;        // insert a new hour row
private static PreparedStatement selectps;  // look up an existing row by time_hour
private static PreparedStatement updateps;  // raise numbers for an existing hour
private static Connection connection = null;
/**
 * Establishes the JDBC connection and prepares all statements once at sink
 * start-up, so invoke() does not open/close a connection per record.
 *
 * @param parameters Flink runtime configuration
 * @throws Exception if the connection cannot be established or a statement
 *                   cannot be prepared — rethrown so the task fails fast
 */
@Override
public void open(Configuration parameters) throws Exception {
    super.open(parameters);
    try {
        connection = getConnection();
        String selectSql = "select id,time_hour, numbers from auth_time where time_hour = ?;";
        selectps = connection.prepareStatement(selectSql);
        String sql = "insert into auth_time(time_hour, numbers,create_time,update_time) values(?, ?,?,?);";
        ps = connection.prepareStatement(sql);
        String updateSql = "update auth_time set numbers = ?,update_time = ? where time_hour = ?;";
        updateps = connection.prepareStatement(updateSql);
    } catch (Exception e) {
        System.out.println("sinkToMysql open is error " + e.getMessage());
        // FIX: previously swallowed — the sink then kept running with null statements
        // and every invoke() failed with an NPE. Fail fast instead.
        throw e;
    }
}
/**
 * Opens a MySQL connection using the configured driver class and URL.
 *
 * @return a live JDBC connection (never null)
 * @throws IllegalStateException if the driver cannot be loaded or the
 *                               connection attempt fails, with the original cause attached
 */
private static Connection getConnection() {
    try {
        // Explicit driver load (only needed for pre-JDBC4 drivers, kept for safety).
        Class.forName(Property.getStrValue("mysql.classname"));
        String url = Property.getMysqlUrl();
        connection = DriverManager.getConnection(url);
        System.out.println("数据库连接建立成功");
    } catch (Exception e) {
        System.out.println("-----------mysql get connection has exception , msg = " + e.getMessage());
        // FIX: previously swallowed and returned null, which only surfaced later
        // as an NPE in open()/invoke(). Propagate with the cause preserved.
        throw new IllegalStateException("cannot connect to MySQL", e);
    }
    return connection;
}
/**
 * Releases every JDBC resource when the sink shuts down: the three prepared
 * statements first, then the connection itself.
 *
 * @throws Exception propagated from super.close()
 */
@Override
public void close() throws Exception {
    super.close();
    try {
        // Statements and their log messages, closed in the original order.
        PreparedStatement[] statements = {ps, selectps, updateps};
        String[] messages = {"关闭插入ps", "关闭查询ps", "关闭更新ps"};
        for (int i = 0; i < statements.length; i++) {
            if (statements[i] != null) {
                System.out.println(messages[i]);
                statements[i].close();
            }
        }
        if (connection != null && !connection.isClosed()) {
            System.out.println("关闭数据库连接");
            connection.close();
        }
    } catch (Exception e) {
        System.out.println("close mysql is error " + e.getMessage());
    }
}
/**
 * Writes one window result: inserts a new row for the hour bucket, or raises
 * the stored count when a late-data re-fire produced a larger total.
 * Called once per element emitted by the aggregate.
 *
 * @param userAuth aggregated count for one hour bucket
 * @param context  sink context supplied by Flink
 * @throws SQLException if closing the result set fails
 */
@Override
public void invoke(UserAuth userAuth, Context context) throws SQLException {
    ResultSet resultSet = null;
    try {
        // Probe the connection first: a stale/expired connection fails on setString.
        try {
            selectps.setString(1, userAuth.getTime_hour());
        } catch (MySQLNonTransientConnectionException e) {
            System.out.println("mysql 连接过期,重新连接");
            // FIX: previously this branch only logged and never reconnected,
            // so every subsequent record kept failing on the dead connection.
            reopenStatements();
        }
        selectps.setString(1, userAuth.getTime_hour());
        resultSet = selectps.executeQuery();
        if (!resultSet.next()) {
            // No row for this hour yet: insert one.
            ps.setString(1, userAuth.getTime_hour());
            ps.setLong(2, userAuth.getNumbers());
            ps.setInt(3, (int) (System.currentTimeMillis() / 1000));
            ps.setInt(4, (int) (System.currentTimeMillis() / 1000));
            System.out.println(ps);
            ps.execute();
        } else {
            System.out.println("已有该时段数据");
            long resultAuthNum = resultSet.getLong("numbers");
            // Only update when the new total is larger, so a re-fired window with
            // fewer records cannot shrink an already-stored count.
            if (resultAuthNum < userAuth.getNumbers()) {
                System.out.println("Add number");
                updateps.setLong(1, userAuth.getNumbers());
                updateps.setInt(2, (int) (System.currentTimeMillis() / 1000));
                updateps.setString(3, userAuth.getTime_hour());
                System.out.println(updateps);
                updateps.execute();
            }
        }
    } catch (Exception e) {
        // Best-effort sink: log and drop this record rather than failing the task.
        System.err.println("invoke is error " + e.getMessage());
    } finally {
        if (resultSet != null) {
            resultSet.close();
        }
    }
}

/** Re-opens the MySQL connection and re-prepares all statements after a dropped connection. */
private void reopenStatements() throws SQLException {
    connection = getConnection();
    selectps = connection.prepareStatement("select id,time_hour, numbers from auth_time where time_hour = ?;");
    ps = connection.prepareStatement("insert into auth_time(time_hour, numbers,create_time,update_time) values(?, ?,?,?);");
    updateps = connection.prepareStatement("update auth_time set numbers = ?,update_time = ? where time_hour = ?;");
}
解读:此处为自定义sinkFunction,继承AbstractRichFunction,是一个抽象类,实现了RichFunction接口。
1、open方法,进行初始化;2、invoke方法,进行record输出
测试:
模拟向kafka中生产13:00到21:00的数据,数据结构如下:
{
"subject_id": "test",
"subject_name": "test",
"resource_id": "test",
"client_ip": "234.215.14.137",
"timestamp": "2020-03-17T16:02:32+0800",
"ts": "1584432152"
}