Flink 是流计算引擎
文章原址:https://blog.csdn.net/m0_49826240/article/details/114053579
在Flink任务开发过程中经常会遇到从外部存储获取数据的场景,比如处理日志数据时,需要获取到用户的信息。
最近在做一个指标,用到了异步IO,借此来记录下Flink Async I/O 读取MySql的方式。
需求:用户的行为数据通过Stream进入flink计算引擎,需要知道每个用户的注册时间、最近一次登录时间。
用户注册时间和登录时间是存放在User表中
User 表
-- user table: per-user registration / last-login times, looked up by userId
-- from the Flink async function.
CREATE TABLE `mall_user` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
  `userId` bigint(11) DEFAULT NULL,
  `registerTime` bigint(11) DEFAULT NULL,
  `lastLoadTime` bigint(11) DEFAULT NULL,
  -- MySQL requires the AUTO_INCREMENT column to be (part of) a key.
  PRIMARY KEY (`id`),
  -- The async lookup filters on userId; keep it indexed so the query
  -- stays within the 3s async timeout (the article itself warns about this).
  KEY `idx_userId` (`userId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- sample rows
INSERT INTO `mall_user` VALUES (1, 10086, 1614009599, 1616860800);
INSERT INTO `mall_user` VALUES (2, 10010, 1614009600, 1616861600);
User 类
@Data
public class User {
/**
 * user id (lookup key into mall_user)
 */
private Long userId;
/**
 * registration time — presumably epoch seconds, matching the sample
 * values in mall_user; TODO confirm the unit
 */
private Long registerTime;
/**
 * last login time — same presumed unit as registerTime; stays null when
 * the user has no row in mall_user
 */
private Long lastLoadTime;
}
数据流,采用读取指定端口来获得。
直接上代码,根据代码来讲解
1.main方法:
从 9980 端口读取数据后,转为User对象,此时user对象中只有userid,没有两个时间的值
关联维表获取两个时间值,取数据这个过程不能太长,所以设置了异步读取数据的超时时间为3秒。
import com.zyx.bigdata.flink.streaming.asynio.entity.User;
import com.zyx.bigdata.flink.streaming.asynio.operator.GetUserInfoByUserIdAsyncFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import java.util.concurrent.TimeUnit;
/**
* @Author zyx
* @Date 2020-10-20 15:23
*/
/**
 * Demo job: reads userIds from a local socket, wraps each into a {@link User}
 * (only the id is set), then asynchronously enriches it with registration /
 * last-login times from MySQL via {@link GetUserInfoByUserIdAsyncFunction}.
 *
 * <p>Input: one userId per line on localhost:9980 (e.g. {@code nc -lk 9980}).
 */
public class AsynGetUserInfoJon {

    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Parallelism 1 for local testing and deterministic console output.
        env.setParallelism(1);

        // Parse each socket line into a User carrying only the userId; the two
        // time fields are filled later by the async dimension lookup.
        SingleOutputStreamOperator<User> userStream =
                env.socketTextStream("localhost", 9980, "\n")
                        .flatMap(new FlatMapFunction<String, User>() {
                            @Override
                            public void flatMap(String value, Collector<User> out) {
                                // Trim first: socket input often carries stray whitespace
                                // that would make Long.parseLong throw.
                                String trimmed = value == null ? "" : value.trim();
                                if (trimmed.isEmpty()) {
                                    return; // silently skip blank lines
                                }
                                try {
                                    User user = new User();
                                    user.setUserId(Long.parseLong(trimmed));
                                    out.collect(user);
                                } catch (NumberFormatException e) {
                                    // Malformed id: log and drop, keep the stream alive.
                                    System.out.println("error " + e);
                                }
                            }
                        });

        // Enrich against the mall_user dimension table. Each lookup times out
        // after 3 seconds; timed-out records are emitted unchanged (see the
        // function's timeout() override), so no data is lost.
        SingleOutputStreamOperator<User> enriched = AsyncDataStream
                .unorderedWait(userStream, new GetUserInfoByUserIdAsyncFunction(), 3, TimeUnit.SECONDS);

        enriched.print();
        env.execute("Flink_Async_IO_Test");
    }
}
2.GetUserInfoByUserIdAsyncFunction 类,继承了 RichAsyncFunction 类。
这里需要添加两个依赖
<dependency>
<groupId>io.vertx</groupId>
<artifactId>vertx-sql-common</artifactId>
<version>3.8.5</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>io.vertx</groupId>
<artifactId>vertx-jdbc-client</artifactId>
<version>3.8.5</version>
<scope>compile</scope>
</dependency>
此处需要注意的点:
- 缓存的设置可以减少读取外部存储的次数
- 无论是否获取到外部数据,都需要将数据 out 出来,防止异步线程被一直占用
- 出现异常、超时都需要将数据吐出来。
- 从外部数据读取完后,记得 conn.close()
- 不明白的可以留言。后续出异步IO的源码阅读
package com.zyx.bigdata.flink.streaming.asynio.operator;
import com.zyx.bigdata.flink.streaming.asynio.entity.User;
import io.vertx.core.AsyncResult;
import io.vertx.core.Handler;
import io.vertx.core.Vertx;
import io.vertx.core.VertxOptions;
import io.vertx.core.json.JsonArray;
import io.vertx.core.json.JsonObject;
import io.vertx.ext.jdbc.JDBCClient;
import io.vertx.ext.sql.ResultSet;
import io.vertx.ext.sql.SQLClient;
import io.vertx.ext.sql.SQLConnection;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.shaded.guava18.com.google.common.cache.Cache;
import org.apache.flink.shaded.guava18.com.google.common.cache.CacheBuilder;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import java.util.Collections;
import java.util.concurrent.TimeUnit;
/**
* @Author axin
* @Date 2020-10-20 15:29
*/
/**
 * Async enrichment function: fills {@code registerTime} / {@code lastLoadTime}
 * on each incoming {@link User} from the MySQL table {@code mall_user}, using a
 * non-blocking Vert.x JDBC client plus a short-lived Guava cache to cut down on
 * database round trips.
 *
 * <p>Every code path — cache hit, DB hit, DB miss, connection failure, query
 * failure, timeout — completes the {@link ResultFuture}, otherwise the async
 * operator's slot would stay occupied and eventually stall the pipeline.
 */
public class GetUserInfoByUserIdAsyncFunction extends RichAsyncFunction<User, User> {

    /** Shared non-blocking JDBC client; created in open(), never serialized. */
    private transient SQLClient sqlClient;
    /** Per-user lookup cache; entries expire 5 minutes after write. */
    private transient volatile Cache<Long, User> userInfoCache;

    @Override
    public void open(Configuration parameters) {
        Vertx vertx = Vertx.vertx(new VertxOptions().setWorkerPoolSize(10).setEventLoopPoolSize(5));
        JsonObject config = new JsonObject()
                .put("url", "jdbc:mysql://localhost:3306/bigdata?serverTimezone=UTC")
                // mysql-connector-java 8.x: the legacy com.mysql.jdbc.Driver is
                // deprecated; the pom ships 8.0.20, so use the cj driver class.
                .put("driver_class", "com.mysql.cj.jdbc.Driver")
                .put("max_pool_size", 20)
                .put("user", "root")
                .put("password", "root");
        sqlClient = JDBCClient.createShared(vertx, config);
        // Read-through cache: consult it first, fall back to MySQL on a miss.
        userInfoCache = CacheBuilder.newBuilder()
                .initialCapacity(500)
                .maximumSize(1000)
                .expireAfterWrite(5, TimeUnit.MINUTES)
                .build();
    }

    @Override
    public void asyncInvoke(User inputUser, ResultFuture<User> resultFuture) {
        long userId = inputUser.getUserId();

        // Fast path: fully-populated cache entry.
        User cached = userInfoCache.getIfPresent(userId);
        if (cached != null && cached.getLastLoadTime() != null && cached.getRegisterTime() != null) {
            inputUser.setLastLoadTime(cached.getLastLoadTime());
            inputUser.setRegisterTime(cached.getRegisterTime());
            resultFuture.complete(Collections.singletonList(inputUser));
            return;
        }

        // Parameterized query — never build SQL by string concatenation
        // (injection risk). userId is indexed, keeping the lookup fast.
        String sql = "select `lastLoadTime`,`registerTime` from `mall_user` where `userId` = ?";
        sqlClient.getConnection(new Handler<AsyncResult<SQLConnection>>() {
            @Override
            public void handle(AsyncResult<SQLConnection> connResult) {
                if (connResult.failed()) {
                    // No connection available: emit the record un-enriched so the
                    // async slot is released instead of leaking (result() would be
                    // null here and the original code NPE'd).
                    resultFuture.complete(Collections.singletonList(inputUser));
                    return;
                }
                SQLConnection conn = connResult.result();
                try {
                    conn.queryWithParams(sql, new JsonArray().add(userId), resultSetAsyncResult -> {
                        try {
                            if (resultSetAsyncResult.succeeded()) {
                                ResultSet result = resultSetAsyncResult.result();
                                if (result.getRows() != null && result.getNumRows() > 0) {
                                    User userFromDB = new User();
                                    for (JsonObject row : result.getRows()) {
                                        userFromDB.setUserId(userId);
                                        userFromDB.setRegisterTime(row.getLong("registerTime"));
                                        userFromDB.setLastLoadTime(row.getLong("lastLoadTime"));
                                    }
                                    userInfoCache.put(userId, userFromDB);
                                    inputUser.setLastLoadTime(userFromDB.getLastLoadTime());
                                    inputUser.setRegisterTime(userFromDB.getRegisterTime());
                                }
                            }
                            // Success, miss, or failure: always emit exactly once.
                            // (The original retried by recursing into asyncInvoke with
                            // no attempt limit — an unbounded retry loop; best-effort
                            // emit is the safe default here.)
                            resultFuture.complete(Collections.singletonList(inputUser));
                        } finally {
                            // Close only after the async query callback has run;
                            // closing right after conn.query() (as before) can tear
                            // the connection down while the query is still in flight.
                            conn.close();
                        }
                    });
                } catch (Exception e) {
                    conn.close();
                    // Still emit on exception, otherwise the async thread blocks.
                    resultFuture.complete(Collections.singletonList(inputUser));
                    System.out.println("数据查询异常 " + e.getMessage());
                }
            }
        });
    }

    @Override
    public void timeout(User input, ResultFuture<User> resultFuture) {
        // Timed-out lookups must still be emitted, otherwise records are lost.
        resultFuture.complete(Collections.singletonList(input));
    }

    @Override
    public void close() {
        // Guard against open() having failed before these were assigned.
        if (sqlClient != null) {
            sqlClient.close();
        }
        if (userInfoCache != null) {
            userInfoCache.invalidateAll();
        }
    }
}
数据测试:
Terminal 发送数据到9980端口
zyx@axindeMacBook-Pro ~ % nc -lk 9980
10000
10080
10010
10000
10086
控制台打印到数据:
只有 userId = 10086 和 10010 的用户在数据库中有数据,所以能关联到时间属性值;10000 和 10080 的时间字段返回的仍是 null。
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>flink-demo</artifactId>
<groupId>com.zyx.flink</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>module-flink-asynio</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<flink.version>1.10.1</flink.version>
<java.version>1.8</java.version>
<scala.binary.version>2.11</scala.binary.version>
<maven.compiler.source>${java.version}</maven.compiler.source>
<maven.compiler.target>${java.version}</maven.compiler.target>
</properties>
<dependencies>
<!-- Apache Flink dependencies -->
<!-- These dependencies are provided, because they should not be packaged into the JAR file. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.7</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
<scope>runtime</scope>
</dependency>
<!-- vertx-jdbc -->
<dependency>
<groupId>io.vertx</groupId>
<artifactId>vertx-sql-common</artifactId>
<version>3.8.5</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>io.vertx</groupId>
<artifactId>vertx-jdbc-client</artifactId>
<version>3.8.5</version>
<scope>compile</scope>
</dependency>
<!-- mysql-connector -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.20</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Java Compiler -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<!-- We use the maven-shade plugin to create a fat jar that contains all necessary dependencies. -->
<!-- Change the value of <mainClass>...</mainClass> if your program entry point changes. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.0.0</version>
<executions>
<!-- Run shade goal on package phase -->
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<artifactSet>
<excludes>
<exclude>org.apache.flink:force-shading</exclude>
<exclude>com.google.code.findbugs:jsr305</exclude>
<exclude>org.slf4j:*</exclude>
<exclude>log4j:*</exclude>
</excludes>
</artifactSet>
<filters>
<filter>
<!-- Do not copy the signatures in the META-INF folder.
Otherwise, this might cause SecurityExceptions when using the JAR. -->
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>com.zyx.bigdata.flink.streaming.asynio.AsynGetUserInfoJon</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<!-- This improves the out-of-the-box experience in Eclipse by resolving some warnings. -->
<plugin>
<groupId>org.eclipse.m2e</groupId>
<artifactId>lifecycle-mapping</artifactId>
<version>1.0.0</version>
<configuration>
<lifecycleMappingMetadata>
<pluginExecutions>
<pluginExecution>
<pluginExecutionFilter>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<versionRange>[3.0.0,)</versionRange>
<goals>
<goal>shade</goal>
</goals>
</pluginExecutionFilter>
<action>
<ignore/>
</action>
</pluginExecution>
<pluginExecution>
<pluginExecutionFilter>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<versionRange>[3.1,)</versionRange>
<goals>
<goal>testCompile</goal>
<goal>compile</goal>
</goals>
</pluginExecutionFilter>
<action>
<ignore/>
</action>
</pluginExecution>
</pluginExecutions>
</lifecycleMappingMetadata>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
<!-- This profile helps to make things run out of the box in IntelliJ -->
<!-- Its adds Flink's core classes to the runtime class path. -->
<!-- Otherwise they are missing in IntelliJ, because the dependency is 'provided' -->
<profiles>
<profile>
<id>add-dependencies-for-IDEA</id>
<activation>
<property>
<name>idea.version</name>
</property>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
</profile>
</profiles>
</project>