问题描述: 使用异步 IO 访问 HBase 时,HBase 需要 Kerberos 验证,而 Kerberos 验证需要把验证文件(keytab)加载到分布式缓存中;但是 Flink 异步 IO 算子内部无法访问分布式缓存,导致 Kerberos 验证失败、作业报错。
好了,不说废话,直接上解决方案:
在执行异步io操作之前,使用map方法构建对hbase的连接,使用静态变量,创建的hbase连接存储在内存中,后续使用hbase客户端的操作可以直接从taskManager的内存中去获取对应的连接即可
主要测试代码:
import java.util.Properties
import java.util.concurrent.TimeUnit
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.AsyncDataStream
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
/**
* hbase: async io
*/
object RealTimeMetric2 {

  def main(args: Array[String]): Unit = {
    val params: ParameterTool = ParameterTool.fromArgs(args)
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // Ship the Kerberos keytab to every TaskManager via the distributed cache,
    // and make the CLI parameters visible to all operators.
    env.registerCachedFile(params.get("keytabPath"), "keytab")
    env.getConfig.setGlobalJobParameters(params)
    env.setParallelism(6)

    val kafkaProps = new Properties()
    kafkaProps.setProperty("bootstrap.servers", "localhost:9092")
    kafkaProps.setProperty("group.id", "test_async")

    val consumer: FlinkKafkaConsumer011[String] =
      new FlinkKafkaConsumer011[String]("async_test_zyh", new SimpleStringSchema, kafkaProps)
    // Consume from the latest offsets. Alternatives for replay:
    //   consumer.setStartFromEarliest()
    //   consumer.setStartFromTimestamp(TimesUtils.getBeforeTenMinuteTimesTamp)
    consumer.setStartFromLatest()

    val rawStream: DataStream[String] = env.addSource(consumer)

    // Async I/O operators cannot read the distributed cache, so the Kerberos
    // login cannot happen inside the async function. Workaround: this map
    // operator authenticates once per TaskManager and stashes the HBase
    // connection in a static factory; later async operators on the same
    // TaskManager reuse it without re-authenticating.
    val authenticated: DataStream[String] = rawStream
      .map(new MyRichMapFunction)
      .disableChaining()

    // unorderedWait: results may be emitted out of order.
    // Timeout: 120000 ms — by default a timed-out request fails the job unless
    //          AsyncFunction#timeout is overridden.
    // Capacity: at most 40 concurrent in-flight requests.
    val enriched: DataStream[String] = AsyncDataStream
      .unorderedWait(authenticated,
        new DimClassHBaseAsyncFunction(params),
        120000,
        TimeUnit.MILLISECONDS,
        40)
      .disableChaining()
    enriched.print("result").setParallelism(1)

    // A second consumer of the authenticated stream, joined synchronously.
    val joined: DataStream[String] =
      authenticated.map(new DimensionJoinRichMapFunction).disableChaining()
    joined.print("result2").setParallelism(1)

    env.execute()
  }
}
map操作,进行kerberos验证,并创建对应的连接:
import java.util
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.configuration.Configuration
import org.apache.hadoop.hbase.client.Connection
import pro.hbase.{HBaseClient, HBaseConnectionFactory}
/**
* 这个方法就创建一下hbase的链接,加载一下hbase的分布式缓存验证文件
*/
class MyRichMapFunction extends RichMapFunction[String, String] {

  private var hBaseClient: HBaseClient = _

  /**
   * One-time per-task setup: resolves the keytab file from the distributed
   * cache, merges it into the global job parameters, and opens (or reuses)
   * the HBase connection cached in HBaseConnectionFactory — performing the
   * Kerberos login as a side effect so downstream async operators on this
   * TaskManager need not authenticate again.
   */
  override def open(parameters: Configuration): Unit = {
    val globalParams: ParameterTool = getRuntimeContext.getExecutionConfig
      .getGlobalJobParameters.asInstanceOf[ParameterTool]
    val keytabFile: String =
      getRuntimeContext.getDistributedCache.getFile("keytab").getPath

    // Copy the job parameters and overlay the resolved keytab path under both
    // keys the factory/HBase config may look at.
    val settings: util.HashMap[String, String] =
      new util.HashMap[String, String](globalParams.toMap)
    settings.put("keytab.file", keytabFile)
    settings.put("kerberos_KeyTabPath", keytabFile)

    hBaseClient = new HBaseClient(HBaseConnectionFactory.getConnection(settings))
  }

  /** Pass-through: records are forwarded unchanged; only open() does work. */
  override def map(in: String): String = in
}
对应的工具类:
public class HBaseConnectionFactory {

    // One connection per cluster, shared by every task in this TaskManager JVM.
    // Guarded by the class lock below; HashMap itself is not thread-safe.
    private static HashMap<String, Connection> poolFactory = new HashMap<String, Connection>();

    /**
     * Returns (creating on first use) the shared HBase connection for the
     * cluster named by {@code map.get("cluster_name")}, performing the
     * Kerberos login first when {@code isKerberos} is "true".
     *
     * @param map job parameters; must contain "cluster_name", may contain
     *            "isKerberos", "kerberos_UserName", "kerberos_KeyTabPath"
     * @return the cached connection, or null if "cluster_name" is absent
     * @throws IOException if the Kerberos login or connection creation fails
     */
    public static Connection getConnection(Map<String, String> map) throws IOException {
        String clusterName = map.get("cluster_name");
        if (clusterName == null) {
            // Preserves the original behavior of returning null for a missing name.
            return null;
        }
        // BUGFIX: the original did an unsynchronized containsKey() read before
        // locking (double-checked locking without volatile on a plain HashMap),
        // which is a data race. The whole check-then-create now runs under the lock.
        synchronized (HBaseConnectionFactory.class) {
            if (!poolFactory.containsKey(clusterName)) {
                Configuration conf = new Configuration();
                for (Map.Entry<String, String> entry : map.entrySet()) {
                    String key = entry.getKey();
                    // BUGFIX: the original chained negated equals() with ||,
                    // which is true for EVERY key, so the bookkeeping keys
                    // leaked into the HBase Configuration. They must all be
                    // excluded, hence &&.
                    if (!key.equals("isKerberos")
                            && !key.equals("kerberos_UserName")
                            && !key.equals("kerberos_KeyTabPath")
                            && !key.equals("cluster_name")) {
                        conf.set(key, entry.getValue());
                    }
                }
                String isKerberos = map.get("isKerberos");
                if ("true".equalsIgnoreCase(isKerberos)) {
                    // Kerberos login with the keytab shipped via the distributed cache.
                    UserGroupInformation.setConfiguration(conf);
                    UserGroupInformation.loginUserFromKeytab(
                            map.get("kerberos_UserName"), map.get("kerberos_KeyTabPath"));
                    User user = User.create(UserGroupInformation.getLoginUser());
                    poolFactory.put(clusterName, ConnectionFactory.createConnection(conf, user));
                } else {
                    poolFactory.put(clusterName, ConnectionFactory.createConnection(conf));
                }
            }
            return poolFactory.get(clusterName);
        }
    }
}
异步操作:
import java.util
import java.util.concurrent.{CompletableFuture, ExecutorService, Executors, TimeUnit}
import java.util.function.{BiConsumer, Consumer, Supplier}
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.shaded.guava18.com.google.common.cache.{Cache, CacheBuilder}
import org.apache.flink.streaming.api.scala.async.{AsyncFunction, ResultFuture}
import org.apache.hadoop.hbase.client.Connection
import pro.hbase.{HBaseClient, HBaseConnectionFactory}
import pro.utils.RowKeyUtil
/**
* HBase 维表
*/
class DimClassHBaseAsyncFunction(parameterTool: ParameterTool) extends AsyncFunction[String, String] {

  // HBase connection settings forwarded from the job parameters.
  val map: util.Map[String, String] = parameterTool.toMap

  // These are not serializable, so they are created lazily on the task side.
  // The Kerberos login already happened in the upstream map operator, so the
  // connection can be fetched from HBaseConnectionFactory by cluster_name
  // without touching the distributed cache here.
  private lazy val connection: Connection = HBaseConnectionFactory.getConnection(map)
  private lazy val client: HBaseClient = new HBaseClient(connection)
  private lazy val service: ExecutorService = Executors.newFixedThreadPool(50)

  // Local LRU cache: at most 2000 entries, each expiring 120 s after write.
  private lazy val cache: Cache[String, String] = CacheBuilder.newBuilder()
    .maximumSize(2000)
    .expireAfterWrite(120, TimeUnit.SECONDS)
    .build()

  /**
   * Looks up the dimension value for `input`: first in the local cache, then
   * asynchronously in HBase on the dedicated thread pool. Always completes
   * `resultFuture` exactly once, on success or failure.
   *
   * @param input        the raw record (also the source of the row key)
   * @param resultFuture callback the Flink async operator waits on
   */
  override def asyncInvoke(input: String, resultFuture: ResultFuture[String]): Unit = {
    // NOTE: the original issued a blocking existence check against HBase here
    // on every record, defeating the purpose of async I/O; it was removed.
    val rowKey: String = RowKeyUtil.reverse(input)
    val cached: String = cache.getIfPresent(rowKey)
    if (cached != null) {
      resultFuture.complete(Array(cached))
    } else {
      CompletableFuture.supplyAsync(new Supplier[String] {
        override def get(): String = {
          val dimensionMap: util.HashMap[String, String] =
            client.getData(rowKey, "wx_rt:dim_class", "f")
          // getOrDefault never yields null, so the value is always cacheable;
          // the original's post-hoc null check was dead code.
          val value: String = dimensionMap.getOrDefault("class_name", "null")
          cache.put(rowKey, value)
          value
        }
      }, service)
        .whenComplete(new BiConsumer[String, Throwable] {
          override def accept(result: String, error: Throwable): Unit = {
            // BUGFIX: the original only handled success (thenAccept). If the
            // HBase read threw, resultFuture was never completed and the
            // record hung until the async-wait timeout. Propagate the failure.
            if (error != null) {
              resultFuture.completeExceptionally(error)
            } else {
              resultFuture.complete(Array(result))
            }
          }
        })
    }
  }

  /**
   * Timeout handler: emits the literal string "null" instead of failing the
   * job. (Completing with a real null would NPE downstream.)
   */
  override def timeout(input: String, resultFuture: ResultFuture[String]): Unit = {
    println(s"time out error $input !!!!")
    resultFuture.complete(Array("null"))
  }
}
好了,这只是我在开发时的一个临时解决方案,相信还有更好的方法,可以一起留言交流,互相学习。