1、Spark Streaming更新广播变量的方式
在Driver端每来一条数据就通过累加器判断是否需要更新广播变量,通过这种方式就可以近似实现定时更新广播变量。
lines.foreachRDD(rdd=>{
// Obtain the broadcast dimension table via a driver-side singleton.
// (The body of foreachRDD runs on the driver once per batch, so this is safe.)
val dimTable = DimTable.getInstance(rdd.sparkContext)
// The accumulator stores the timestamp (epoch millis) of the last broadcast
// refresh; compute how much time has elapsed since then.
val currentAccumulatorInterval = new Date().getTime - UpdateTimeCount.getInstance(rdd.sparkContext).value
if(currentAccumulatorInterval>20000){
// More than 20 seconds elapsed: rebuild the broadcast, then advance the
// accumulator by the elapsed interval so it again holds the refresh time.
DimTable.update(rdd.sparkContext)
UpdateTimeCount.getInstance(rdd.sparkContext).add(currentAccumulatorInterval)
}
dimTable.value.map(print)
println()
})
2、Structured Streaming更新广播变量的方式
通过spark.streams.addListener()添加一个监听器,在Driver端当Listener的onQueryProgress()被触发时判断是否需要重新加载数据并更新广播变量:
public class CustStreamingQueryListener extends StreamingQueryListener {
private ConfigBean config;
private LongAccumulator nextUpdateTimeStamp = null;
private LongAccumulator curFileModifyTimeStamp = null;
private LoadResourceManager loadResourceManager;
private Logger logger;
public MyStreamingQueryListener(ConfigBean config, LongAccumulator nextUpdateTimeStamp,
LongAccumulator curFileModifyTimeStamp, LoadResourceManager loadResourceManager) {
this.config = config;
this.nextUpdateTimeStamp = nextUpdateTimeStamp;
this.curFileModifyTimeStamp = curFileModifyTimeStamp;
this.loadResourceManager = loadResourceManager;
this.logger = config.getSparkSession().log();
}
@Override
public void onQueryStarted(QueryStartedEvent event) {
}
// 这个方法不是线程安全的
@Override
public void onQueryProgress(QueryProgressEvent event) {
// 这里进行重新加载的逻辑
long currentTimeMillis = System.currentTimeMillis();
if(currentTimeMillis >= nextUpdateTimeStamp.value()) {
synchronized (MyStreamingQueryListener.class) {
if (currentTimeMillis >= nextUpdateTimeStamp.value()) {
String filepath = config.getAppProperties().getProperty("app.ipdim.filepath");
String crondate = config.getAppProperties().getProperty("app.ipdim.crondate");
try {
long lastModified = loadResourceManager.getHdfsLastModify(filepath);
String msg = DateUtil.sysTime() + " the time over, " +
"the new ip file midifytime is [" + DateUtil.format(new Date(lastModified)) + "]";
logger.warn(msg);
// 当从新读取到的文件的修改时间大于上一次文件的修改时间并且不是第一次加载
if (lastModified > curFileModifyTimeStamp.value() && curFileModifyTimeStamp.value() > 0) {
loadResourceManager.load(config, nextUpdateTimeStamp, curFileModifyTimeStamp);
}
// 计算下一次的加载的时间点
Date nextFireTimeStamp = loadResourceManager.getNextFireTimeStamp(crondate);
nextUpdateTimeStamp.setValue(nextFireTimeStamp.getTime());
} catch (IOException | ParseException e) {
e.printStackTrace();
}
}
}
}
}
@Override
public void onQueryTerminated(QueryTerminatedEvent event) {
}
}
这里是负责加载文件进行广播变量更新的地方 :
/**
 * Loads the IP-dimension file from HDFS and manages the broadcast variable
 * built from it. Lives on the driver; executors reach the broadcast through
 * {@link #getBroadcast()} via an instance field (never a static field, which
 * would not survive closure serialization).
 */
public class LoadResourceManager implements Serializable {

    // volatile: the reference is swapped by the listener thread and read by
    // driver-side query-planning threads.
    private volatile Broadcast<byte[]> broadcast;

    /**
     * Returns the current broadcast variable (the raw bytes of the IP file).
     * @return the active broadcast, or null before the first successful load
     */
    public Broadcast<byte[]> getBroadcast() {
        return broadcast;
    }

    /**
     * Releases the old broadcast's blocks on the executors (blocking until done).
     */
    public void unpersist() {
        broadcast.unpersist(true);
    }

    /**
     * Computes the next reload time after "now" from a cron expression.
     * @param crontab Quartz cron expression
     * @return the next fire time
     * @throws ParseException if the cron expression is invalid
     */
    public Date getNextFireTimeStamp(String crontab) throws ParseException {
        CronExpression cron = new CronExpression(crontab);
        return cron.getNextValidTimeAfter(new Date());
    }

    /**
     * Loads the IP library file from HDFS and (re)broadcasts its bytes.
     *
     * <p>Behavior by outcome:
     * <ul>
     *   <li>file missing on first load — throws, the app cannot start without it;</li>
     *   <li>file missing on a reload — alarms and keeps the old broadcast;</li>
     *   <li>file read OK — unpersists the old broadcast (if any) and broadcasts
     *       the new bytes.</li>
     * </ul>
     *
     * @param configBean             application configuration
     * @param nextUpdateTimeStamp    accumulator updated with the next reload time
     * @param curFileModifyTimeStamp accumulator updated with the file's mtime
     * @throws IOException    on HDFS access failure
     * @throws ParseException if the configured cron expression is invalid
     */
    public void load(ConfigBean configBean, LongAccumulator nextUpdateTimeStamp,
                     LongAccumulator curFileModifyTimeStamp) throws IOException, ParseException {
        String filepath = configBean.getAppProperties().getProperty("app.ipdim.filepath");
        String crontab = configBean.getAppProperties().getProperty("app.ipdim.crondate");
        SparkSession spark = configBean.getSparkSession();
        Logger logger = spark.log();
        // Remember the previous file mtime before readHdfsBinaryFile overwrites
        // the accumulator — it distinguishes "first load" from "reload".
        long lastFileModifyTimeStamp = curFileModifyTimeStamp.value();
        // Schedule the next reload time point.
        Date nextFireTimeStamp = getNextFireTimeStamp(crontab);
        nextUpdateTimeStamp.setValue(nextFireTimeStamp.getTime());
        // Read the IP file; null means it could not be read.
        byte[] bytes = readHdfsBinaryFile(filepath, curFileModifyTimeStamp);
        if (bytes == null && lastFileModifyTimeStamp <= 0) {
            // First load must succeed — fail fast at startup.
            throw new RuntimeException("firstly load ip file error, the ip file [" + filepath + "] is not exists");
        } else if (bytes == null && lastFileModifyTimeStamp > 0) {
            // Reload failed: alarm, keep serving the previous broadcast.
            String msg = "reload ip file error, the ip file [" + filepath + "] is not exists";
            logger.error(msg);
            String type = configBean.getConstantMap().get("process_exception").toArray()[0].toString();
            AlarmUtil.sendAlarmMsg(msg, configBean, type);
        } else if (bytes != null) {
            if (lastFileModifyTimeStamp > 0) {
                // Not the first load: release the stale broadcast first.
                String msg = DateUtil.sysTime() + " start release old ip database broadcast";
                logger.warn(msg);
                unpersist();
            }
            JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
            broadcast = jsc.broadcast(bytes);
            String msg = DateUtil.sysTime() + " load ip file [" + filepath + "], " +
                    "the ip file modifytime is [" + DateUtil.format(new Date(curFileModifyTimeStamp.value())) +
                    "], the next fire update is [" + DateUtil.format(new Date(nextUpdateTimeStamp.value())) + "]";
            logger.warn(msg);
        }
    }

    /**
     * Obtains an HDFS FileSystem handle from the default configuration.
     * @return the FileSystem instance
     * @throws IOException if the filesystem cannot be created
     */
    private FileSystem getFileSystem() throws IOException {
        Configuration configuration = new Configuration();
        return FileSystem.get(configuration);
    }

    /**
     * Reads the binary IP file fully into memory and records its modification
     * time into the accumulator.
     *
     * @param filePath               HDFS path of the file
     * @param curFileModifyTimeStamp accumulator receiving the file's mtime
     * @return the file's bytes, or null if the file is missing/unreadable
     */
    private byte[] readHdfsBinaryFile(String filePath, LongAccumulator curFileModifyTimeStamp) throws IOException {
        FSDataInputStream fsDataInputStream = null;
        ByteArrayOutputStream byteArrayOutputStream = null;
        FileSystem fileSystem = null;
        byte[] bytes = null;
        try {
            fileSystem = getFileSystem();
            Path path = new Path(filePath);
            FileStatus fileStatus = fileSystem.getFileStatus(path);
            // NOTE(review): getFileStatus typically throws FileNotFoundException
            // rather than returning null — that exception is caught below and
            // also yields a null result, so both paths behave the same.
            if (fileStatus == null) {
                return bytes;
            }
            long len = fileStatus.getLen();
            // Record the file's modification time for change detection.
            long modificationTime = fileStatus.getModificationTime();
            curFileModifyTimeStamp.setValue(modificationTime);
            // BUG FIX: the original opened a SECOND FileSystem here via
            // getFileSystem().open(path) and never closed it (leak). Reuse
            // the handle opened above instead.
            fsDataInputStream = fileSystem.open(path);
            byteArrayOutputStream = new ByteArrayOutputStream();
            IOUtils.copyBytes(fsDataInputStream, byteArrayOutputStream, len, false);
            bytes = byteArrayOutputStream.toByteArray();
        } catch (IOException e) {
            // Deliberately swallowed: a null return signals "file unreadable"
            // and load() decides whether that is fatal (first load) or not.
            e.printStackTrace();
        } finally {
            if (fsDataInputStream != null) {
                IOUtils.closeStream(fsDataInputStream);
            }
            if (byteArrayOutputStream != null) {
                byteArrayOutputStream.close();
            }
            // NOTE(review): FileSystem.get() may return a JVM-cached shared
            // instance; close() then closes it for all users. Confirm
            // fs.hdfs.impl.disable.cache is set, or this can break other readers.
            if (fileSystem != null) {
                fileSystem.close();
            }
        }
        return bytes;
    }

    /**
     * Returns the last modification time of an HDFS file.
     * @param filePath HDFS path of the file
     * @return the mtime in epoch millis, or 0 if it could not be read
     */
    public long getHdfsLastModify(String filePath) throws IOException {
        long lastmodify = 0L;
        FileSystem fileSystem = null;
        try {
            fileSystem = getFileSystem();
            Path path = new Path(filePath);
            FileStatus fileStatus = fileSystem.getFileStatus(path);
            lastmodify = fileStatus.getModificationTime();
        } catch (IOException e) {
            // Swallowed on purpose: 0 means "unknown", callers treat it as
            // "no newer file available".
            e.printStackTrace();
        } finally {
            // NOTE(review): see cache caveat in readHdfsBinaryFile.
            if (fileSystem != null) {
                fileSystem.close();
            }
        }
        return lastmodify;
    }
}
初始化广播变量的地方:
// Initialize the IP library broadcast on the driver at startup.
// curFileModifyTimeStamp: mtime of the loaded file (0 = not loaded yet).
// nextUpdateTimeStamp: epoch millis of the next scheduled reload.
LongAccumulator curFileModifyTimeStamp = sparkSession.sparkContext().longAccumulator("curFileModifyTimeStamp");
LongAccumulator nextUpdateTimeStamp = sparkSession.sparkContext().longAccumulator("nextUpdateTimeStamp");
// Load once at startup. (load() reads app.ipdim.filepath / app.ipdim.crondate
// from configBean itself — the duplicate local reads were removed as unused.)
LoadResourceManager loadResourceManager = new LoadResourceManager();
try {
    loadResourceManager.load(configBean, nextUpdateTimeStamp, curFileModifyTimeStamp);
} catch (IOException | ParseException e) {
    // NOTE(review): a first-load failure leaves the broadcast null and the
    // executors will NPE later — consider failing fast here instead.
    e.printStackTrace();
}
// Expose the broadcast updater to executors as an instance field of configBean.
configBean.setLoadResourceManager(loadResourceManager);
// Register the listener that refreshes the broadcast on query progress.
sparkSession.streams().addListener(new MyStreamingQueryListener(configBean, nextUpdateTimeStamp,
        curFileModifyTimeStamp, loadResourceManager));
Executor端使用广播变量的地方:
// NOTE: the broadcast is reached through an INSTANCE field of ConfigBean
// (via LoadResourceManager), not a static field — a static field would be
// null on the executors after closure serialization (see the note below).
byte[] value1 = configBean.getLoadResourceManager().getBroadcast().getValue();
// Build (or reuse, via singleton) the IP reader over the broadcast bytes.
AWReader awReader = SingletonAWReader.getInstance(value1);
// src_ip comes from the surrounding record-processing context.
InetAddress address = InetAddress.getByName(src_ip);
JsonNode record = awReader.get(address);
注意:
在使用广播变量的时候,我们需要将广播变量封装为一个类的普通成员变量(比如示例中的LoadResourceManager),通过方法形参的方式传递到我们需要使用的地方。否则如果直接将其声明为类的静态成员变量,spark在闭包序列化传输时只会引用该静态变量而不会初始化该广播变量,导致在使用的时候广播变量是null值。