方法1:
// Method 1: re-broadcast by hand.
// Parse each comma-separated line into (field 0, field 2 as Int) and drop duplicate pairs.
// The two statements of the lambda must be on separate lines: written on one line,
// `line.split(",") (arr(0), ...)` is parsed as an application on the split result.
val map = sc.textFile("/test.txt").map { line =>
  val arr = line.split(",")
  (arr(0), arr(2).toInt)
}.distinct()
// Broadcast the first 10 pairs, collected to the driver as a Map.
var mapBC = sc.broadcast(map.take(10).toMap)
// Release the copies of the old value before replacing it.
mapBC.unpersist()
// Rebind the variable to a fresh broadcast built from the first 2 pairs.
mapBC = sc.broadcast(map.take(2).toMap)
思考一:客户端需要使用广播变量时,如果该广播变量正在执行删除操作,就会因资源获取失败而报错。
解决方法:
1、将更新操作转换为事务:在新的广播变量更新完毕之前,所有操作需要的资源和数据仍按照原来的广播变量使用。
2、系统获取不到所需资源时进行等待,设置重试次数,并保持心跳。
方法2:
import java.io.{ObjectInputStream, ObjectOutputStream}
import com.bf.dt.wireless.config.WirelessConfig
import com.bf.dt.wireless.formator.WirelessFormator
import com.bf.dt.wireless.storage.MysqlConnectionPool
import com.bf.dt.wireless.utils.DateUtils
import kafka.serializer.StringDecoder
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.json4s._
import org.slf4j.LoggerFactory
import scala.collection.mutable
object WirelessLogAnalysis {
/**
 * Holder for a broadcast copy of the `cf_similarity` MySQL table that can be
 * replaced while the job is running.
 *
 * `getInstance` creates the broadcast on first use; `update` reloads the table
 * and swaps in a new broadcast. Both take the same lock on this object.
 */
object BroadcastWrapper {

  // Current broadcast handle; null until the first getInstance/update call.
  @volatile private var instance: Broadcast[Map[String, List[String]]] = null

  /**
   * Reads every row of `cf_similarity` and returns `aid_type -> aids`, where
   * `aids` is the comma-separated column split into a list.
   *
   * The result is built from scratch on every call, so keys that have been
   * deleted from the table do not linger from a previous load. The statement,
   * result set and pooled connection are released even when the query fails.
   *
   * @return an immutable snapshot of the table; for a repeated `aid_type` the
   *         last row read wins
   */
  def getMysql(): Map[String, List[String]] = {
    // 1. Borrow a connection from the MySQL connection pool.
    // NOTE(review): `.get` throws if the pool yields nothing — confirm that is acceptable.
    val conn = MysqlConnectionPool.getConnection.get
    try {
      // 2. Query the current data.
      val ps = conn.prepareStatement("select aid_type,aids from cf_similarity")
      try {
        val rs = ps.executeQuery()
        try {
          val rows = Map.newBuilder[String, List[String]]
          while (rs.next()) {
            val aid = rs.getString("aid_type")
            val aids = rs.getString("aids").split(",").toList
            rows += (aid -> aids)
          }
          rows.result()
        } finally {
          rs.close()
        }
      } finally {
        ps.close()
      }
    } finally {
      // 3. Hand the connection back to the pool, whether or not the query succeeded.
      MysqlConnectionPool.closeConnection(conn)
    }
  }

  /**
   * Reloads the table and replaces the current broadcast.
   *
   * The new broadcast is created and published before the old one is
   * unpersisted, so `instance` never refers to a value that is being removed,
   * and a failed reload leaves the previous broadcast untouched.
   *
   * @param sc       context used to create the new broadcast
   * @param blocking passed to `unpersist` on the broadcast being replaced
   */
  def update(sc: SparkContext, blocking: Boolean = false): Unit = synchronized {
    val previous = instance
    instance = sc.broadcast(getMysql())
    if (previous != null) {
      previous.unpersist(blocking)
    }
  }

  /**
   * Returns the current broadcast, loading it from MySQL on first use.
   *
   * Uses double-checked locking on the volatile `instance` field so the table
   * is loaded only once when several callers race on the first access.
   *
   * @param sc context used to create the broadcast if none exists yet
   * @return the current broadcast handle
   */
  def getInstance(sc: SparkContext): Broadcast[Map[String, List[String]]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = sc.broadcast(getMysql())
        }
      }
    }
    instance
  }

  // Java-serialization-style hooks that write/read the broadcast handle.
  // NOTE(review): this object does not extend Serializable in this file, so
  // these look unused — confirm before relying on or removing them.
  private def writeObject(out: ObjectOutputStream): Unit = {
    out.writeObject(instance)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    instance = in.readObject().asInstanceOf[Broadcast[Map[String, List[String]]]]
  }
}
/**
 * Entry point: consumes two Kafka topics as a direct stream in 10-second
 * batches, formats each message with `WirelessFormator.format`, and looks the
 * formatted records up in the broadcast `cf_similarity` map.
 *
 * Inside a fixed time-of-day window the broadcast is refreshed from MySQL on
 * the driver before each batch is processed.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val conf = new SparkConf()
    .setAppName("wirelessLogAnalysis")
  val ssc = new StreamingContext(conf, Seconds(10))

  // Resolve the configuration once instead of once per key.
  val wirelessConf = WirelessConfig.getConf.get
  val kafkaConfig: Map[String, String] = Map(
    "metadata.broker.list" -> wirelessConf.getString("wireless.metadata.broker.list"),
    "group.id" -> wirelessConf.getString("wireless.group.id"),
    "zookeeper.connect" -> wirelessConf.getString("wireless.zookeeper.connect"),
    "auto.offset.reset" -> wirelessConf.getString("wireless.auto.offset.reset")
  )
  val androidvvTopic = wirelessConf.getString("wireless.topic1")
  val iphonevvTopic = wirelessConf.getString("wireless.topic2")

  val kafkaDStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
    ssc,
    kafkaConfig,
    Set(androidvvTopic, iphonevvTopic)
  )
  // Print the raw log stream.
  kafkaDStream.print()

  // Format the value of each Kafka (key, value) pair.
  val jsonDstream = kafkaDStream.map(x => WirelessFormator.format(x._2))
  // Print the formatted log stream.
  jsonDstream.print()

  jsonDstream.foreachRDD { rdd =>
    // Driver side: initialise and refresh the broadcast variable.
    // The refresh window can be customised; the clock is read once so both
    // bounds are tested against the same instant.
    // NOTE(review): with a 10-second batch interval this reloads on every
    // batch inside the window — confirm that is intended.
    val timeOfDay = DateUtils.getNowTime().split(" ")(1)
    if (timeOfDay >= "08:00:00" && timeOfDay <= "10:10:00") {
      BroadcastWrapper.update(rdd.sparkContext, true)
      println("广播变量更新成功: " + DateUtils.getNowTime())
    }
    // Bind the current broadcast on the driver so the closure below captures
    // the handle; after a refresh this is the new value.
    val cf = BroadcastWrapper.getInstance(rdd.sparkContext)

    // Worker side: process the stream records (and, eventually, update Redis).
    rdd.foreachPartition { partitionRecords =>
      // 1. A Redis connection would be opened here, once per partition rather
      //    than once per record (not implemented in this file).
      partitionRecords.foreach { record =>
        // 2. Process the log record.
        val uid = record._1
        val aid_type = s"${record._2}_${record._3}"
        cf.value.get(aid_type) match {
          case found @ Some(_) =>
            println((uid, found))
          case None =>
            // No similarity entry for this key. The earlier version built
            // (uid, "-1") here and discarded it.
        }
        // 3. The Redis update would go here (not implemented in this file).
      }
      // 4. The Redis connection would be closed here.
    }
  }

  ssc.start()
  ssc.awaitTermination()
}
}