Requirement: read and write data in Redis in real time from Spark.
Approach: build a Jedis connection pool on each machine (executor), so every executor keeps its own connections to Redis and performs the operations locally.
1. Wrap the connection pool in a lazy val, so the pool is only created when it is first used on an executor and the non-serializable JedisPool never has to be shipped from the driver.
import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import redis.clients.jedis.{JedisPool, Protocol}

// Holds a factory function instead of the pool itself, so the class stays serializable;
// the pool is built lazily, i.e. on first access inside each executor JVM.
class RedisSink(makeJedisPool: () => JedisPool) extends Serializable {
  lazy val pool: JedisPool = makeJedisPool()
}

object RedisSink {
  def apply(redisHost: String, redisPort: Int, password: String, database: Int): RedisSink = {
    val createJedisPoolFunc = () => {
      val poolConfig = new GenericObjectPoolConfig()
      val pool = new JedisPool(poolConfig, redisHost, redisPort, Protocol.DEFAULT_TIMEOUT, password, database)
      // Destroy the pool when the executor JVM shuts down.
      sys.addShutdownHook {
        pool.destroy()
      }
      pool
    }
    new RedisSink(createJedisPoolFunc)
  }
}
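The code above uses the commons-pool2 defaults. If the defaults don't fit your workload, the usual GenericObjectPoolConfig setters can be applied before constructing the JedisPool; the values below are only illustrative, not recommendations:

// Illustrative pool tuning (placeholder values).
val poolConfig = new GenericObjectPoolConfig()
poolConfig.setMaxTotal(32)        // max connections per executor
poolConfig.setMaxIdle(8)          // max idle connections kept in the pool
poolConfig.setMinIdle(1)          // keep at least one connection warm
poolConfig.setTestOnBorrow(true)  // validate connections before handing them out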
2. When using it, broadcast the RedisSink to every executor as a broadcast variable.
val redisSink: Broadcast[RedisSink] = {
  sc.broadcast(RedisSink(redisHost, redisPort, redisPassword, redisDatabase))
}
val stream = DataUtils.getKafkaDataSource(ssc, topic, bootstrap, groupId)
stream.foreachRDD(recordRDD => {
  if (!recordRDD.isEmpty()) {
    recordRDD.foreachPartition(part => {
      // The lazy pool is created on first access inside this executor.
      val jedisPool = redisSink.value.pool
      val jedis = jedisPool.getResource
      try {
        // Pipeline the writes so the whole partition is sent in a few round trips.
        val pipeline = jedis.pipelined()
        part.foreach(x => {
          val message = x.value()
          val kv = message.split(",")
          val key = kv(0)
          val value = kv(1)
          // jedis.set(key, value)  // one network round trip per record
          pipeline.set(key, value)
        })
        pipeline.sync()
      } finally {
        // Return the connection to the pool.
        jedis.close()
      }
    })
  }
})
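The requirement also covers reading from Redis. Reads follow the same pattern: take a connection from the broadcast pool inside each partition and batch the commands with a pipeline. Below is a minimal sketch of the read side; keysRDD is a hypothetical RDD[String] of keys to look up, not something from the original code:

// Hypothetical read path: look up a batch of keys per partition via the same broadcast pool.
val values = keysRDD.mapPartitions(part => {
  val jedis = redisSink.value.pool.getResource
  try {
    val pipeline = jedis.pipelined()
    // Queue one GET per key; the responses are only materialized after sync().
    val responses = part.map(key => key -> pipeline.get(key)).toList
    pipeline.sync()
    responses.map { case (key, resp) => key -> resp.get() }.iterator
  } finally {
    // Return the connection to the pool before the iterator is consumed.
    jedis.close()
  }
})

As with the write path, pipelining keeps the number of round trips per partition small instead of paying one round trip per key.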