Recently at work, while consolidating data, I needed to push 1 billion+ records into Redis. The ops team set up a Redis cluster for us: 100 masters + 100 replicas, 8 GB per node. Since the source data sits in Hive tables, the plan for the initial bulk load was to write it into Redis with Spark.
I. Dependency jars
compile group: 'com.redislabs', name: 'spark-redis', version: '2.3.0'
compile group: 'redis.clients', name: 'jedis', version: '2.9.0'
compile group: 'org.apache.commons', name: 'commons-pool2', version: '2.0'
I manage dependencies with Gradle; if you use Maven, the same coordinates can be found on the Maven repository site.
II. Test cases
1. Methods
sc.toRedisKV() stores plain key/value strings.
The implementation (from the spark-redis source):
def toRedisKV(kvs: RDD[(String, String)], ttl: Int = 0)
(implicit redisConfig: RedisConfig = new RedisConfig(new RedisEndpoint(sc.getConf))) {
kvs.foreachPartition(partition => setKVs(partition, ttl, redisConfig))
}
/**
 * Save all the k/vs to the target host.
 * @param arr k/vs which should be saved in the target host
 * @param ttl time to live
 */
def setKVs(arr: Iterator[(String, String)], ttl: Int, redisConfig: RedisConfig) {
// Group the k/vs by the cluster node that owns each key, then write each
// group through one pipelined connection to that node.
arr.map(kv => (redisConfig.getHost(kv._1), kv)).toArray.groupBy(_._1).
mapValues(a => a.map(p => p._2)).foreach {
x => {
val conn = x._1.endpoint.connect()
val pipeline = conn.pipelined
if (ttl <= 0) {
x._2.foreach(x => pipeline.set(x._1, x._2))
}
else {
x._2.foreach(x => pipeline.setex(x._1, ttl, x._2))
}
pipeline.sync
conn.close
}
}
}
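Note that setKVs groups the keys by their owning node and opens one pipeline per node, so a cluster write fans out naturally. A minimal usage sketch (the endpoint and sample data here are made up, not from the job below):

import com.redislabs.provider.redis._

// Hypothetical endpoint; substitute your own host/port/auth.
val cfg = new RedisConfig(RedisEndpoint("localhost", 6379, ""))
val kvs = sc.parallelize(Seq(("user:1", "Alice"), ("user:2", "Bob")))
sc.toRedisKV(kvs)(cfg)       // no expiry
sc.toRedisKV(kvs, 3600)(cfg) // each key expires after one hour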
sc.toRedisHASH() stores a hash map.
/**
* @param kvs Pair RDD of K/V
* @param hashName target hash's name which hold all the kvs
* @param ttl time to live
*/
def toRedisHASH(kvs: RDD[(String, String)], hashName: String, ttl: Int = 0)
(implicit redisConfig: RedisConfig = new RedisConfig(new RedisEndpoint(sc.getConf))) {
kvs.foreachPartition(partition => setHash(hashName, partition, ttl, redisConfig))
}
/**
 * Save all the k/vs to the hash hashName on the target host.
 * @param hashName target hash's name which holds all the kvs
 * @param arr k/vs which should be saved in the target host
 * @param ttl time to live
 */
def setHash(hashName: String, arr: Iterator[(String, String)], ttl: Int, redisConfig: RedisConfig) {
// One connection to the single node that owns hashName
val conn = redisConfig.connectionForKey(hashName)
val pipeline = conn.pipelined
arr.foreach(x => pipeline.hset(hashName, x._1, x._2))
if (ttl > 0) pipeline.expire(hashName, ttl)
pipeline.sync
conn.close
}
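Unlike toRedisKV, everything here goes through connectionForKey(hashName), i.e. the single cluster node that owns the hash, so one large hash will not spread across nodes. A minimal usage sketch (the hash key and fields are invented):

import com.redislabs.provider.redis._

val cfg = new RedisConfig(RedisEndpoint("localhost", 6379, ""))
val fields = sc.parallelize(Seq(("name", "Alice"), ("age", "30")))
sc.toRedisHASH(fields, "user:1001", 86400)(cfg) // whole hash expires after one day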
// To be covered in more detail later
sc.toRedisFixedLIST()
sc.toRedisLIST()
sc.toRedisSET()
sc.toRedisZSET()
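For reference, a rough sketch of how these are called, based on my reading of the spark-redis 2.3.0 source (verify the signatures against your jar before relying on them):

import com.redislabs.provider.redis._

val vs = sc.parallelize(Seq("a", "b", "c"))
sc.toRedisSET(vs, "mySet")                   // members of a set
sc.toRedisLIST(vs, "myList")                 // appended to a list
sc.toRedisFixedLIST(vs, "myCappedList", 100) // list trimmed to at most 100 elements
// ZSET takes (member, score) pairs; the score string is parsed as a double
sc.toRedisZSET(sc.parallelize(Seq(("a", "1.0"), ("b", "2.0"))), "myZset")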
2. Test
// Imports needed by this snippet. JSONObject is assumed to be fastjson's
// com.alibaba.fastjson.JSONObject (it has toJSONString); DateUtil and AESUtils are in-house helpers.
import com.alibaba.fastjson.JSONObject
import com.redislabs.provider.redis._
import org.apache.spark.sql.functions.to_timestamp
import spark.implicits._

val sc = spark.sparkContext
val activeRiskTableName = "hm_service_risk.test_active_risk_base_db"
val today = DateUtil.format(DateUtil.getDayBegin, "yyyyMMdd")
val dataFrame = spark.read.table(activeRiskTableName).filter($"stat_date".equalTo(today)).
select("id", "update_time", "risk").
withColumn("update_time", to_timestamp($"update_time", "yyyy-MM-dd HH:mm:ss"))
val redisHost = "localhost"
val redisPort: Int = 6379
val redisAuth: String = ""
val prefix = "active_risk:" // example key prefix (the original value wasn't shown)
val redisDataRdd = dataFrame.map(row => {
val id = row.getAs[String]("id")
// update_time was converted to a timestamp above, so read it back as one
val updateTime = row.getAs[java.sql.Timestamp]("update_time")
val risk = row.getAs[Int]("risk")
val redisKey = prefix + AESUtils.aesEncrypt(id)
val json = new JSONObject()
json.put("update_time", updateTime)
json.put("risk", risk)
(redisKey, json.toJSONString)
}).rdd
val redisConfig = new RedisConfig(RedisEndpoint(redisHost, redisPort, redisAuth))
sc.toRedisKV(redisDataRdd)(redisConfig)
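One caveat for the billion-row load mentioned at the top: setKVs opens a pipelined connection per target node inside every partition, so connection pressure on the cluster scales with the RDD's partition count. It may be worth bounding that before the write; a sketch (1000 partitions is an arbitrary example value):

// Cap write parallelism before pushing; tune the count to your cluster.
sc.toRedisKV(redisDataRdd.repartition(1000))(redisConfig)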
This was a stopgap task, so I haven't studied it in depth; I'll dig deeper and expand this post when I have time.