Recently at work, while consolidating data, I needed to push 1 billion+ records into Redis. The ops team set up a Redis cluster for us: 100 masters + 100 replicas, 8 GB per node. Since the source data sits in Hive tables, the plan for the initial bulk load was to write it into Redis with Spark.
I. Dependency jars
compile group: 'com.redislabs', name: 'spark-redis', version: '2.3.0'
compile group: 'redis.clients', name: 'jedis', version: '2.9.0'
compile group: 'org.apache.commons', name: 'commons-pool2', version: '2.0'
I manage dependencies with Gradle; if you use Maven, the same coordinates can be found on the Maven repository site.
II. Test cases
1. Methods
sc.toRedisKV() stores plain key/value strings.
The implementation (from the spark-redis source):
def toRedisKV(kvs: RDD[(String, String)], ttl: Int = 0)
(implicit redisConfig: RedisConfig = new RedisConfig(new RedisEndpoint(sc.getConf))) {
kvs.foreachPartition(partition => setKVs(partition, ttl, redisConfig))
}
/**
 * Save all the k/vs to the target host.
 * @param arr k/vs which should be saved in the target host
 * @param ttl time to live
 */
def setKVs(arr: Iterator[(String, String)], ttl: Int, redisConfig: RedisConfig) {
// Group the k/vs by the cluster node that owns each key, then write each
// group through one pipelined connection to that node.
arr.map(kv => (redisConfig.getHost(kv._1), kv)).toArray.groupBy(_._1).
mapValues(a => a.map(p => p._2)).foreach {
x => {
val conn = x._1.endpoint.connect()
val pipeline = conn.pipelined
if (ttl <= 0) {
x._2.foreach(x => pipeline.set(x._1, x._2))
}
else {
x._2.foreach(x => pipeline.setex(x._1, ttl, x._2))
}
pipeline.sync
conn.close
}
}
}
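Note that setKVs groups the keys by their owning node and opens one pipeline per node, so a cluster write fans out naturally. A minimal usage sketch (the endpoint and sample data here are made up, not from the job below):

import com.redislabs.provider.redis._

// Hypothetical endpoint; substitute your own host/port/auth.
val cfg = new RedisConfig(RedisEndpoint("localhost", 6379, ""))
val kvs = sc.parallelize(Seq(("user:1", "Alice"), ("user:2", "Bob")))
sc.toRedisKV(kvs)(cfg)       // no expiry
sc.toRedisKV(kvs, 3600)(cfg) // each key expires after one hour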
sc.toRedisHASH() stores a hash map.
/**
* @param kvs Pair RDD of K/V
* @param hashName target hash's name which hold all the kvs
* @param ttl time to live
*/
def toRedisHASH(kvs: RDD[(String, String)], hashName: String, ttl: Int = 0)
(implicit redisConfig: RedisConfig = new RedisConfig(new RedisEndpoint(sc.getConf))) {
kvs.foreachPartition(partition => setHash(hashName, partition, ttl, redisConfig))
}
/**
 * Save all the k/vs to the hash hashName on the target host.
 * @param hashName target hash's name which holds all the kvs
 * @param arr k/vs which should be saved in the target host
 * @param ttl time to live
 */
def setHash(hashName: String, arr: Iterator[(String, String)], ttl: Int, redisConfig: RedisConfig) {
// One connection to the single node that owns hashName
val conn = redisConfig.connectionForKey(hashName)
val pipeline = conn.pipelined
arr.foreach(x => pipeline.hset(hashName, x._1, x._2))
if (ttl > 0) pipeline.expire(hashName, ttl)
pipeline.sync
conn.close
}
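Unlike toRedisKV, everything here goes through connectionForKey(hashName), i.e. the single cluster node that owns the hash, so one large hash will not spread across nodes. A minimal usage sketch (the hash key and fields are invented):

import com.redislabs.provider.redis._

val cfg = new RedisConfig(RedisEndpoint("localhost", 6379, ""))
val fields = sc.parallelize(Seq(("name", "Alice"), ("age", "30")))
sc.toRedisHASH(fields, "user:1001", 86400)(cfg) // whole hash expires after one day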
// To be covered in more detail later
sc.toRedisFixedLIST()
sc.toRedisLIST()
sc.toRedisSET()
sc.toRedisZSET()
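For reference, a rough sketch of how these are called, based on my reading of the spark-redis 2.3.0 source (verify the signatures against your jar before relying on them):

import com.redislabs.provider.redis._

val vs = sc.parallelize(Seq("a", "b", "c"))
sc.toRedisSET(vs, "mySet")                   // members of a set
sc.toRedisLIST(vs, "myList")                 // appended to a list
sc.toRedisFixedLIST(vs, "myCappedList", 100) // list trimmed to at most 100 elements
// ZSET takes (member, score) pairs; the score string is parsed as a double
sc.toRedisZSET(sc.parallelize(Seq(("a", "1.0"), ("b", "2.0"))), "myZset")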
2. Test
// Imports needed by this snippet. JSONObject is assumed to be fastjson's
// com.alibaba.fastjson.JSONObject (it has toJSONString); DateUtil and AESUtils are in-house helpers.
import com.alibaba.fastjson.JSONObject
import com.redislabs.provider.redis._
import org.apache.spark.sql.functions.to_timestamp
import spark.implicits._

val sc = spark.sparkContext
val activeRiskTableName = "hm_service_risk.test_active_risk_base_db"
val today = DateUtil.format(DateUtil.getDayBegin, "yyyyMMdd")
val dataFrame = spark.read.table(activeRiskTableName).filter($"stat_date".equalTo(today)).
select("id", "update_time", "risk").
withColumn("update_time", to_timestamp($"update_time", "yyyy-MM-dd HH:mm:ss"))
val redisHost = "localhost"
val redisPort: Int = 6379
val redisAuth: String = ""
val prefix = "active_risk:" // example key prefix (the original value wasn't shown)
val redisDataRdd = dataFrame.map(row => {
val id = row.getAs[String]("id")
// update_time was converted to a timestamp above, so read it back as one
val updateTime = row.getAs[java.sql.Timestamp]("update_time")
val risk = row.getAs[Int]("risk")
val redisKey = prefix + AESUtils.aesEncrypt(id)
val json = new JSONObject()
json.put("update_time", updateTime)
json.put("risk", risk)
(redisKey, json.toJSONString)
}).rdd
val redisConfig = new RedisConfig(RedisEndpoint(redisHost, redisPort, redisAuth))
sc.toRedisKV(redisDataRdd)(redisConfig)
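One caveat for the billion-row load mentioned at the top: setKVs opens a pipelined connection per target node inside every partition, so connection pressure on the cluster scales with the RDD's partition count. It may be worth bounding that before the write; a sketch (1000 partitions is an arbitrary example value):

// Cap write parallelism before pushing; tune the count to your cluster.
sc.toRedisKV(redisDataRdd.repartition(1000))(redisConfig)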
This was a stopgap task, so I haven't studied it in depth; I'll dig deeper and expand this post when I have time.