Reading Redis with a Spark RDD and writing to MaxCompute/Hive
Add the Maven dependency
<dependency>
    <groupId>com.redislabs</groupId>
    <artifactId>spark-redis</artifactId>
    <version>2.4.1</version>
</dependency>
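If the project is built with sbt rather than Maven, the equivalent dependency line would be the following (assuming the build targets Spark 2.4 on Scala 2.11, the combination spark-redis 2.4.x is published for):

// sbt equivalent of the Maven coordinates above
libraryDependencies += "com.redislabs" % "spark-redis" % "2.4.1"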
Spark code
import com.redislabs.provider.redis._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession, redis}
import org.apache.spark.storage.StorageLevel
object Redis2MaxCompute {
  def main(args: Array[String]): Unit = {
    println("Redis2MaxCompute just started")

    val spark = SparkSession
      .builder()
      .appName("Redis2MaxCompute")
      .config("spark.sql.broadcastTimeout", 20 * 60)
      .config("spark.sql.crossJoin.enabled", true)
      .config("odps.exec.dynamic.partition.mode", "nonstrict")
      .config("spark.sql.catalogImplementation", "odps")
      .config("spark.redis.host", "<redis host>")
      .config("spark.redis.port", "<redis port>")
      .config("spark.redis.auth", "<redis password>")   // Redis password
      .config("spark.redis.db", "<redis db index>")     // Redis database index
      .getOrCreate()

    val sc = spark.sparkContext
    val readWriteConf = ReadWriteConfig(scanCount = 1000, maxPipelineSize = 1000)

    // Fetch the RDD of keys matching the pattern
    println("retrieve keysRDD start")
    val keysRDD = sc.fromRedisKeyPattern("0*", 3000)(readWriteConfig = readWriteConf)
      .persist(StorageLevel.MEMORY_AND_DISK_SER_2)
    println("retrieve keysRDD number is: " + keysRDD.count())
    println("sampling: ")
    keysRDD.top(10).foreach(println(_))

    // Fetch the key-value RDD for those keys
    println("retrieve stringRDD start")
    val stringRDD = sc.fromRedisKV(keysRDD, 3000).persist(StorageLevel.MEMORY_AND_DISK_SER_2)
    println("retrieve stringRDD number is: " + stringRDD.count())
    println("sampling: ")
    stringRDD.top(10).foreach { kv =>
      val key = kv._1
      val value = kv._2
      println("sampling...")
      println("key: " + key)
      println("value: " + value)
    }

    // Convert each key-value pair into a Row for writing out
    val stringRowRDD = stringRDD.map { item =>
      val key = item._1
      val value = item._2
      Row.fromSeq(Array(key, value))
    }

    println("backupToMaxCompute...")
  }
}
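The sample only prints "backupToMaxCompute..." and leaves the actual write unimplemented. A minimal sketch of that step, placed right after the backupToMaxCompute println inside main, could look like the lines below. The table name backup_table_name is a hypothetical placeholder for a pre-created MaxCompute/Hive table with two string columns; the write goes through the odps catalog configured above.

// Minimal sketch of the missing write step (an assumption, not part of the original code).
// "backup_table_name" is a hypothetical, pre-created table with columns
// (redis_key STRING, redis_value STRING).
val schema = StructType(Seq(
  StructField("redis_key", StringType, nullable = true),
  StructField("redis_value", StringType, nullable = true)
))
val backupDF = spark.createDataFrame(stringRowRDD, schema)
backupDF.write
  .mode("append")                    // or "overwrite", depending on the backup strategy
  .insertInto("backup_table_name")   // target table must already exist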