1.线上Spark程序写数据到Redis,每天离线执行一次,跑完一次要3个小时
参数:
--conf
spark.executor.instances=10
--conf
spark.executor.cores=2
--conf
spark.executor.memory=4g
代码:
package com.kaishu.spark
import org.apache.spark.sql.SparkSession

import redis.clients.jedis.Jedis

import scala.util.control.NonFatal
object spark2redis_ads_service_personal_information_list_result_ed {

  /**
   * Daily batch job: computes yesterday's incremental rows of
   * kaishu_bigdata.ads_service_personal_information_list_result_ed
   * (rows in partition dt-1 that are absent from dt-2, via a LEFT JOIN
   * anti-join) and writes each row to Redis as a JSON-ish string keyed
   * by device_id.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("spark2redis_ads_service_personal_information_list_result_ed")
      .config("spark.sql.broadcastTimeout", 20 * 60)
      .config("spark.sql.crossJoin.enabled", true)
      .config("odps.exec.dynamic.partition.mode", "nonstrict")
      .getOrCreate()

    // Anti-join: keep only rows present in dt-1 but not in dt-2
    // (o2.device_id IS NULL after the LEFT JOIN means "no match yesterday-1").
    val result = spark.sql(
      s"""
         |SELECT o1.device_id
         | ,cast(o1.uuid as string) as uuid
         | ,o1.ip
         | ,o1.device_model
         | ,o1.network_type
         | ,o1.operator
         |FROM (
         | SELECT device_id
         | ,uuid
         | ,ip
         | ,device_model
         | ,network_type
         | ,operator
         | FROM kaishu_bigdata.ads_service_personal_information_list_result_ed
         | WHERE dt = DATE_SUB(current_date(),1)
         | ) o1
         |LEFT JOIN (
         | SELECT device_id
         | ,uuid
         | ,ip
         | ,device_model
         | ,network_type
         | ,operator
         | FROM kaishu_bigdata.ads_service_personal_information_list_result_ed
         | WHERE dt = DATE_SUB(current_date(),2)
         | ) o2
         |ON o1.device_id = o2.device_id
         |AND o1.uuid = o2.uuid
         |AND o1.ip = o2.ip
         |AND o1.device_model = o2.device_model
         |AND o1.network_type = o2.network_type
         |AND o1.operator = o2.operator
         |WHERE o2.device_id IS NULL
         |
         |""".stripMargin)

    // 400 partitions -> up to 400 concurrent Redis writers. The original 20
    // partitions bottlenecked the write stage (~3h); 400 brought it to ~12min
    // per the accompanying run notes.
    result.repartition(400).foreachPartition { rows =>
      // One connection per partition; closed in `finally` so a failure while
      // iterating cannot leak the socket.
      val jedis = new Jedis("redis-xxx.myhuaweicloud.com", 6379)
      jedis.auth("xxx")
      try {
        rows.foreach { row =>
          val deviceId    = row.getString(0)
          val uuid        = row.getString(1)
          val ip          = row.getString(2)
          val deviceModel = row.getString(3)
          val networkType = row.getString(4)
          val operator    = row.getString(5)
          try {
            // NOTE(review): uuid is emitted unquoted, so the value is valid JSON
            // only if uuid is numeric, and none of the fields are JSON-escaped —
            // confirm this format with the downstream consumers before changing it.
            jedis.set("personal_information_list:device_id:" + deviceId,
              "{\"uuid\":" + uuid + ",\"ip\":[\"" + ip + "\"]" + ",\"device_model\":\"" + deviceModel + "\",\"network_type\":\"" + networkType + "\",\"operator\":\"" + operator + "\"}")
          } catch {
            case NonFatal(e) =>
              // Log and continue: one bad row must not abort the whole partition.
              print(e)
              print(s"$deviceId,$uuid,$ip,$deviceModel,$networkType,$operator")
          }
        }
      } finally {
        jedis.close()
      }
    }

    spark.stop()
  }
}
查看YARN运行情况,发现写数据到Redis这个阶段特别慢:一共是10个executor,总共是20个task
优化思路就是增加并行写的能力:修改代码中重分区数为400(repartition(400)),并把executor数设置为20个,task数变为400,12分钟跑完了
看下日志