// Spark data-skew mitigation: add a random prefix to hot keys, pre-aggregate, then strip the prefix and aggregate globally
package spark.day03
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Demonstrates the classic "salting" technique for Spark data skew:
 *  1. run the naive skewed word-count,
 *  2. attach a random numeric prefix (0-3) to every word,
 *  3. pre-aggregate on the prefixed key (spreads the hot key over 4 partitions),
 *  4. strip the prefix and do the final global aggregation.
 */
object _07TestDataSkew {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("dataSkew")
      .getOrCreate()

    // Required for rdd1.toDF(...)
    import spark.implicits._

    // "a" is deliberately over-represented to simulate a skewed key.
    val rdd1: RDD[String] = spark.sparkContext.makeRDD(List(
      "a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a",
      "b,c,d,e,f",
      "b,b,c,c,d,e,f,g",
      "a,b,a,c,f"))
    val df: DataFrame = rdd1.toDF("line")

    // createOrReplaceTempView: safe to re-run in a session where "temp" already exists
    // (createTempView would throw AnalysisException on the second run).
    df.createOrReplaceTempView("temp")

    // Stage 0: naive word count — the hot key "a" lands on a single reducer.
    val sql =
      """
        |select word, count(1) cnt
        |from
        |(
        |select explode(split(line,",")) word
        |from temp) t1
        |group by t1.word
        |""".stripMargin

    println("------Step 1: prepend a random digit (0..3) to each word-----")
    val sql1 =
      """
        |select concat(floor(rand()*4),"-",word)
        |from
        |(
        |select explode(split(line,",")) word
        |from temp) t1
        |""".stripMargin

    println("----Step 2: pre-aggregate the prefixed words---")
    val sql2 =
      """
        |select prefix_word, count(1) cnt
        |from(
        |select concat(floor(rand()*4),"-",word) prefix_word
        |from
        |(
        |select explode(split(line,",")) word
        |from temp) t1
        |) t2
        |group by prefix_word
        |""".stripMargin

    println("----Step 3: strip the prefix and aggregate globally--")
    // substr(prefix_word, instr(prefix_word,"-")+1) recovers the original word
    // after the "-" separator; sum(num) merges the partial counts per word.
    val sql3 =
      """
        |select substr(prefix_word,instr(prefix_word,"-")+1) w, sum(num) total
        |from
        |(select prefix_word, count(1) num
        |from(
        |select concat(floor(rand()*4),"-",word) prefix_word
        |from(
        |select explode(split(line,",")) word
        |from temp) t1
        |) t2
        |group by prefix_word
        |) t3
        |group by w
        |""".stripMargin

    // Execute every stage so the printed banners match actual output.
    // (Previously only sql3 ran; sql/sql1/sql2 were dead code.)
    spark.sql(sql).show()
    spark.sql(sql1).show()
    spark.sql(sql2).show()
    spark.sql(sql3).show()

    spark.stop()
  }
}