import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.io.Source

// Assumes Guava for Strings and commons-lang3 for StringUtils; adjust if the
// project uses commons-lang 2.x (org.apache.commons.lang) instead
import com.google.common.base.Strings
import org.apache.commons.lang3.StringUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Delimiters separating fields, features, and feature components in the input
private def CTRL_A = '\001'
private def CTRL_B = '\002'
private def CTRL_C = '\003'

def main(args: Array[String]): Unit = {
  val resourcePath = this.getClass.getResource("/resource.txt").getFile
  // NB: getResource does not expand globs; in practice this would need to be a
  // concrete path or a glob handed directly to sc.textFile
  val sourcePath = this.getClass.getResource("/*.gz").getFile
  val output = "/home/dev/output"

  val conf = new SparkConf().setAppName("user test").setMaster("local")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
  sqlContext.setConf("spark.sql.inMemoryColumnarStorage.compressed", "true")
  sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

  // Build the tag map and schema once on the driver, then broadcast them
  val map: Map[String, String] = buildResource(resourcePath)
  val schema = buildSchema(map)
  val bd = sc.broadcast(map)
  val bdSchema = sc.broadcast(schema)

  val start = System.currentTimeMillis()
  val rdd = sc.textFile(sourcePath)
    .map(line => {
      val user = buildUser(line, bd.value)
      buildRow(user._3, user._1, user._2)
    })
  // rdd.foreach(_ => ())
  // sqlContext.createDataFrame(rdd, bdSchema.value).write.mode(SaveMode.Overwrite).json(output)
  sqlContext.createDataFrame(rdd, bdSchema.value).write.mode(SaveMode.Overwrite).parquet(output)
  val end = System.currentTimeMillis()
  System.out.print(end - start)
}
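
// Input line layout this job appears to assume, inferred from buildUser and
// buildFeature below (field names are illustrative, not from an original spec):
//
//   cookie \001 platform \001 base \001 interest \001 buy
//
// Each of base/interest/buy is a \002-separated list of features, and each
// feature is a \003-separated pair whose first token is the tag id.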

/**
 * Read the resource file of tag names
 * @param file path of the resource file
 * @return map from tag name to the default value "0"
 */
def buildResource(file: String): Map[String, String] = {
  val reader = Source.fromFile(file)
  val map = new mutable.HashMap[String, String]()
  for (line <- reader.getLines()) {
    val arr = StringUtils.splitPreserveAllTokens(line, '\t')
    map += ((arr(0), "0"))
  }
  reader.close()
  map.toMap
}
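
// For illustration (hypothetical resource.txt contents, one tag per line with
// the tag name in the first tab-separated column):
//
//   10 \t ...
//   20 \t ...
//
// buildResource would return Map("10" -> "0", "20" -> "0"): every known tag
// seeded with the default value "0".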

/**
 * Build the user's attributes from one input line
 * @param line raw input line
 * @param map tag map produced by buildResource
 * @return (cookie, platform, map from tag name to "0"/"1")
 */
def buildUser(line: String, map: Map[String, String]): (String, Int, Map[String, String]) = {
  if (Strings.isNullOrEmpty(line)) {
    return ("", 0, Map.empty)
  }
  val array = StringUtils.splitPreserveAllTokens(line, CTRL_A)
  val cookie = if (Strings.isNullOrEmpty(array(0))) "-" else array(0)
  val platform = array(1).toInt
  val base = buildFeature(array(2))
  val interest = buildFeature(array(3))
  val buy = buildFeature(array(4))
  val features = base ++ interest ++ buy
  val result = new mutable.HashMap[String, String]()
  for (pair <- map) {
    val value = if (features.contains(pair._1)) "1" else "0"
    result += ((pair._1, value))
  }
  (cookie, platform, result.toMap)
}
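
// Each tag from the resource map becomes "1" if it appears among the user's
// "_"-prefixed features and "0" otherwise, so every user carries the same
// full set of tag keys -- which is what lets buildRow emit a fixed-width Row
// matching the schema.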

/**
 * Extract user tags from one feature expression
 * @param expr feature expression from the input line
 * @return tag names prefixed with "_"
 */
def buildFeature(expr: String): Array[String] = {
  if (Strings.isNullOrEmpty(expr)) {
    return Array.empty
  }
  val arr = StringUtils.splitPreserveAllTokens(expr, CTRL_B)
  val buffer = new ArrayBuffer[String]()
  for (key <- arr) {
    val pair = StringUtils.splitPreserveAllTokens(key, CTRL_C)
    buffer += s"_${pair(0)}"
  }
  buffer.toArray
}
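
// For illustration: buildFeature("10\u0003x\u000220\u0003y") returns
// Array("_10", "_20") -- the first \u0003-separated token of each
// \u0002-separated feature, prefixed with "_" (presumably so the generated
// column names never start with a digit).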

/**
 * Dynamically build the DataFrame schema
 * @param map tag map produced by buildResource
 * @return schema with fixed user/platform columns plus one nullable column per tag
 */
def buildSchema(map: Map[String, String]): StructType = {
  val buffer = new ArrayBuffer[StructField]()
  buffer += StructField("user", StringType, false)
  buffer += StructField("platform", IntegerType, false)
  for (pair <- map) {
    buffer += StructField(s"_${pair._1}", IntegerType, true)
  }
  StructType(List(buffer: _*))
}
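
// Note: buildSchema iterates the broadcast tag map while buildRow iterates the
// per-user map from buildUser. The row values line up with the schema only if
// both maps yield the same key order; with hash-ordered immutable maps over an
// identical key set this happens to hold, but iterating a single sorted key
// list in both places would make the ordering explicit.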

/**
 * Build a Spark SQL Row from the user's attributes
 * @param map tag map produced by buildUser
 * @param user user cookie
 * @param platform platform id
 * @return row whose field order matches the schema from buildSchema
 */
def buildRow(map: Map[String, String], user: String, platform: Int): Row = {
  val buffer = new ArrayBuffer[Any]()
  buffer += user
  buffer += platform
  for (pair <- map) {
    buffer += pair._2.toInt
  }
  Row(buffer: _*)
}
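
// Once the job has run, the Parquet output can be sanity-checked with, e.g.:
//
//   val df = sqlContext.read.parquet("/home/dev/output")
//   df.printSchema()  // user: string, platform: int, one nullable int per "_" tag
//   df.show(5)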