上一节实践了将DSP广告数据平台的日志文件转换成parquet文件的第二种方案。
现在主要对该工程进行代码封装优化。将来如果使用的是Spark 1.6+,可以使用样例类来封装数据。优化的代码主要如下:
代码封装优化
1、首先,编写封装创建sparkContext实例
dolphin-doit01\src\main\scala\cn\sheep\dolphin\common\DolphinAppComm.scala
package cn.sheep.dolphin.common
import cn.sheep.dolphin.config.DolphinConfig
import org.apache.spark.{SparkConf, SparkContext}
import cn.sheep.dolphin.common.DolphinEnum._
/**
* author: old sheep
* Created 2021/04/03 11:24
*/
object DolphinAppComm {

  /**
   * Creates a [[SparkContext]] for the given application.
   *
   * The master is set to `local[*]` only when `DolphinConfig._power` equals the
   * name of the `local` enumeration value; otherwise no master is set here.
   * Caller-supplied settings are applied after the run-mode switch.
   *
   * @param appName application name passed to `SparkConf.setAppName`
   * @param params  extra Spark settings copied onto the configuration key by key;
   *                defaults to an empty map
   * @return a new `SparkContext` built from the assembled configuration
   */
  def createSparkContext(appName: String, params: Map[String, String] = Map.empty): SparkContext = {
    val sparkConf = new SparkConf()
    sparkConf.setAppName(appName)

    // Run-mode switch: only force a local master when the config says "local".
    if (DolphinConfig._power == local.toString) {
      sparkConf.setMaster("local[*]")
    }

    // Apply the settings passed in by the caller.
    params.foreach { case (key, value) => sparkConf.set(key, value) }

    new SparkContext(sparkConf)
  }
}
2、配置全局文件
dolphin-doit01\src\main\resources\application.conf
# Run-mode switch: controls whether the program runs locally or on a cluster (cluster | local)
dolphin.common.power=local
# MySQL connection settings
dolphin.mysql.driver=com.mysql.jdbc.Driver
dolphin.mysql.url="jdbc:mysql:///spark?characterEncoding=utf-8"
dolphin.mysql.username=root
dolphin.mysql.password=123456
3、定义枚举类
dolphin-doit01\src\main\scala\cn\sheep\dolphin\common\DolphinEnum.scala
package cn.sheep.dolphin.common
/**
* author: old sheep
* Created 2021/04/03 11:31
*/
object DolphinEnum extends Enumeration {
  // Alias so callers can refer to the value type as `DolphinEnum`.
  type DolphinEnum = Value

  // Run modes. Declaration order fixes the ids: local = 0, cluster = 1.
  val local = Value
  val cluster = Value
}
4、编写解析application.conf配置文件
dolphin-doit01\src\main\scala\cn\sheep\dolphin\config\DolphinConfig.scala
package cn.sheep.dolphin.config
import java.util.Properties
import com.typesafe.config.{Config, ConfigFactory}
/**
* 解析application.conf配置文件
* author: old sheep
* Created 2021/04/03 11:29
*/
object DolphinConfig {
  // Loaded eagerly: the vals below read from it during object initialization anyway,
  // so a `lazy val` would only add synchronization overhead.
  private val application: Config = ConfigFactory.load()

  /* Run-mode switch */
  val _power: String = application.getString("dolphin.common.power")

  /* MySQL connection settings */
  val _driver: String = application.getString("dolphin.mysql.driver")
  val _url: String = application.getString("dolphin.mysql.url")
  val _username: String = application.getString("dolphin.mysql.username")
  val _password: String = application.getString("dolphin.mysql.password")

  /*
   * MySQL settings packaged as java.util.Properties (driver, user, password).
   * The URL is not stored here; it is exposed separately as `_url`.
   */
  val props: Properties = {
    val p = new Properties()
    p.setProperty("driver", _driver)
    p.setProperty("user", _username)
    p.setProperty("password", _password)
    p
  }
}
优化Bz2Parquet类为:
// Read the offline data file.
// Previously: val sc = new SparkContext("")
val sc = DolphinAppComm.createSparkContext("将bz2日志文件转换成parquet文件")
// Read the offline bz2 log file
val data = sc.textFile(bz2InputPath)
// Drop malformed records: split each line on "," (limit -1 keeps trailing empty fields)
// and keep only rows that have at least 85 fields.
val filteredRDD: RDD[Array[String]] = data.map(_.split(",", -1)).filter(_.size >= 85)
// parquet <- DataFrame (several ways to create one) <- SQLContext <- RDD
val sqlc = new SQLContext(sc)
优化Bz2ParquetV2类为:
// Spark tuning parameters: serializer and parquet compression codec
val sparkParams = Map[String, String](
"spark.serializer" -> classOf[KryoSerializer].getName,
"spark.sql.parquet.compression.codec" -> "snappy" // compression codec used for parquet files
)
val sc = DolphinAppComm.createSparkContext("将bz2日志文件转换成parquet文件",sparkParams)
parquet文件分区操作
在Bz2ParquetV2类中,使用partitionBy对数据按照省份进行分区:
// partitionBy: partition the written parquet output by the "provincename" column (province)
dataFrame.write.partitionBy("provincename").parquet(parquetOutPath)
运行Bz2ParquetV2类程序,控制台打印输出日志信息:
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=江西省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=河北省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=河南省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=浙江省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=海南省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=宁夏回族自治区
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=湖北省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=安徽省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=山东省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=山西省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=福建省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=广东省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=西藏自治区
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=湖南省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=贵州省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=辽宁省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=上海市
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=重庆市
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=云南省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=陕西省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=青海省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=香港特别行政区
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=内蒙古自治区
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=北京市
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=台湾省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=吉林省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=四川省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=天津市
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=澳门特别行政区
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=广西壮族自治区
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=新疆维吾尔自治区
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=甘肃省
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=未知
21/04/03 12:30:46 INFO HadoopFsRelation: Listing file:/C:/Users/mrman/dolphin-doit01/parquet/provincename=黑龙江省
可以在自己磁盘的parquet输出目录中查看按省份生成的分区目录。