Parquet Performance Optimization When Generating a Large Wide Table with Spark (repost)

This post shows how to process user-behavior data with Spark SQL: reading a resource file, parsing user attributes, dynamically generating the DataFrame schema, and writing the result in Parquet format. The focus is on building user tags and tuning Spark configuration parameters.

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.io.Source

import com.google.common.base.Strings
import org.apache.commons.lang3.StringUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Delimiters of the source data: \001 separates fields, \002 separates
// feature entries, \003 separates a feature's key from its value.
private def CTRL_A = '\001'
private def CTRL_B = '\002'
private def CTRL_C = '\003'

def main(args: Array[String]): Unit = {
  val resourcePath = this.getClass.getResource("/resource.txt").getFile
  val sourcePath = this.getClass.getResource("/*.gz").getFile
  val output = "/home/dev/output"

  val conf = new SparkConf().setAppName("user test").setMaster("local")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
  sqlContext.setConf("spark.sql.inMemoryColumnarStorage.compressed", "true")
  sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

  // Build the tag dictionary and the schema once on the driver, then
  // broadcast both so every task can reuse them without reshipping.
  val map: Map[String, String] = buildResource(resourcePath)
  val schema = buildSchema(map)
  val bd = sc.broadcast(map)
  val bdSchema = sc.broadcast(schema)

  val start = System.currentTimeMillis()
  val rdd = sc.textFile(sourcePath)
    .map(line => {
      val user = buildUser(line, bd.value)
      buildRow(user._3, user._1, user._2)
    })
  // Alternative outputs, kept commented out:
  // rdd.foreach(_ => ())
  // sqlContext.createDataFrame(rdd, bdSchema.value).write.mode(SaveMode.Overwrite).json(output)
  sqlContext.createDataFrame(rdd, bdSchema.value).write.mode(SaveMode.Overwrite).parquet(output)
  val end = System.currentTimeMillis()
  System.out.print(end - start)
}
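For reference, buildUser below expects each (gzipped) source line to carry five CTRL_A-separated fields: cookie, platform, and three feature groups (base, interest, buy); within a group, entries are CTRL_B-separated and each entry joins key and value with CTRL_C. A minimal sketch of such a record, with made-up tag names t1/t2/t3:

// Hypothetical sample record; tag names and values are invented for illustration.
val sampleLine = Seq(
  "cookie-123",                                          // cookie
  "1",                                                   // platform
  "t1" + CTRL_C + "v1" + CTRL_B + "t2" + CTRL_C + "v2",  // base features
  "t3" + CTRL_C + "v3",                                  // interest features
  ""                                                     // buy features (may be empty)
).mkString(CTRL_A.toString)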

/**
 * Reads the resource file: one tag per line, tab-separated,
 * keeping only the first token as the tag key.
 * @param file path of the resource file
 * @return map from tag key to its default value "0"
 */
def buildResource(file: String): Map[String, String] = {
  val reader = Source.fromFile(file)
  val map = new mutable.HashMap[String, String]()
  for (line <- reader.getLines()) {
    val arr = StringUtils.splitPreserveAllTokens(line, '\t')
    map += ((arr(0), "0"))
  }
  reader.close()
  map.toMap
}
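Since buildResource keeps only the first tab-separated token of each line, a resource file as small as the following (hypothetical tag names, descriptions ignored) yields Map("t1" -> "0", "t2" -> "0", "t3" -> "0"):

t1	base tag
t2	base tag
t3	interest tag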

/**
 * Parses one input line into the user's attributes.
 * @param line raw input line
 * @param map  broadcast tag dictionary
 * @return (cookie, platform, tag -> "0"/"1" map)
 */
def buildUser(line: String, map: Map[String, String]): (String, Int, Map[String, String]) = {
  if (Strings.isNullOrEmpty(line)) {
    return ("", 0, Map.empty)
  }
  val array = StringUtils.splitPreserveAllTokens(line, CTRL_A)
  val cookie = if (Strings.isNullOrEmpty(array(0))) "-" else array(0)
  val platform = array(1).toInt
  val base = buildFeature(array(2))
  val interest = buildFeature(array(3))
  val buy = buildFeature(array(4))
  val features = base ++ interest ++ buy
  // Mark each dictionary tag as "1" if the user carries it, "0" otherwise.
  val result = new mutable.HashMap[String, String]()
  for (pair <- map) {
    val value = if (features.contains(pair._1)) "1" else "0"
    result += ((pair._1, value))
  }
  (cookie, platform, result.toMap)
}
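With the hypothetical sampleLine and three-tag resource file above, buildUser would return a tuple along these lines:

// ("cookie-123", 1, Map("t1" -> "1", "t2" -> "1", "t3" -> "1"))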

/**
 * Extracts the user's tag keys from one feature field.
 * @param expr CTRL_B-separated entries, each a CTRL_C-separated key/value pair
 * @return the tag keys found in the field
 */
def buildFeature(expr: String): Array[String] = {
  if (Strings.isNullOrEmpty(expr)) {
    return Array.empty
  }
  val arr = StringUtils.splitPreserveAllTokens(expr, CTRL_B)
  val buffer = new ArrayBuffer[String]()
  for (key <- arr) {
    val pair = StringUtils.splitPreserveAllTokens(key, CTRL_C)
    // Keep the raw tag key so it matches the resource map's keys;
    // the "_" prefix is applied only when naming schema columns.
    buffer += pair(0)
  }
  buffer.toArray
}

/**
 * Dynamically builds the DataFrame schema: the fixed user and platform
 * columns followed by one nullable integer column per tag.
 * @param map tag dictionary
 * @return the schema of the wide table
 */
def buildSchema(map: Map[String, String]): StructType = {
  val buffer = new ArrayBuffer[StructField]()
  buffer += StructField("user", StringType, false)
  buffer += StructField("platform", IntegerType, false)
  for (pair <- map) {
    buffer += StructField(s"_${pair._1}", IntegerType, true)
  }
  StructType(List(buffer: _*))
}
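For the same three hypothetical tags, buildSchema produces a StructType equivalent to the following (tag column order follows the map's iteration order):

StructType(List(
  StructField("user", StringType, false),
  StructField("platform", IntegerType, false),
  StructField("_t1", IntegerType, true),
  StructField("_t2", IntegerType, true),
  StructField("_t3", IntegerType, true)
))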

/**
 * Turns the user's attributes into a Spark SQL Row.
 * @param map      tag -> "0"/"1" map produced by buildUser
 * @param user     user cookie
 * @param platform platform id
 * @return a Row whose columns line up with buildSchema's output
 */
def buildRow(map: Map[String, String], user: String, platform: Int): Row = {
  val buffer = new ArrayBuffer[Any]()
  buffer += user
  buffer += platform
  // This map holds the same key set as the one buildSchema iterated, so the
  // two immutable maps share an iteration order and values align with columns.
  for (pair <- map) {
    buffer += pair._2.toInt
  }
  Row(buffer: _*)
}
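The SQLContext-based setup above targets Spark 1.x. On Spark 2.x the same Parquet-related settings can be applied through SparkSession instead; a minimal sketch, assuming the same app name and configuration keys:

// Spark 2.x equivalent of the configuration section of main().
val spark = org.apache.spark.sql.SparkSession.builder()
  .appName("user test")
  .master("local")
  .config("spark.sql.parquet.binaryAsString", "true")
  .config("spark.sql.inMemoryColumnarStorage.compressed", "true")
  .config("spark.sql.parquet.compression.codec", "snappy")
  .getOrCreate()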
