Parquet Performance Optimization When Generating a Large Wide Table with Spark (repost)

This post shows how to process user-behavior data with Spark SQL: reading a resource file, parsing user attributes, dynamically generating the DataFrame schema, and writing the result in Parquet format. The focus is on building user tags and tuning Spark configuration parameters.

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.io.Source

import com.google.common.base.Strings
import org.apache.commons.lang3.StringUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Delimiters of the source data: \001 separates fields, \002 separates
// feature entries, \003 separates a feature's key from its value.
private def CTRL_A = '\001'
private def CTRL_B = '\002'
private def CTRL_C = '\003'

def main(args: Array[String]): Unit = {
  val resourcePath = this.getClass.getResource("/resource.txt").getFile
  val sourcePath = this.getClass.getResource("/*.gz").getFile
  val output = "/home/dev/output"

  val conf = new SparkConf().setAppName("user test").setMaster("local")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
  sqlContext.setConf("spark.sql.inMemoryColumnarStorage.compressed", "true")
  sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

  // Build the tag dictionary and the schema once on the driver, then
  // broadcast both so every task can reuse them without reshipping.
  val map: Map[String, String] = buildResource(resourcePath)
  val schema = buildSchema(map)
  val bd = sc.broadcast(map)
  val bdSchema = sc.broadcast(schema)

  val start = System.currentTimeMillis()
  val rdd = sc.textFile(sourcePath)
    .map(line => {
      val user = buildUser(line, bd.value)
      buildRow(user._3, user._1, user._2)
    })
  // Alternative outputs, kept commented out:
  // rdd.foreach(_ => ())
  // sqlContext.createDataFrame(rdd, bdSchema.value).write.mode(SaveMode.Overwrite).json(output)
  sqlContext.createDataFrame(rdd, bdSchema.value).write.mode(SaveMode.Overwrite).parquet(output)
  val end = System.currentTimeMillis()
  System.out.print(end - start)
}
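For reference, buildUser below expects each (gzipped) source line to carry five CTRL_A-separated fields: cookie, platform, and three feature groups (base, interest, buy); within a group, entries are CTRL_B-separated and each entry joins key and value with CTRL_C. A minimal sketch of such a record, with made-up tag names t1/t2/t3:

// Hypothetical sample record; tag names and values are invented for illustration.
val sampleLine = Seq(
  "cookie-123",                                          // cookie
  "1",                                                   // platform
  "t1" + CTRL_C + "v1" + CTRL_B + "t2" + CTRL_C + "v2",  // base features
  "t3" + CTRL_C + "v3",                                  // interest features
  ""                                                     // buy features (may be empty)
).mkString(CTRL_A.toString)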

/**
 * Reads the resource file: one tag per line, tab-separated,
 * keeping only the first token as the tag key.
 * @param file path of the resource file
 * @return map from tag key to its default value "0"
 */
def buildResource(file: String): Map[String, String] = {
  val reader = Source.fromFile(file)
  val map = new mutable.HashMap[String, String]()
  for (line <- reader.getLines()) {
    val arr = StringUtils.splitPreserveAllTokens(line, '\t')
    map += ((arr(0), "0"))
  }
  reader.close()
  map.toMap
}
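Since buildResource keeps only the first tab-separated token of each line, a resource file as small as the following (hypothetical tag names, descriptions ignored) yields Map("t1" -> "0", "t2" -> "0", "t3" -> "0"):

t1	base tag
t2	base tag
t3	interest tag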

/**
 * Parses one input line into the user's attributes.
 * @param line raw input line
 * @param map  broadcast tag dictionary
 * @return (cookie, platform, tag -> "0"/"1" map)
 */
def buildUser(line: String, map: Map[String, String]): (String, Int, Map[String, String]) = {
  if (Strings.isNullOrEmpty(line)) {
    return ("", 0, Map.empty)
  }
  val array = StringUtils.splitPreserveAllTokens(line, CTRL_A)
  val cookie = if (Strings.isNullOrEmpty(array(0))) "-" else array(0)
  val platform = array(1).toInt
  val base = buildFeature(array(2))
  val interest = buildFeature(array(3))
  val buy = buildFeature(array(4))
  val features = base ++ interest ++ buy
  // Mark each dictionary tag as "1" if the user carries it, "0" otherwise.
  val result = new mutable.HashMap[String, String]()
  for (pair <- map) {
    val value = if (features.contains(pair._1)) "1" else "0"
    result += ((pair._1, value))
  }
  (cookie, platform, result.toMap)
}
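With the hypothetical sampleLine and three-tag resource file above, buildUser would return a tuple along these lines:

// ("cookie-123", 1, Map("t1" -> "1", "t2" -> "1", "t3" -> "1"))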

/**
 * Extracts the user's tag keys from one feature field.
 * @param expr CTRL_B-separated entries, each a CTRL_C-separated key/value pair
 * @return the tag keys found in the field
 */
def buildFeature(expr: String): Array[String] = {
  if (Strings.isNullOrEmpty(expr)) {
    return Array.empty
  }
  val arr = StringUtils.splitPreserveAllTokens(expr, CTRL_B)
  val buffer = new ArrayBuffer[String]()
  for (key <- arr) {
    val pair = StringUtils.splitPreserveAllTokens(key, CTRL_C)
    // Keep the raw tag key so it matches the resource map's keys;
    // the "_" prefix is applied only when naming schema columns.
    buffer += pair(0)
  }
  buffer.toArray
}

/**
 * Dynamically builds the DataFrame schema: the fixed user and platform
 * columns followed by one nullable integer column per tag.
 * @param map tag dictionary
 * @return the schema of the wide table
 */
def buildSchema(map: Map[String, String]): StructType = {
  val buffer = new ArrayBuffer[StructField]()
  buffer += StructField("user", StringType, false)
  buffer += StructField("platform", IntegerType, false)
  for (pair <- map) {
    buffer += StructField(s"_${pair._1}", IntegerType, true)
  }
  StructType(List(buffer: _*))
}
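For the same three hypothetical tags, buildSchema produces a StructType equivalent to the following (tag column order follows the map's iteration order):

StructType(List(
  StructField("user", StringType, false),
  StructField("platform", IntegerType, false),
  StructField("_t1", IntegerType, true),
  StructField("_t2", IntegerType, true),
  StructField("_t3", IntegerType, true)
))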

/**
 * Turns the user's attributes into a Spark SQL Row.
 * @param map      tag -> "0"/"1" map produced by buildUser
 * @param user     user cookie
 * @param platform platform id
 * @return a Row whose columns line up with buildSchema's output
 */
def buildRow(map: Map[String, String], user: String, platform: Int): Row = {
  val buffer = new ArrayBuffer[Any]()
  buffer += user
  buffer += platform
  // This map holds the same key set as the one buildSchema iterated, so the
  // two immutable maps share an iteration order and values align with columns.
  for (pair <- map) {
    buffer += pair._2.toInt
  }
  Row(buffer: _*)
}
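The SQLContext-based setup above targets Spark 1.x. On Spark 2.x the same Parquet-related settings can be applied through SparkSession instead; a minimal sketch, assuming the same app name and configuration keys:

// Spark 2.x equivalent of the configuration section of main().
val spark = org.apache.spark.sql.SparkSession.builder()
  .appName("user test")
  .master("local")
  .config("spark.sql.parquet.binaryAsString", "true")
  .config("spark.sql.inMemoryColumnarStorage.compressed", "true")
  .config("spark.sql.parquet.compression.codec", "snappy")
  .getOrCreate()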
