import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.io.Source

// Assumes Guava for Strings and commons-lang3 for StringUtils; adjust if the
// project uses commons-lang 2.x (org.apache.commons.lang) instead
import com.google.common.base.Strings
import org.apache.commons.lang3.StringUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Delimiters separating fields, features, and feature components in the input
private def CTRL_A = '\001'
private def CTRL_B = '\002'
private def CTRL_C = '\003'

def main(args: Array[String]): Unit = {
  val resourcePath = this.getClass.getResource("/resource.txt").getFile
  // NB: getResource does not expand globs; in practice this would need to be a
  // concrete path or a glob handed directly to sc.textFile
  val sourcePath = this.getClass.getResource("/*.gz").getFile
  val output = "/home/dev/output"

  val conf = new SparkConf().setAppName("user test").setMaster("local")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")
  sqlContext.setConf("spark.sql.inMemoryColumnarStorage.compressed", "true")
  sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

  // Build the tag map and schema once on the driver, then broadcast them
  val map: Map[String, String] = buildResource(resourcePath)
  val schema = buildSchema(map)
  val bd = sc.broadcast(map)
  val bdSchema = sc.broadcast(schema)

  val start = System.currentTimeMillis()
  val rdd = sc.textFile(sourcePath)
    .map(line => {
      val user = buildUser(line, bd.value)
      buildRow(user._3, user._1, user._2)
    })
  // rdd.foreach(_ => ())
  // sqlContext.createDataFrame(rdd, bdSchema.value).write.mode(SaveMode.Overwrite).json(output)
  sqlContext.createDataFrame(rdd, bdSchema.value).write.mode(SaveMode.Overwrite).parquet(output)
  val end = System.currentTimeMillis()
  System.out.print(end - start)
}
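
// Input line layout this job appears to assume, inferred from buildUser and
// buildFeature below (field names are illustrative, not from an original spec):
//
//   cookie \001 platform \001 base \001 interest \001 buy
//
// Each of base/interest/buy is a \002-separated list of features, and each
// feature is a \003-separated pair whose first token is the tag id.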

/**
 * Read the resource file of tag names
 * @param file path of the resource file
 * @return map from tag name to the default value "0"
 */
def buildResource(file: String): Map[String, String] = {
  val reader = Source.fromFile(file)
  val map = new mutable.HashMap[String, String]()
  for (line <- reader.getLines()) {
    val arr = StringUtils.splitPreserveAllTokens(line, '\t')
    map += ((arr(0), "0"))
  }
  reader.close()
  map.toMap
}
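
// For illustration (hypothetical resource.txt contents, one tag per line with
// the tag name in the first tab-separated column):
//
//   10 \t ...
//   20 \t ...
//
// buildResource would return Map("10" -> "0", "20" -> "0"): every known tag
// seeded with the default value "0".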

/**
 * Build the user's attributes from one input line
 * @param line raw input line
 * @param map tag map produced by buildResource
 * @return (cookie, platform, map from tag name to "0"/"1")
 */
def buildUser(line: String, map: Map[String, String]): (String, Int, Map[String, String]) = {
  if (Strings.isNullOrEmpty(line)) {
    return ("", 0, Map.empty)
  }
  val array = StringUtils.splitPreserveAllTokens(line, CTRL_A)
  val cookie = if (Strings.isNullOrEmpty(array(0))) "-" else array(0)
  val platform = array(1).toInt
  val base = buildFeature(array(2))
  val interest = buildFeature(array(3))
  val buy = buildFeature(array(4))
  val features = base ++ interest ++ buy
  val result = new mutable.HashMap[String, String]()
  for (pair <- map) {
    val value = if (features.contains(pair._1)) "1" else "0"
    result += ((pair._1, value))
  }
  (cookie, platform, result.toMap)
}
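
// Each tag from the resource map becomes "1" if it appears among the user's
// "_"-prefixed features and "0" otherwise, so every user carries the same
// full set of tag keys -- which is what lets buildRow emit a fixed-width Row
// matching the schema.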

/**
 * Extract user tags from one feature expression
 * @param expr feature expression from the input line
 * @return tag names prefixed with "_"
 */
def buildFeature(expr: String): Array[String] = {
  if (Strings.isNullOrEmpty(expr)) {
    return Array.empty
  }
  val arr = StringUtils.splitPreserveAllTokens(expr, CTRL_B)
  val buffer = new ArrayBuffer[String]()
  for (key <- arr) {
    val pair = StringUtils.splitPreserveAllTokens(key, CTRL_C)
    buffer += s"_${pair(0)}"
  }
  buffer.toArray
}
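
// For illustration: buildFeature("10\u0003x\u000220\u0003y") returns
// Array("_10", "_20") -- the first \u0003-separated token of each
// \u0002-separated feature, prefixed with "_" (presumably so the generated
// column names never start with a digit).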

/**
 * Dynamically build the DataFrame schema
 * @param map tag map produced by buildResource
 * @return schema with fixed user/platform columns plus one nullable column per tag
 */
def buildSchema(map: Map[String, String]): StructType = {
  val buffer = new ArrayBuffer[StructField]()
  buffer += StructField("user", StringType, false)
  buffer += StructField("platform", IntegerType, false)
  for (pair <- map) {
    buffer += StructField(s"_${pair._1}", IntegerType, true)
  }
  StructType(List(buffer: _*))
}
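
// Note: buildSchema iterates the broadcast tag map while buildRow iterates the
// per-user map from buildUser. The row values line up with the schema only if
// both maps yield the same key order; with hash-ordered immutable maps over an
// identical key set this happens to hold, but iterating a single sorted key
// list in both places would make the ordering explicit.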

/**
 * Build a Spark SQL Row from the user's attributes
 * @param map tag map produced by buildUser
 * @param user user cookie
 * @param platform platform id
 * @return row whose field order matches the schema from buildSchema
 */
def buildRow(map: Map[String, String], user: String, platform: Int): Row = {
  val buffer = new ArrayBuffer[Any]()
  buffer += user
  buffer += platform
  for (pair <- map) {
    buffer += pair._2.toInt
  }
  Row(buffer: _*)
}
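
// Once the job has run, the Parquet output can be sanity-checked with, e.g.:
//
//   val df = sqlContext.read.parquet("/home/dev/output")
//   df.printSchema()  // user: string, platform: int, one nullable int per "_" tag
//   df.show(5)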