package march.sql
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.functions._
/**
 * Description: CSV data file cleaning with the DataFrame API. Reads per-city
 * house-listing CSV files (Chengdu, Hangzhou, Shanghai, Kunming), merges and
 * cleans them, then writes the result back out as a single CSV.
 *
 * @Author: 留歌36
 * @Date: 2019/3/6 8:57
 */
object ChengduHouseAPP {
def main(args: Array[String]): Unit = {
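// local[2]: run Spark in-process with two worker threads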
val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[2]").getOrCreate()
// implicit conversions for the Dataset/DataFrame API (e.g. the $"col" syntax)
import spark.implicits._
val path = "f:\\data\\chengdu_house.csv"
val path2 = "f:\\data\\hangzhou_house.csv"
val path3 = "f:\\data\\shanghai_house.csv"
val path4 = "f:\\data\\kunming_house.csv"
val DF = spark.read.option("header","true").option("inferSchema","true").csv(path)
val DF2 = spark.read.option("header","true").option("inferSchema","true").csv(path2)
val DF3 = spark.read.option("header","true").option("inferSchema","true").csv(path3)
val DF4 = spark.read.option("header","true").option("inferSchema","true").csv(path4)
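// the four reads above differ only in the path; a small helper (hypothetical,
// not in the original) would remove the duplication:
//   def readHouseCsv(p: String): DataFrame =
//     spark.read.option("header", "true").option("inferSchema", "true").csv(p)
//   val Seq(chengduDF, hangzhouDF, shanghaiDF, kunmingDF) =
//     Seq(path, path2, path3, path4).map(readHouseCsv)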
val columns = Seq("house_info", "region", "publishday", "visited", "attention", "total_price", "unit_price", "url")
val DF5 = DF.select(columns.map(col): _*)
.union(DF2.select(columns.map(col): _*))
.union(DF3.select(columns.map(col): _*))
.union(DF4.select(columns.map(col): _*))
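// union matches columns by position, not by name, which is why every input
// (including the first) is projected onto the same column list before the union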
println("chengdu_houseDF:"+DF.count())
println("hangzhou_houseDF2:"+DF2.count())
println("shanghai_houseDF3:"+DF3.count())
println("kunming_houseDF4:"+DF4.count())
println("AllDF5:"+DF5.count())
val DF6 = DF5
.drop("url")
.drop("publishday")
// filter with unqualified col(...): the original used DF.col, which refers to
// the first source frame rather than DF5 itself
.filter(col("region").isNotNull)
.filter(col("unit_price").isNotNull)
// .filter(col("visited").isNotNull)
// .filter(col("attention").isNotNull)
// .filter(size(split(col("house_info"), "\\|")) === 6)
// house_info is pipe-separated, e.g. "小区 | 3室2厅 | 98.5平米 | 南 | 精装 | 有电梯";
// the fixed substr offsets assume single-digit room/hall counts and the leading
// space left over from splitting on "|"
.withColumn("rooms", split(col("house_info"), "\\|").getItem(1).substr(2, 1))
.withColumn("halls", split(col("house_info"), "\\|").getItem(1).substr(4, 1))
.withColumn("towards", split(col("house_info"), "\\|").getItem(3))
.withColumn("area", split(col("house_info"), "\\|").getItem(2))
.withColumn("decoration", split(col("house_info"), "\\|").getItem(4))
.withColumn("have_elevator", split(col("house_info"), "\\|").getItem(5))
.drop("house_info")
val DF7 = DF6.select("region", "rooms", "halls", "towards", "decoration", "have_elevator",
"visited", "attention", "unit_price", "area", "total_price")
DF7.show(10,false)
println(DF7.count())
DF7.coalesce(1).write.option("header", "true").mode(SaveMode.Overwrite).csv("f:\\data\\logout\\")
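// coalesce(1) funnels all rows through a single task so exactly one CSV file is
// written; for large outputs, drop it and let Spark write one file per partition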
spark.stop()
}
// helper sketch for the null filters above; the original stub was incomplete:
// private def noNull(df: DataFrame, colName: String): DataFrame =
//   df.filter(col(colName).isNotNull)
}