建模时通常需要统计数据中各列的缺失值比例。网上的方法大多通过 map 逐行处理来完成,效率较低;这里分享一种使用 Spark SQL 在一条查询中完成各列缺失值比例统计的方法。
/** Reports, for every column of a CSV file, the fraction of rows whose value is missing (NULL).
  *
  * The ratios are computed by a single Spark SQL aggregation over a temporary view rather than
  * by mapping over the rows.
  */
object MyTest {

  /** Wraps an identifier in backticks, doubling any embedded backtick, so that names containing
    * dots, spaces or reserved words remain valid inside the generated SQL.
    */
  private def quoteIdentifier(name: String): String =
    "`" + name.replace("`", "``") + "`"

  /** Builds the aggregation query that yields one missing-value ratio per column.
    *
    * @param columns  names of the columns to inspect
    * @param table    name of the temporary view to query
    * @param rowCount total number of rows, used as the denominator of every ratio
    * @return the SQL text; each result column keeps the name of the source column
    */
  private def missingRatioSql(columns: Seq[String], table: String, rowCount: Long): String = {
    val ratios = columns.map { name =>
      val quoted = quoteIdentifier(name)
      // `x is null or null` evaluates to TRUE for a NULL cell and to NULL otherwise;
      // count() ignores NULLs, so it counts exactly the missing cells.
      s"count($quoted is null or null)/$rowCount as $quoted"
    }
    s"select ${ratios.mkString(", ")} from $table"
  }

  /** Loads the CSV file, prints the generated SQL and shows the per-column missing ratios.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.driver.host", "localhost")
      .setMaster("local[*]")
      .setAppName("outlier")
    val spark = SparkSession.builder.config(conf).getOrCreate()

    val df = spark.read
      .format("csv")
      .option("sep", ",")
      .option("inferSchema", "true")
      .option("header", "true")
      .load("/opt/temp/iris4minority.csv")

    // Create the temporary view. The letter prefix keeps the generated name a plain
    // identifier even when the UUID itself starts with a digit.
    val viewName = "tmp_" + UUID.randomUUID().toString.replace("-", "")
    df.createTempView(viewName)
    try {
      // Total row count: the denominator shared by every column's ratio.
      val rowCount = df.count()

      // Assemble the SQL.
      val sql = missingRatioSql(df.columns.toSeq, viewName, rowCount)
      println(sql)

      val frame: DataFrame = spark.sql(sql)
      frame.show()
    } finally {
      // Remove the view registered above so it does not linger in the session catalog.
      spark.catalog.dropTempView(viewName)
    }
  }
}