一、特征工程
- 查看数据的分布
1df.describe().show()
2df.summary().show() //这个显示比describe更全
1 唯一值删除
移除列特征中只有一个值的列
1
/** Drop every column that holds exactly one distinct value.
  *
  * A single aggregation job computes countDistinct for all columns at once,
  * then only columns whose distinct count != 1 are kept.
  * NOTE: countDistinct ignores NULLs, so an all-NULL column (count 0) is kept
  * — confirm that is the desired behavior for your data.
  */
def UniqueValueRemove(df: DataFrame): DataFrame = {
  // one row: each field is countDistinct of the corresponding column
  val counts = df.select(df.columns.map(c => countDistinct(col(c)).alias(c)): _*)
  val distinctByCol = counts.columns.zip(counts.first.toSeq).toMap
  // filter directly — no ""-sentinel, which would break on an empty column name
  val keep = df.columns.filter(c => distinctByCol(c) != 1)
  df.select(keep.map(col): _*)
}
12
13// 优化版
/** Optimized single-value-column removal.
  *
  * The previous version ran one Spark job (`distinct.count`) per column;
  * this computes all distinct counts in ONE aggregation pass.
  * NOTE: countDistinct ignores NULLs (distinct.count does not), so columns
  * containing only NULLs are kept here — verify this matches your intent.
  * Relies on a `df: DataFrame` in the enclosing scope, like the original.
  */
def removeSingValue(): DataFrame = {
  val counts = df.select(df.columns.map(c => countDistinct(col(c)).alias(c)): _*).first.toSeq
  // keep only columns with more than one distinct value — no ""-sentinel needed
  val keep = df.columns.zip(counts).collect { case (name, cnt) if cnt != 1 => name }
  df.select(keep.map(col): _*)
}
2 删除重复行
1 删除指定列的重复值
// drop rows that duplicate the "salary" column (missing `val` made this invalid Scala)
val newDF = df.dropDuplicates(Seq("salary"))
2 删除重复行(如有两行的所有字段全部相同,则删除一行)
// drop rows where ALL fields are identical (missing `val` made this invalid Scala)
val newDF = df.dropDuplicates()
3 条件过滤某些异常值
3.1 filter
如 filter("age>=5") 是取出 age>=5 的记录(保留满足条件的行),不要理解反了
df.filter("id = 1 or c1 = 'b'").show()
// Build the predicate with the s-interpolator: a bare "$id_col" is NOT
// interpolated and would filter on the literal text "$id_col==2".
val id_col = "id"
df.filter(s"$id_col = 2")
3.2 where
df.where("age>10").describe.show(10)
3.3 对指定字段进行特殊处理
df.selectExpr("id" , "c3 as time" , "round(c4) as c4" ).show(false)
4 填充缺失值
对连续值进行缺失值填充
1离散值默认填充other,连续值填充均值
/** Fill missing values: categorical columns get the literal "other",
  * numeric columns get their column mean (computed in one aggregation job).
  *
  * @param df      full DataFrame to fill
  * @param num_df  projection of df containing only the numeric columns
  * @param obj_col names of the categorical (string) columns
  */
def fillDataframeNumberMeanNa(df: DataFrame, num_df: DataFrame, obj_col: Array[String]): DataFrame = {
  println("填充缺失值-离散值默认填充other,连续值填充均值")
  // column -> mean; skip NULL means (all-NULL columns) so na.fill won't fail
  val meanByCol = num_df.columns
    .zip(num_df.select(num_df.columns.map(mean(_)): _*).first.toSeq)
    .collect { case (c, m) if m != null => c -> m }
    .toMap
  // val-chaining instead of reassigning a var
  df.na.fill(value = "other", cols = obj_col).na.fill(meanByCol)
}
2离散值默认填充other,连续值填充max
/** Fill missing values: categorical columns get the literal "other",
  * numeric columns get their column MAX (computed in one aggregation job).
  *
  * @param df      full DataFrame to fill
  * @param num_df  projection of df containing only the numeric columns
  * @param obj_col names of the categorical (string) columns
  */
def fillDataframeNumberMaxNa(df: DataFrame, num_df: DataFrame, obj_col: Array[String]): DataFrame = {
  // bug fix: the message previously said "均值" (mean) although max is filled
  println("填充缺失值-离散值默认填充other,连续值填充最大值")
  // column -> max; skip NULL maxima (all-NULL columns) so na.fill won't fail
  val maxByCol = num_df.columns
    .zip(num_df.select(num_df.columns.map(max(_)): _*).first.toSeq)
    .collect { case (c, m) if m != null => c -> m }
    .toMap
  // val-chaining instead of reassigning a var
  df.na.fill(value = "other", cols = obj_col).na.fill(maxByCol)
}
3 填充其他依次类推
也可指定值填充
// fill specific columns with explicit values (column -> replacement)
val map_dict = Map("poi_type1" -> "other", "poi_type2" -> "other")
// val instead of var (never reassigned); stray backtick removed
val TrainFillData = TrainData.na.fill(map_dict)
5 相关性分析
5.1 pearson
和label的相关性(label为连续值),feature列也为连续值
// threshold为阈值
// Keep feature columns whose absolute Pearson correlation with the label
// exceeds `threshold`, then drop the label column itself.
// An explicit lambda is required: a bare `_` would be captured by the inner
// corr(...) call — `.abs` would then be applied to a function value and the
// snippet would not compile.
val featCols = dfTrain.columns
  .filter(c => dfTrain.stat.corr(c, label_col, "pearson").abs > threshold)
  .filter(_ != "cardio")
需要注意的是,官方文档中强调:
Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in MLlib's Statistics.
6 标准化
6.1 onehot
对分类型变量首先进行stringindex转化,将字符型变量变为数字型变量
// Categorical columns to one-hot encode; each first goes through a
// StringIndexer mapping string values to numeric indices ("<col>ToInt").
val onehot_col = Array(
  "gender",
  "cholesterol",
  "gluc",
  "smoke",
  "alco"
)
val onehotcolToInt = onehot_col.map(c => c + "ToInt")
val standardIndex = onehot_col.map { c =>
  new StringIndexer().setInputCol(c).setOutputCol(c + "ToInt")
}
// assemble the indexed categorical columns plus extra numeric columns into
// one feature vector (leading dots were missing; `onehot_colToInt` was a
// misspelling of `onehotcolToInt`)
val vectorAssembler = new VectorAssembler()
  .setInputCols(onehotcolToInt ++ Array("col1", "col2"))
  .setOutputCol("features")
val pipelineFinal = new Pipeline()
  .setStages(standardIndex ++ Array(vectorAssembler))
val modelFinal = pipelineFinal.fit(scaledfTrain)
val scaledfTrain1 = modelFinal.transform(scaledfTrain)
val scaleDfTest1 = modelFinal.transform(scaleDfTest)
然后使用 OneHotEncoder()进行onehot
// NOTE(review): `indexer` and `cate` come from an enclosing loop over the
// categorical columns that is not shown in this excerpt — confirm before reuse.
val encoder = new OneHotEncoder().setInputCol(indexer.getOutputCol).setOutputCol(s"${cate}classVec")
6.2 zscore
注意⚠️ ,StandardScaler()传入参数必须是向量
(org.apache.spark.ml.linalg.Vector),所以在进行标准化之前,需要先用 new VectorAssembler()转化为向量的形式
// Numeric columns to standardize (z-score).
val scale_col=Array(
"age"
,"height"
,"weight"
,"ap_hi"
,"ap_lo"
)
// StandardScaler requires a Vector input, so assemble the columns first.
val vectorScale = new VectorAssembler()
.setInputCols(scale_col)
.setOutputCol("feaScale")
val scale=new StandardScaler().setInputCol("feaScale").setOutputCol("sfea")
// Fit on the training set only, then apply the same fitted stats to both sets.
val pipeline = new Pipeline().setStages(Array(vectorScale,scale))
val model = pipeline.fit(dfTrain)
// scaledfTrain / scaleDfTest are reused by the one-hot pipeline in section 6.1.
val scaledfTrain=model.transform(dfTrain)
val scaleDfTest = model.transform(dfValid)
6.3 minmax标准化
对 one-hot 编码后的特征向量再做 minmax 标准化,用法与上面的 StandardScaler 流程相同
// MinMaxScaler is the estimator to fit; MinMaxScalerModel is only produced
// by .fit() and cannot be constructed with `new` for training.
val scale = new MinMaxScaler().setInputCol("feaScale").setOutputCol("sfea")