// Python version of this project: https://www.kesci.com/home/project/5da974e9c83fb400420f77d3
package dataclear
/**
* @CreateUser: eshter
* @CreateDate: 2019/10/23
* @UpdateUser:
*/
import utils.session.IgnoreErrorAndINFO
import org.apache.spark.sql.{
DataFrame, SparkSession}
import org.apache.spark.ml.classification.{
LogisticRegression}
import org.apache.spark.ml.feature.{
StandardScaler, VectorAssembler, _}
import utils.metrics.Metrics
import org.apache.spark.ml.Pipeline
object cardioTrainLr {
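/*
Notes:
1. The label column is "cardio".
2. StandardScaler only accepts vector input (org.apache.spark.ml.linalg.Vector).
3. The continuous variables in the data are
Array(
"age"
,"height"
,"weight"
,"ap_hi"
,"ap_lo"
)
4. The discrete variables in the data are
Array(
"gender"
,"cholesterol"
,"gluc"
,"smoke"
,"alco"
)
*/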
// Executed as part of this object's initialisation, before any method below is used.
// NOTE(review): IgnoreErrorAndINFO comes from utils.session; presumably it lowers
// log verbosity (suppressing INFO/ERROR output) — confirm against that helper.
new IgnoreErrorAndINFO().ignoreErrorAndInfo()
/**
 * Randomly splits a DataFrame into two parts.
 *
 * The weights handed to `randomSplit` are `splitRate` and `1 - splitRate`,
 * so the first returned part is the one sampled with weight `splitRate`.
 *
 * @param df        the DataFrame to split
 * @param splitRate weight of the first part; must lie in [0, 1]
 * @param seed      seed for the random sampling; defaults to 2, the value that
 *                  was previously hard-coded, so existing callers are unaffected
 * @return a two-element list: the part weighted `splitRate`, then the remainder
 * @throws IllegalArgumentException if `splitRate` is NaN or outside [0, 1]
 */
def splitData(df: DataFrame, splitRate: Double, seed: Long = 2L): List[DataFrame] = {
  // Fail fast with a clear message: a rate outside [0, 1] would produce a negative weight.
  require(
    splitRate >= 0.0 && splitRate <= 1.0,
    s"splitRate must be in [0, 1], but got $splitRate"
  )
  val weights = Array(splitRate, 1 - splitRate)
  // Two weights are supplied, so exactly two splits are read back.
  val parts = df.randomSplit(weights, seed = seed)
  List(parts(0), parts(1))
}
def featureHandleTest(dfTrain:DataFrame,dfValid:DataFrame