python版传送门:https://www.kesci.com/home/project/5da974e9c83fb400420f77d3
package dataclear
/**
* @CreateUser: eshter
* @CreateDate: 2019/10/23
* @UpdateUser:
*/
import utils.session.IgnoreErrorAndINFO
import org.apache.spark.sql.{
DataFrame, SparkSession}
import org.apache.spark.ml.classification.{
LogisticRegression}
import org.apache.spark.ml.feature.{
StandardScaler, VectorAssembler, _}
import utils.metrics.Metrics
import org.apache.spark.ml.Pipeline
object cardioTrainLr {
/*
注意:
1、label =cardio
2、StandardScaler 只支持输入向量(org.apache.spark.ml.linalg.Vector)的数据
3、数据的连续型变量为Array(
"age"
,"height"
,"weight"
,"ap_hi"
,"ap_lo"
)
4、数据的离散型变量为
Array(
"gender"
,"cholesterol"
,"gluc"
,"smoke"
,"alco"
)
*/
// Runs as part of this object's initialization, i.e. once, when cardioTrainLr is first accessed.
// NOTE(review): judging by its name, this project helper presumably lowers the Spark log level
// so that ERROR/INFO output is hidden — confirm against utils.session.IgnoreErrorAndINFO.
new IgnoreErrorAndINFO().ignoreErrorAndInfo()
/**
 * Randomly splits a DataFrame into two parts.
 *
 * The split is delegated to `Dataset.randomSplit` with the weights
 * `(splitRate, 1 - splitRate)`, so the resulting sizes are approximate
 * rather than exact fractions of the input.
 *
 * @param df        the DataFrame to split
 * @param splitRate weight of the first part; must lie in `[0, 1]`
 * @param seed      seed for the random sampling; defaults to 2, the value that
 *                  was previously hard-coded, so existing callers get the same split
 * @return a two-element list: the part weighted `splitRate` first, the remainder second
 * @throws IllegalArgumentException if `splitRate` is outside `[0, 1]` (or NaN)
 */
def splitData(df: DataFrame, splitRate: Double, seed: Long = 2L): List[DataFrame] = {
  // Fail early with a descriptive message; randomSplit would otherwise reject the
  // negative weight itself, with an error that does not mention splitRate.
  require(
    splitRate >= 0.0 && splitRate <= 1.0,
    s"splitRate must be within [0, 1], but was $splitRate"
  )
  val weights = Array(splitRate, 1 - splitRate)
  // randomSplit returns one Dataset per weight, in weight order.
  df.randomSplit(weights, seed).toList
}
def featureHandleTest(dfTrain:DataFrame,dfValid:DataFrame

本文探讨了如何利用Spark框架,结合特征工程和算法技术,对心血管疾病进行预测。提供了从数据预处理到模型构建的详细流程,旨在提高预测的准确性和效率。
最低0.47元/天 解锁文章
2305

被折叠的 条评论
为什么被折叠?



