ML预测婴儿生存几率 + 超参调优(Train-validation)

# 加载SparkSession
from pyspark.sql import SparkSession
# 加载sql类型
import pyspark.sql.types as typ
import pyspark.ml.feature as ft
# 加载分类
import pyspark.ml.classification as cl
# 加载管道
from pyspark.ml import Pipeline
# 加载评估.evaluation包
import pyspark.ml.evaluation as ev
# 加载管道模型
from pyspark.ml import PipelineModel
# 加载tune包
import pyspark.ml.tuning as tune
# Create (or reuse) the Spark session, running against the local master.
spark = (
    SparkSession.builder
    .master('local')
    .appName('ML预测')
    .getOrCreate()
)

# Column layout used for the births CSV: the label column first, then
# BIRTH_PLACE as a string, then the integer-typed columns listed below.
integer_columns = [
    'MOTHER_AGE_YEARS',
    'FATHER_COMBINED_AGE',
    'CIG_BEFORE',
    'CIG_1_TRI',
    'CIG_2_TRI',
    'CIG_3_TRI',
    'MOTHER_HEIGHT_IN',
    'MOTHER_PRE_WEIGHT',
    'MOTHER_DELIVERY_WEIGHT',
    'MOTHER_WEIGHT_GAIN',
    'DIABETES_PRE',
    'DIABETES_GEST',
    'HYP_TENS_PRE',
    'HYP_TENS_GEST',
    'PREV_BIRTH_PRETERM',
]
labels = [
    ('INFANT_ALIVE_AT_REPORT', typ.IntegerType()),
    ('BIRTH_PLACE', typ.StringType()),
] + [(name, typ.IntegerType()) for name in integer_columns]

# One StructField per (name, type) pair; the third argument (nullable) is
# False for every field.
schema = typ.StructType(
    [typ.StructField(name, data_type, False) for name, data_type in labels]
)

# Read the .csv.gz file, treating its first row as a header and applying the
# explicit schema built above.
births = spark.read.csv(
    'file:///Program Files/Pyproject/pyspark/data/births_transformed.csv.gz',
    header=True,
    schema=schema,
)
# One-hot encoding with OneHotEncoder: first add BIRTH_PLACE_INT, an
# integer-cast copy of the string BIRTH_PLACE column, for the encoder to read.
birth_place_as_int = births['BIRTH_PLACE'].cast(typ.IntegerType())
births = births.withColumn('BIRTH_PLACE_INT', birth_place_as_int)

# Encoder stage: BIRTH_PLACE_INT -> BIRTH_PLACE_VEC.
encoder = ft.OneHotEncoder(
    inputCol='BIRTH_PLACE_INT',
    outputCol='BIRTH_PLACE_VEC',
)

# VectorAssembler stage: combine every column named in `labels` from the
# third entry onward, plus the encoder's output column, into one `features`
# column.
assembler_inputs = [name for name, _ in labels[2:]]
assembler_inputs.append(encoder.getOutputCol())
featuresCreator = ft.VectorAssembler(
    inputCols=assembler_inputs,
    outputCol='features',
)
# Random 70/30 split of the data into training and test sets, with a fixed
# seed.
births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

# Estimator: logistic regression with INFANT_ALIVE_AT_REPORT as the label.
logistic = cl.LogisticRegression(
    labelCol='INFANT_ALIVE_AT_REPORT',
    maxIter=10,
    regParam=0.01,
)

# Pipeline stages, in order: one-hot encoder, feature assembler, classifier.
pipline = Pipeline(stages=[encoder, featuresCreator, logistic])

# Fit the whole pipeline on the training set, then apply the fitted model to
# the test set.
model = pipline.fit(births_train)
test_model = model.transform(births_test)

# Fetch the first transformed row. The value is not assigned, so it is only
# visible when this runs in an interactive session that echoes expressions.
test_model.take(1)
[Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', MOTHER_AGE_YEARS=13, FATHER_COMBINED_AGE=99, CIG_BEFORE=0, CIG_1_TRI=0, CIG_2_TRI=0, CIG_3_TRI=0, MOTHER_HEIGHT_IN=66, MOTHER_PRE_WEIGHT=133, MOTHER_DELIVERY_WEIGHT=135, MOTHER_WEIGHT_GAIN=2, DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0, BIRTH_PLACE_INT=1, BIRTH_PLACE_VEC=SparseVector(9, {1: 1.0}), features=SparseVector(24, {0: 13.0, 1: 99.0, 6: 66.0, 7: 133.0, 8: 135.0, 9: 2.0, 16: 1.0}), rawPrediction=DenseVector([1.0573, -1.0573]), probability=DenseVector([0.7422, 0.2578]), prediction=0.0)]
# Evaluate the model on the transformed test set. The evaluator is configured
# to read its scores from the `probability` column and the true labels from
# INFANT_ALIVE_AT_REPORT.
evalutor = ev.BinaryClassificationEvaluator(
    labelCol='INFANT_ALIVE_AT_REPORT',
    rawPredictionCol='probability',
)

# Print areaUnderROC first, then areaUnderPR, overriding the evaluator's
# metricName param for each call.
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(evalutor.evaluate(test_model, {evalutor.metricName: metric}))
0.7401301847095617
0.7139354342365674
Train-validation划分

TrainValidationSplit 将输入的数据集(训练数据集)随机划分为两个子集:较小的训练集和验证集。该划分仅执行一次。

# Feature selection stage; numTopFeatures sets how many features the selector
# returns. It reads the assembler's output column and writes
# `selectedFeatures`.
selector = ft.ChiSqSelector(
    numTopFeatures=5,
    featuresCol=featuresCreator.getOutputCol(),
    outputCol='selectedFeatures',
    labelCol='INFANT_ALIVE_AT_REPORT',
)

# Logistic regression trained on the selected features only. maxIter and
# regParam are not set here; they are supplied by the parameter grid below.
logistic = cl.LogisticRegression(
    featuresCol='selectedFeatures',
    labelCol='INFANT_ALIVE_AT_REPORT',
)

# Data-preparation pipeline (no classifier stage): encode, assemble, select.
pipline = Pipeline(stages=[encoder, featuresCreator, selector])
data_transformer = pipline.fit(births_train)

# 3 x 3 grid of candidate (maxIter, regParam) combinations.
grid = (
    tune.ParamGridBuilder()
    .addGrid(logistic.maxIter, [2, 10, 50])
    .addGrid(logistic.regParam, [0.01, 0.05, 0.3])
    .build()
)

# Train-validation split over the grid, scored with the evaluator defined
# earlier in the script.
tvs = tune.TrainValidationSplit(
    estimator=logistic,
    estimatorParamMaps=grid,
    evaluator=evalutor,
)

# Tune on the prepared training data.
tvsModel = tvs.fit(data_transformer.transform(births_train))

# Prepare the held-out test data with the same fitted transformer, apply the
# tuned model to it, and print areaUnderROC followed by areaUnderPR.
data_test = data_transformer.transform(births_test)
results = tvsModel.transform(data_test)
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(evalutor.evaluate(results, {evalutor.metricName: metric}))
0.7281442007082886
0.7028311785316211
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值