# 加载SparkSession
from pyspark.sql import SparkSession
# 加载sql类型
import pyspark.sql.types as typ
import pyspark.ml.feature as ft
# 加载分类
import pyspark.ml.classification as cl
# 加载管道
from pyspark.ml import Pipeline
# 加载评估.evaluation包
import pyspark.ml.evaluation as ev
# 加载管道模型
from pyspark.ml import PipelineModel
# 加载tune包
import pyspark.ml.tuning as tune
# Create (or reuse) a Spark session running on the 'local' master.
spark = (
    SparkSession.builder
    .master('local')
    .appName('ML预测')
    .getOrCreate()
)
# (column name, Spark SQL type) pairs, in the order used to build the schema.
# The first two entries are the label column and the string-typed birth
# place; every remaining column is an integer.
labels = [
    ('INFANT_ALIVE_AT_REPORT', typ.IntegerType()),
    ('BIRTH_PLACE', typ.StringType()),
] + [
    (int_column, typ.IntegerType())
    for int_column in (
        'MOTHER_AGE_YEARS',
        'FATHER_COMBINED_AGE',
        'CIG_BEFORE',
        'CIG_1_TRI',
        'CIG_2_TRI',
        'CIG_3_TRI',
        'MOTHER_HEIGHT_IN',
        'MOTHER_PRE_WEIGHT',
        'MOTHER_DELIVERY_WEIGHT',
        'MOTHER_WEIGHT_GAIN',
        'DIABETES_PRE',
        'DIABETES_GEST',
        'HYP_TENS_PRE',
        'HYP_TENS_GEST',
        'PREV_BIRTH_PRETERM',
    )
]

# Explicit schema: one non-nullable field per (name, type) pair above.
schema = typ.StructType(
    [typ.StructField(name, dtype, False) for name, dtype in labels]
)
# Read the gzipped births CSV (with a header row) using the explicit schema.
births = spark.read.csv(
    'file:///Program Files/Pyproject/pyspark/data/births_transformed.csv.gz',
    schema=schema,
    header=True,
)
# One-hot encoding, step 1: BIRTH_PLACE is read as a string, so add an
# integer copy of it that the OneHotEncoder can take as its input column.
births = births.withColumn(
    'BIRTH_PLACE_INT',
    births.BIRTH_PLACE.cast(typ.IntegerType()),
)
# Encoder stage: BIRTH_PLACE_INT -> one-hot vector column BIRTH_PLACE_VEC.
encoder = ft.OneHotEncoder(
    inputCol='BIRTH_PLACE_INT',
    outputCol='BIRTH_PLACE_VEC',
)
# VectorAssembler stage: collect every column from the third `labels` entry
# onwards (i.e. skipping the label and the raw BIRTH_PLACE string) plus the
# encoder's output column into a single 'features' vector.
featuresCreator = ft.VectorAssembler(
    inputCols=[
        *(name for name, _ in labels[2:]),
        encoder.getOutputCol(),
    ],
    outputCol='features',
)
# Estimator: logistic regression predicting INFANT_ALIVE_AT_REPORT.
logistic = cl.LogisticRegression(
    labelCol='INFANT_ALIVE_AT_REPORT',
    maxIter=10,
    regParam=0.01,
)
# Pipeline: one-hot encode, assemble the feature vector, then classify.
pipline = Pipeline(
    stages=[encoder, featuresCreator, logistic],
)
# Split the data 70/30 into training and test sets; the seed is fixed so the
# split is repeatable.
births_train, births_test = births.randomSplit(weights=[0.7, 0.3], seed=666)
# Fit the whole pipeline on the training split ...
model = pipline.fit(births_train)
# ... and apply the fitted pipeline to the held-out test split.
test_model = model.transform(births_test)
# Fetch the first transformed row (displayed when run interactively).
test_model.take(1)
# [Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', MOTHER_AGE_YEARS=13, FATHER_COMBINED_AGE=99, CIG_BEFORE=0, CIG_1_TRI=0, CIG_2_TRI=0, CIG_3_TRI=0, MOTHER_HEIGHT_IN=66, MOTHER_PRE_WEIGHT=133, MOTHER_DELIVERY_WEIGHT=135, MOTHER_WEIGHT_GAIN=2, DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0, BIRTH_PLACE_INT=1, BIRTH_PLACE_VEC=SparseVector(9, {1: 1.0}), features=SparseVector(24, {0: 13.0, 1: 99.0, 6: 66.0, 7: 133.0, 8: 135.0, 9: 2.0, 16: 1.0}), rawPrediction=DenseVector([1.0573, -1.0573]), probability=DenseVector([0.7422, 0.2578]), prediction=0.0)]
# Evaluate the model: binary-classification evaluator that scores the
# 'probability' column against the INFANT_ALIVE_AT_REPORT label.
evalutor = ev.BinaryClassificationEvaluator(
    labelCol='INFANT_ALIVE_AT_REPORT',
    rawPredictionCol='probability',
)
# Print area under the ROC curve, then area under the precision-recall curve,
# overriding the evaluator's metric per call via a param map.
for _metric in ('areaUnderROC', 'areaUnderPR'):
    print(evalutor.evaluate(test_model, {evalutor.metricName: _metric}))
# 0.7401301847095617
# 0.7139354342365674
# Train-validation划分
# TrainValidationSplit模型对输入的数据集(训练数据集)随机划分,并分成两个子集:较小的训练集和验证集。划分仅执行一次。
# Chi-squared feature selector; numTopFeatures is the number of features to
# return (5), written to the 'selectedFeatures' column.
selector = ft.ChiSqSelector(
    numTopFeatures=5,
    featuresCol=featuresCreator.getOutputCol(),
    outputCol='selectedFeatures',
    labelCol='INFANT_ALIVE_AT_REPORT',
)
# Logistic regression that trains on the selected features only.
logistic = cl.LogisticRegression(
    featuresCol='selectedFeatures',
    labelCol='INFANT_ALIVE_AT_REPORT',
)
# Data-preparation pipeline (no classifier stage): encode, assemble, select.
pipline = Pipeline(
    stages=[encoder, featuresCreator, selector],
)
data_tranformer = pipline.fit(births_train)
# Hyper-parameter grid: 3 maxIter values x 3 regParam values.
grid = (
    tune.ParamGridBuilder()
    .addGrid(logistic.maxIter, [2, 10, 50])
    .addGrid(logistic.regParam, [0.01, 0.05, 0.3])
    .build()
)
# Train-validation split over the grid, scored with the existing evaluator.
tvs = tune.TrainValidationSplit(
    estimator=logistic,
    estimatorParamMaps=grid,
    evaluator=evalutor,
)
# Fit on the transformed training split.
tvsModel = tvs.fit(data_tranformer.transform(births_train))
# NOTE: despite its name, data_train holds the transformed *test* split.
data_train = data_tranformer.transform(births_test)
results = tvsModel.transform(data_train)
# Print area under ROC, then area under PR, for the tuned model's predictions.
for _metric in ('areaUnderROC', 'areaUnderPR'):
    print(evalutor.evaluate(results, {evalutor.metricName: _metric}))
# 0.7281442007082886
# 0.7028311785316211