利用pyspark进行建模及调参

本文将 UCI 的 breast_cancer 数据集转换为 Spark DataFrame 进行演示,讲解利用 PySpark 进行建模及调参的完整流程和方法。

 

from sklearn import datasets
import pandas as pd 
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import pyspark.ml.tuning as tune
from pyspark.ml import Transformer,Pipeline
from pyspark.ml.classification import LogisticRegression,RandomForestClassifier
from pyspark.ml.feature import VectorAssembler

# Load the UCI breast-cancer dataset and convert it into a Spark DataFrame.
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target
# Feature column names f1..f30 — one per numeric feature in the dataset.
columns = ['f{}'.format(i) for i in range(1, 31)]
pdf = pd.concat(
    [pd.DataFrame(X, columns=columns), pd.DataFrame(y, columns=['label'])],
    axis=1,
)
# `spark` is assumed to be an existing SparkSession (defined outside this snippet).
spark_df = spark.createDataFrame(pdf.values.tolist(), pdf.columns.tolist())

预处理及划分数据集

# Assemble the 30 feature columns into a single vector column "features",
# as required by Spark ML estimators.
assembler = VectorAssembler(inputCols=columns, outputCol="features")
feature_pipeline = Pipeline(stages=[assembler])
dataset = feature_pipeline.fit(spark_df).transform(spark_df)
# 70/30 train/test split with a fixed seed (123) for reproducibility.
trainingData, testData = dataset.randomSplit([0.7, 0.3], 123)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))
>>Training Dataset Count: 381
>>Test Dataset Count: 188

逻辑回归默认参数训练及预测

# Train a logistic regression with hand-picked default hyperparameters.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
# Score the held-out test split.
prediction = lrModel.transform(testData)
# NOTE(review): the evaluator is pointed at the 'probability' column rather
# than the conventional 'rawPrediction'; ROC/PR ranking is unchanged because
# probability is monotone in the raw score.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability')
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(metric + ':', evaluator.evaluate(prediction, {evaluator.metricName: metric}))
# Accuracy = fraction of test rows where the predicted label matches the truth.
hits = prediction.filter(prediction.label == prediction.prediction).count()
print('accuracy:', hits / prediction.count())
>>areaUnderROC: 0.9819204980842913
>>areaUnderPR: 0.9856307317265275
>>accuracy: 0.9308510638297872

逻辑回归模型调参

# Hyperparameter grid for cross-validating the logistic regression.
grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.3, 0.5])         # regularization strength
    .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])  # 0 = pure L2 (ridge)
    .build()
)
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='probability', metricName='areaUnderROC')
# 3-fold cross-validation over all 9 parameter combinations.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
)
cvModel = cv.fit(trainingData)

输出最优参数

# Pair each parameter combination with its mean cross-validation metric.
# A ParamMap is dict-like (Param -> value), so iterate it with .items()
# instead of re-zipping .keys() with .values().
results = [
    ([{param.name: value} for param, value in params.items()], metric)
    for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
]

# Best combination = highest average areaUnderROC; max() is the O(n) idiom
# versus sorting the whole list just to take the first element.
max(results, key=lambda el: el[1])
>>([{'regParam': 0.1}, {'elasticNetParam': 0.0}], 0.9953622902152315)

修改为最优参数进行预测

# Refit the logistic regression using the best hyperparameters found by CV.
lr_new = LogisticRegression(maxIter=20, regParam=0.1, elasticNetParam=0)
lrModel_new = lr_new.fit(trainingData)
# Score the test split with the tuned model.
prediction_new = lrModel_new.transform(testData)
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(metric + ':', evaluator.evaluate(prediction_new, {evaluator.metricName: metric}))
# Test-set accuracy of the tuned model.
hits = prediction_new.filter(prediction_new.label == prediction_new.prediction).count()
print('accuracy:', hits / prediction_new.count())
>>areaUnderROC: 0.9855124521072797
>>areaUnderPR: 0.9877182758155769
>>accuracy: 0.9468085106382979

可以发现评估指标均有提高。

再尝试下随机森林

# Random forest with a fixed seed, cross-validated over a 3x4x3 grid.
rf = RandomForestClassifier(numTrees=3, maxDepth=10, maxBins=30,
                            labelCol="label", seed=123)
grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [1, 3, 5])
    .addGrid(rf.maxDepth, [3, 5, 7, 10])
    .addGrid(rf.maxBins, [20, 30, 40])
    .build()
)
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='probability', metricName='areaUnderROC')
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
)
cvModel_rf = cv.fit(trainingData)

# ROC AUC of the best CV model on the test split.
predictions = cvModel_rf.transform(testData)
evaluator.evaluate(predictions)

输出最优参数

# Pair each RF parameter combination with its mean cross-validation metric.
# A ParamMap is dict-like (Param -> value), so iterate it with .items()
# instead of re-zipping .keys() with .values().
results = [
    ([{param.name: value} for param, value in params.items()], metric)
    for params, metric in zip(cvModel_rf.getEstimatorParamMaps(), cvModel_rf.avgMetrics)
]

# Best combination = highest average areaUnderROC; max() is the O(n) idiom
# versus sorting the whole list just to take the first element.
max(results, key=lambda el: el[1])
>>([{'numTrees': 5}, {'maxDepth': 3}, {'maxBins': 40}], 0.9851099948526418)

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值