本文将UCI中breast_cancer数据集转为spark_df进行演示,讲解了利用pyspark进行建模及调参的流程和方法。
from sklearn import datasets
import pandas as pd
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import pyspark.ml.tuning as tune
from pyspark.ml import Transformer,Pipeline
from pyspark.ml.classification import LogisticRegression,RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
# Load the UCI breast-cancer dataset, wrap it in a pandas DataFrame
# (30 feature columns f1..f30 plus a binary 'label'), then convert it
# to a Spark DataFrame via the active `spark` session.
cancer = datasets.load_breast_cancer()
features, target = cancer.data, cancer.target
columns = [f'f{i}' for i in range(1, 31)]
feature_frame = pd.DataFrame(features, columns=columns)
label_frame = pd.DataFrame(target, columns=['label'])
df = pd.concat([feature_frame, label_frame], axis=1)
spark_df = spark.createDataFrame(df.values.tolist(), df.columns.tolist())
预处理及划分数据集
# Assemble the 30 numeric feature columns into one 'features' vector
# column, then split 70/30 into train/test with a fixed seed so the
# split is reproducible.
vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
dataset = Pipeline(stages=[vecAssembler]).fit(spark_df).transform(spark_df)
trainingData, testData = dataset.randomSplit([0.7, 0.3], 123)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")
>>Training Dataset Count: 381
>>Test Dataset Count: 188
逻辑回归默认参数训练及预测
# Train a logistic regression with hand-picked starter hyperparameters.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

# Score the held-out test set.
prediction = lrModel.transform(testData)

# Evaluate ROC-AUC and PR-AUC using the probability column as the
# ranking score; the metric is overridden per call via the param map.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability')
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(metric + ':', evaluator.evaluate(prediction, {evaluator.metricName: metric}))

# Accuracy = fraction of test rows whose predicted label matches the truth.
hits = prediction.filter(prediction.label == prediction.prediction).count()
print('accuracy:', hits / prediction.count())
>>areaUnderROC: 0.9819204980842913
>>areaUnderPR: 0.9856307317265275
>>accuracy: 0.9308510638297872
逻辑回归模型调参
# Hyperparameter grid: regularization strength x elastic-net mixing
# ratio (0.0 = pure L2 / ridge).
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.1, 0.3, 0.5])
        .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])
        .build())

# Model selection criterion: ROC-AUC on the probability column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability',
                                          metricName='areaUnderROC')

# 3-fold cross validation over the full grid (9 candidate models).
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=3)
cvModel = cv.fit(trainingData)
输出最优参数
# Pair each hyperparameter combination with its mean cross-validation
# metric, then report the best one.
# Fixes: iterate param maps with .items() instead of the redundant
# zip(keys(), values()); use max() (O(n)) instead of sorting; print the
# winner explicitly so the result is visible outside a REPL, where the
# original bare `sorted(...)[0]` expression was silently discarded.
results = [
    ([{param.name: value} for param, value in params.items()], metric)
    for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
]
best = max(results, key=lambda el: el[1])
print(best)
>>([{'regParam': 0.1}, {'elasticNetParam': 0.0}], 0.9953622902152315)
修改为最优参数进行预测
# Refit a logistic regression using the best hyperparameters found by
# the grid search (regParam=0.1, elasticNetParam=0).
lr_new = LogisticRegression(maxIter=20, regParam=0.1, elasticNetParam=0)
lrModel_new = lr_new.fit(trainingData)

# Score the test set with the retuned model.
prediction_new = lrModel_new.transform(testData)
for metric in ('areaUnderROC', 'areaUnderPR'):
    print(metric + ':', evaluator.evaluate(prediction_new, {evaluator.metricName: metric}))

# Fraction of test rows classified correctly.
matched = prediction_new.filter(prediction_new.label == prediction_new.prediction).count()
print('accuracy:', matched / prediction_new.count())
>>areaUnderROC: 0.9855124521072797
>>areaUnderPR: 0.9877182758155769
>>accuracy: 0.9468085106382979
可以发现评估指标均有提高。
再尝试下随机森林
# Random forest baseline, with a grid over tree count, tree depth and
# bin count (36 candidate models), selected by 3-fold CV on ROC-AUC.
rf = RandomForestClassifier(numTrees=3, maxDepth=10, maxBins=30,
                            labelCol="label", seed=123)
grid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [1, 3, 5])
        .addGrid(rf.maxDepth, [3, 5, 7, 10])
        .addGrid(rf.maxBins, [20, 30, 40])
        .build())
evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability',
                                          metricName='areaUnderROC')
cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=3)
cvModel_rf = cv.fit(trainingData)

# ROC-AUC of the best forest on the held-out test set.
predictions = cvModel_rf.transform(testData)
evaluator.evaluate(predictions)
输出最优参数
# Pair each random-forest hyperparameter combination with its mean
# cross-validation metric, then report the best one.
# Fixes: iterate param maps with .items() instead of the redundant
# zip(keys(), values()); use max() (O(n)) instead of sorting; print the
# winner explicitly so the result is visible outside a REPL, where the
# original bare `sorted(...)[0]` expression was silently discarded.
results = [
    ([{param.name: value} for param, value in params.items()], metric)
    for params, metric in zip(cvModel_rf.getEstimatorParamMaps(), cvModel_rf.avgMetrics)
]
best = max(results, key=lambda el: el[1])
print(best)
>>([{'numTrees': 5}, {'maxDepth': 3}, {'maxBins': 40}], 0.9851099948526418)