import org.apache.spark.mllib.classification.{ClassificationModel, LogisticRegressionWithSGD, NaiveBayes, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.optimization.{SquaredL2Updater, Updater}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Spark MLlib classification demo (RDD-based API), following the book
 * "Machine Learning with Spark": trains logistic regression, SVM, naive
 * Bayes and decision-tree models on a TSV dataset from HDFS, measures
 * training accuracy and AUC, standardises features, and sweeps a few
 * hyper-parameters (tree depth, impurity, NB lambda, L2 regularisation).
 *
 * Created by zgr on 2017/3/14.
 */
object ClassificationDemo {

  /** Driver entry point: connects to the cluster and runs the whole demo. */
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("classification").setMaster("spark://10.149.252.106:7077")
    val sc = new SparkContext(sparkConf)

    // Read the raw training data from HDFS; each line is a tab-separated record.
    val rawData = sc.textFile("hdfs://10.149.252.106:9000/input/train_classification.tsv")
    val records = rawData.map(_.split("\t"))

    // Parse each record into a LabeledPoint: strip surrounding quotes, take the
    // last column as the 0/1 label and columns 4 .. n-2 as numeric features.
    // A "?" marks a missing value and is mapped to 0.0.
    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }
    val numdata = data.count()
    print(numdata+"=================================================")

    // Naive Bayes requires non-negative feature values, so additionally clamp
    // any negative feature to 0.0 in this second parsed dataset.
    val nbData = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1)
        .map(d => if (d == "?") 0.0 else d.toDouble)
        .map(d => if (d < 0) 0.0 else d)
      LabeledPoint(label, Vectors.dense(features))
    }

    // Iteration count for logistic regression / SVM; maximum depth for the tree.
    val numIterations = 10
    val maxTreeDepth = 5

    // Train one model of each family on the (unscaled) features.
    val logisticModel = LogisticRegressionWithSGD.train(data, numIterations)
    println("======logistic==="+logisticModel)
    val svmModel = SVMWithSGD.train(data, numIterations)
    println(svmModel)
    val nbModel = NaiveBayes.train(nbData)
    println(nbModel)
    val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)
    println(dtModel)

    // Training accuracy = fraction of points whose prediction matches the label.
    val lrTotalCorrect = data.map { point =>
      if (logisticModel.predict(point.features) == point.label) 1 else 0
    }.sum()
    val lrAccuracy = lrTotalCorrect / numdata
    println("**********************lrAccuracy="+lrAccuracy+"========================================")

    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum
    val nbTotalCorrect = nbData.map { point =>
      if (nbModel.predict(point.features) == point.label) 1 else 0
    }.sum
    // The decision tree emits a raw score, so threshold it at 0.5 first.
    val dtTotalCorrect = data.map { point =>
      val score = dtModel.predict(point.features)
      val predicted = if (score > 0.5) 1 else 0
      if (predicted == point.label) 1 else 0
    }.sum
    val svmAccuracy = svmTotalCorrect / numdata // ~0.5147 on the book's dataset
    val nbAccuracy = nbTotalCorrect / numdata   // ~0.5804
    val dtAccuracy = dtTotalCorrect / numdata   // ~0.6483
    println("**********************svmAccuracy="+svmAccuracy+"========================================")
    println("**********************nbAccuracy="+nbAccuracy+"========================================")
    println("**********************dtAccuracy="+dtAccuracy+"========================================")

    // --- Improving performance: feature standardisation ---------------------
    // Represent the feature vectors as a distributed RowMatrix so per-column
    // summary statistics (min, max, mean, variance, ...) can be computed.
    val vectors = data.map(lp => lp.features)
    val matrix = new RowMatrix(vectors)
    val matrixSummary = matrix.computeColumnSummaryStatistics()

    // Standardise each feature to zero mean and unit standard deviation:
    // subtract the column mean, then divide by the column standard deviation
    // (the standard deviation vector is the element-wise square root of the
    // variance vector). StandardScaler's two flags control exactly that:
    // whether to subtract the mean and whether to scale by the std deviation.
    val scaler = new StandardScaler(withMean = true, withStd = true).fit(vectors)
    val scaledData = data.map(lp => LabeledPoint(lp.label, scaler.transform(lp.features)))

    // Retrain logistic regression on the standardised data (decision trees and
    // naive Bayes are not affected by feature standardisation).
    val lrModelScaled = LogisticRegressionWithSGD.train(scaledData, numIterations)
    val lrTotalCorrectScaled = scaledData.map { point =>
      if (lrModelScaled.predict(point.features) == point.label) 1 else 0
    }.sum
    val lrAccuracyScaled = lrTotalCorrectScaled / numdata
    println("**********************标准话前:="+lrAccuracy+"========================================")
    println("=====================标准化后logistic:"+lrAccuracyScaled)
    val lrPredictionsVsTrue = scaledData.map { point =>
      (lrModelScaled.predict(point.features), point.label)
    }
    val lrMetricsScaled = new BinaryClassificationMetrics(lrPredictionsVsTrue)
    val lrPr = lrMetricsScaled.areaUnderPR
    val lrRoc = lrMetricsScaled.areaUnderROC
    println(f"${lrModelScaled.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaled * 100}%2.4f%%\nArea under PR: ${lrPr * 100.0}%2.4f%%\nArea under ROC: ${lrRoc * 100.0}%2.4f%%")

    // --- Categorical feature (column 3): one-hot encoding, kept for reference.
    // Build a category -> index map, then a 14-dimensional indicator vector
    // (1.0 at the sample's category index, 0.0 elsewhere) prepended to the
    // numeric features.
    // val categories = records.map(r => r(3)).distinct.collect.zipWithIndex.toMap
    // val numCategories = categories.size // 14 distinct categories
    // val dataCategories = records.map { r =>
    //   val trimmed = r.map(_.replaceAll("\"", ""))
    //   val label = trimmed(r.size - 1).toInt
    //   val categoryIdx = categories(r(3))
    //   val categoryFeatures = Array.ofDim[Double](numCategories)
    //   categoryFeatures(categoryIdx) = 1.0
    //   val otherFeatures = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
    //   val features = categoryFeatures ++ otherFeatures
    //   LabeledPoint(label, Vectors.dense(features))
    // }
    // println("===========================dataCategories.first:"+dataCategories.first)

    // --- Hyper-parameter tuning --------------------------------------------
    // Decision tree: effect of maximum depth on AUC (entropy impurity).
    val dtResultsEntropy = Seq(1, 2, 3, 4, 5, 10, 20).map { param =>
      val model = trainDTWithParams(data, param, Entropy)
      val scoreAndLabels = data.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (s"$param tree depth", metrics.areaUnderROC)
    }
    dtResultsEntropy.foreach { case (param, auc) =>
      println(f"$param, AUC = ${auc * 100}%2.2f%%")
    }

    // Same depth sweep with Gini impurity.
    val dtResultsGini = Seq(1, 2, 3, 4, 5, 10, 20).map { param =>
      val model = trainDTWithParams(data, param, Gini)
      val scoreAndLabels = data.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (s"$param tree depth", metrics.areaUnderROC)
    }
    dtResultsGini.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

    // Naive Bayes: effect of the additive-smoothing parameter lambda.
    val nbResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainNBWithParams(nbData, param)
      val scoreAndLabels = nbData.map { point =>
        (model.predict(point.features), point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (s"$param lambda", metrics.areaUnderROC)
    }
    nbResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
    println("============================贝叶斯调参=======================================")

    // --- Train/test evaluation ---------------------------------------------
    // 60% / 40% train/test split with a fixed seed for reproducibility.
    val trainTestSplit = data.randomSplit(Array(0.6, 0.4), 123)
    val train = trainTestSplit(0)
    val test = trainTestSplit(1)

    // Evaluate logistic regression on the held-out test set under several
    // L2 regularisation strengths.
    val regResultsTest = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
      val model = trainWithParams(train, param, numIterations, new SquaredL2Updater, 1.0)
      createMetrics(s"$param L2 regularization parameter", test, model)
    }
    regResultsTest.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.6f%%") }
    println("============================测试集=========================")
    // Observed output: AUC = 50.168509% for every regularisation value above.

    // The same sweep evaluated on the training set, kept for reference:
    // val regResultsTrain = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
    //   val model = trainWithParams(train, param, numIterations, new SquaredL2Updater, 1.0)
    //   createMetrics(s"$param L2 regularization parameter", train, model)
    // }
    // regResultsTrain.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.6f%%") }
    // println("============================训练集=========================")
    // Observed output: AUC = 50.124190% for every regularisation value above.
  }

  /**
   * Trains a logistic-regression model with SGD using the given L2/L1 updater,
   * regularisation parameter, iteration count and step size.
   */
  def trainWithParams(input: RDD[LabeledPoint], regParam: Double, numIterations: Int, updater: Updater, stepSize: Double) = {
    val lr = new LogisticRegressionWithSGD
    lr.optimizer.setNumIterations(numIterations).setUpdater(updater).setRegParam(regParam).setStepSize(stepSize)
    lr.run(input)
  }

  /**
   * Scores `data` with `model` and returns the supplied description label
   * paired with the area under the ROC curve.
   */
  def createMetrics(label: String, data: RDD[LabeledPoint], model: ClassificationModel) = {
    val scoreAndLabels = data.map { point =>
      (model.predict(point.features), point.label)
    }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    (label, metrics.areaUnderROC())
  }

  /** Trains a classification decision tree with the given max depth and impurity. */
  def trainDTWithParams(input: RDD[LabeledPoint], maxDepth: Int, impurity: Impurity) =
    DecisionTree.train(input, Algo.Classification, impurity, maxDepth)

  /** Trains a naive Bayes model with the given smoothing parameter lambda. */
  def trainNBWithParams(input: RDD[LabeledPoint], lambda: Double) = {
    val nb = new NaiveBayes
    nb.setLambda(lambda)
    nb.run(input)
  }
}
// Spark classification models — based on the book "Machine Learning with Spark".
// (Trailing blog metadata from the original article: latest recommended article published 2023-08-17 09:40:31.)