This walkthrough uses a logistic regression model to illustrate the complete process of building a classification model.

Dataset download: http://www.kaggle.com/c/stumbleupon

The dataset poses a binary classification problem: will a page recommended on the web be ephemeral, or can it stay popular for a long time? A target value of 1 means evergreen (long-lasting) and 0 means ephemeral.

First, delete the header row of the data and redirect the result into a file named train_noheader.tsv:
```
sed 1d train.tsv > train_noheader.tsv
```
Start spark-shell:
```
spark-shell --driver-memory 4g
```
Read the training data into an RDD and inspect it:
```scala
val rawData = sc.textFile("train_noheader.tsv")
val records = rawData.map(line => line.split("\t"))
records.first
```
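Per the dataset description, column 0 holds the page URL, column 1 the page ID, column 2 the raw boilerplate text, and column 3 the category; columns 4 through the second-to-last are numeric features, and the last column is the label. This layout is what the slicing in the next step relies on. A quick sketch to eyeball the fields (the index-printing loop is just for illustration):

```scala
// print each field of the first record together with its column index
records.first.zipWithIndex.foreach { case (field, idx) =>
  println(s"$idx: $field")
}
```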
Data processing
```scala
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

val data = records.map { r =>
  // strip the extra quote (") characters
  val trimmed = r.map(_.replaceAll("\"", ""))
  // convert the label (last column) to an integer
  val label = trimmed(r.size - 1).toInt
  // replace the "?" entries, which mark missing values, with 0.0
  val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
  // store the label and the dense feature vector in a LabeledPoint
  LabeledPoint(label, Vectors.dense(features))
}
```
Cache the data and count the number of samples:
```scala
data.cache
val numData = data.count
// numData: Long = 7395 (implied by the correct-count and accuracy figures below)
```
Train a logistic regression classification model:
```scala
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD

val numIterations = 10 // not defined in the original notes; 10 is an assumed, typical value
val lrModel = LogisticRegressionWithSGD.train(data, numIterations)
```
Use the classification model:
```scala
val dataPoint = data.first
val prediction = lrModel.predict(dataPoint.features)
// prediction: Double = 1.0 -- predicted evergreen
val trueLabel = dataPoint.label
// trueLabel: Double = 0.0 -- actually ephemeral
```
Evaluate model performance

Prediction accuracy (the number of correctly classified training samples divided by the total number of samples):
```scala
val lrTotalCorrect = data.map { point =>
  if (lrModel.predict(point.features) == point.label) 1 else 0
}.sum
// lrTotalCorrect: Double = 3806.0
val lrAccuracy = lrTotalCorrect / numData
// lrAccuracy: Double = 0.5146720757268425
// about 51.5% accuracy -- not good; barely better than random guessing
```
Model evaluation metrics: area under the precision-recall (PR) curve and area under the ROC curve.
```scala
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

// compute the metrics for the logistic regression and SVM models
val metrics = Seq(lrModel, svmModel).map { model =>
  val scoreAndLabels = data.map { point =>
    (model.predict(point.features), point.label)
  }
  val metrics = new BinaryClassificationMetrics(scoreAndLabels)
  (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
}
```
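Note that svmModel is referenced above but never trained in these notes; a minimal sketch of how it could be produced, assuming the same data and numIterations:

```scala
import org.apache.spark.mllib.classification.SVMWithSGD

// an assumed SVM counterpart, trained the same way as the logistic regression model
val svmModel = SVMWithSGD.train(data, numIterations)
```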
For the logistic regression model, the area under PR is about 75% and the area under ROC is about 50% -- no better than random.

Improving the model and tuning parameters

Summary statistics:
```scala
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val vectors = data.map(lp => lp.features)
val matrix = new RowMatrix(vectors)
// compute summary statistics for each column of the feature matrix
val matrixSummary = matrix.computeColumnSummaryStatistics()
println(matrixSummary.mean)
println(matrixSummary.min)
println(matrixSummary.max)
println(matrixSummary.variance)
println(matrixSummary.numNonzeros)
```
Feature standardization:
```scala
import org.apache.spark.mllib.feature.StandardScaler

// withMean and withStd are both set to true
val scaler = new StandardScaler(withMean = true, withStd = true).fit(vectors)
// the standardized data
val scaledData = data.map(lp => LabeledPoint(lp.label, scaler.transform(lp.features)))
println(scaledData.first.features)
```
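For intuition, what StandardScaler computes per column is (value - column mean) / column standard deviation. A sketch of the equivalent manual transformation, reusing the matrixSummary statistics computed above (the names means, stdevs, and scaledManually are illustrative):

```scala
// manual per-column standardization, equivalent in spirit to StandardScaler
val means = matrixSummary.mean.toArray
val stdevs = matrixSummary.variance.toArray.map(math.sqrt)
val scaledManually = data.map { lp =>
  val scaled = lp.features.toArray.zipWithIndex.map { case (v, j) =>
    if (stdevs(j) == 0.0) 0.0 else (v - means(j)) / stdevs(j) // guard against zero-variance columns
  }
  LabeledPoint(lp.label, Vectors.dense(scaled))
}
```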
Retrain the model:
```scala
val lrModelScaled = LogisticRegressionWithSGD.train(scaledData, numIterations)
val lrTotalCorrectScaled = scaledData.map { point =>
  if (lrModelScaled.predict(point.features) == point.label) 1 else 0
}.sum
val lrAccuracyScaled = lrTotalCorrectScaled / numData
// lrAccuracyScaled: Double = 0.6204192021636241
val lrPredictionsVsTrue = scaledData.map { point =>
  (lrModelScaled.predict(point.features), point.label)
}
val lrMetricsScaled = new BinaryClassificationMetrics(lrPredictionsVsTrue)
val lrPr = lrMetricsScaled.areaUnderPR
val lrRoc = lrMetricsScaled.areaUnderROC
println(f"${lrModelScaled.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaled * 100}%2.4f%%\nArea under PR: ${lrPr * 100.0}%2.4f%%\nArea under ROC: ${lrRoc * 100.0}%2.4f%%")
/*
LogisticRegressionModel
Accuracy: 62.0419%
Area under PR: 72.7254%
Area under ROC: 61.9663%
*/
// simply standardizing the features already improves accuracy
```
Consider additional features: the category and boilerplate columns have not been used so far.

Add the category column by building an index for each category and applying 1-of-k encoding:
```scala
val categories = records.map(r => r(3)).distinct.collect.zipWithIndex.toMap
// categories: scala.collection.immutable.Map[String,Int] = Map("weather" -> 0, "sports" -> 6,
// "unknown" -> 4, "computer_internet" -> 12, "?" -> 11, "culture_politics" -> 3, "religion" -> 8,
// "recreation" -> 2, "arts_entertainment" -> 9, "health" -> 5, "law_crime" -> 10, "gaming" -> 13,
// "business" -> 1, "science_technology" -> 7)
val numCategories = categories.size
// numCategories: Int = 14
val dataCategories = records.map { r =>
  val trimmed = r.map(_.replaceAll("\"", ""))
  val label = trimmed(r.size - 1).toInt
  val categoryIdx = categories(r(3))
  val categoryFeatures = Array.ofDim[Double](numCategories)
  categoryFeatures(categoryIdx) = 1.0
  val otherFeatures = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
  val features = categoryFeatures ++ otherFeatures
  LabeledPoint(label, Vectors.dense(features))
}
println(dataCategories.first)
```
```scala
// LabeledPoint(0.0, [0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,
// 0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,
// 0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])
```
Standardize again:
```scala
val scalerCats = new StandardScaler(withMean = true, withStd = true).fit(dataCategories.map(lp => lp.features))
val scaledDataCats = dataCategories.map(lp => LabeledPoint(lp.label, scalerCats.transform(lp.features)))
println(scaledDataCats.first.features)
/*
[-0.023261105535492967,2.720728254208072,-0.4464200056407091,-0.2205258360869135,-0.028492999745483565,
-0.2709979963915644,-0.23272692307249684,-0.20165301179556835,-0.09914890962355712,-0.381812077600508,
-0.06487656833429316,-0.6807513271391559,-0.2041811690290381,-0.10189368073492189,1.1376439023494747,
-0.08193556218743517,1.0251347662842047,-0.0558631837375738,-0.4688883677664047,-0.35430044806743044,
-0.3175351615705111,0.3384496941616097,0.0,0.8288021759842215,-0.14726792180045598,0.22963544844991393,
-0.14162589530918376,0.7902364255801262,0.7171932152231301,-0.29799680188379124,-0.20346153667348232,
-0.03296720969318916,-0.0487811294839849,0.9400696843533806,-0.10869789547344721,-0.2788172632659348]
*/
```
Train the model again:
```scala
val lrModelScaledCats = LogisticRegressionWithSGD.train(scaledDataCats, numIterations)
val lrTotalCorrectScaledCats = scaledDataCats.map { point =>
  if (lrModelScaledCats.predict(point.features) == point.label) 1 else 0
}.sum
val lrAccuracyScaledCats = lrTotalCorrectScaledCats / numData
val lrPredictionsVsTrueCats = scaledDataCats.map { point =>
  (lrModelScaledCats.predict(point.features), point.label)
}
val lrMetricsScaledCats = new BinaryClassificationMetrics(lrPredictionsVsTrueCats)
val lrPrCats = lrMetricsScaledCats.areaUnderPR
val lrRocCats = lrMetricsScaledCats.areaUnderROC
println(f"${lrModelScaledCats.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaledCats * 100}%2.4f%%\nArea under PR: ${lrPrCats * 100.0}%2.4f%%\nArea under ROC: ${lrRocCats * 100.0}%2.4f%%")
/*
LogisticRegressionModel
Accuracy: 66.5720%
Area under PR: 75.7964%
Area under ROC: 66.5483%
*/
// accuracy has improved further
```
Model parameter tuning:
```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.optimization.Updater
import org.apache.spark.mllib.optimization.SimpleUpdater
import org.apache.spark.mllib.optimization.L1Updater
import org.apache.spark.mllib.optimization.SquaredL2Updater
import org.apache.spark.mllib.classification.ClassificationModel

// helper function: train a model on the input data with the given parameters
def trainWithParams(input: RDD[LabeledPoint], regParam: Double, numIterations: Int, updater: Updater, stepSize: Double) = {
  val lr = new LogisticRegressionWithSGD
  lr.optimizer.setNumIterations(numIterations).setUpdater(updater).setRegParam(regParam).setStepSize(stepSize)
  lr.run(input)
}

// helper function: given input data and a classification model, compute the AUC
def createMetrics(label: String, data: RDD[LabeledPoint], model: ClassificationModel) = {
  val scoreAndLabels = data.map { point =>
    (model.predict(point.features), point.label)
  }
  val metrics = new BinaryClassificationMetrics(scoreAndLabels)
  (label, metrics.areaUnderROC)
}
```
Cache the data:
```scala
scaledDataCats.cache
```
Try different numbers of iterations:
```scala
val iterResults = Seq(1, 5, 10, 50).map { param =>
  val model = trainWithParams(scaledDataCats, 0.0, param, new SimpleUpdater, 1.0)
  createMetrics(s"$param iterations", scaledDataCats, model)
}
iterResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
/*
1 iterations, AUC = 64.97%
5 iterations, AUC = 66.62%
10 iterations, AUC = 66.55%
50 iterations, AUC = 66.81%
*/
// beyond a certain number of iterations, extra iterations make little difference
```
Try different step sizes:
```scala
val stepResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
  val model = trainWithParams(scaledDataCats, 0.0, numIterations, new SimpleUpdater, param)
  createMetrics(s"$param step size", scaledDataCats, model)
}
stepResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
/*
0.001 step size, AUC = 64.95%
0.01 step size, AUC = 65.00%
0.1 step size, AUC = 65.52%
1.0 step size, AUC = 66.55%
10.0 step size, AUC = 61.92%
*/
// too large a step size actually hurts accuracy
```
Regularization: try different regularization parameters:
```scala
val regResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
  val model = trainWithParams(scaledDataCats, param, numIterations, new SquaredL2Updater, 1.0)
  createMetrics(s"$param L2 regularization parameter", scaledDataCats, model)
}
regResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
/*
0.001 L2 regularization parameter, AUC = 66.55%
0.01 L2 regularization parameter, AUC = 66.55%
0.1 L2 regularization parameter, AUC = 66.63%
1.0 L2 regularization parameter, AUC = 66.04%
10.0 L2 regularization parameter, AUC = 35.33%
*/
// with L2 regularization, too large a parameter severely degrades AUC
```
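The L1Updater imported earlier is never exercised in these notes; a sketch of the analogous experiment with L1 (lasso) regularization, reusing the same helper functions:

```scala
// same parameter sweep as above, but with L1 regularization via L1Updater
val regResultsL1 = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
  val model = trainWithParams(scaledDataCats, param, numIterations, new L1Updater, 1.0)
  createMetrics(s"$param L1 regularization parameter", scaledDataCats, model)
}
regResultsL1.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }
```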
Cross-validation:
```scala
// 60/40 train/test split with a fixed random seed
val trainTestSplit = scaledDataCats.randomSplit(Array(0.6, 0.4), 123)
val train = trainTestSplit(0)
val test = trainTestSplit(1)
```
Tune the regularization parameter, evaluating on the test set:
```scala
val regResultsTest = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
  val model = trainWithParams(train, param, numIterations, new SquaredL2Updater, 1.0)
  createMetrics(s"$param L2 regularization parameter", test, model)
}
regResultsTest.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.6f%%") }
/*
0.0 L2 regularization parameter, AUC = 66.480874%
0.001 L2 regularization parameter, AUC = 66.480874%
0.0025 L2 regularization parameter, AUC = 66.515027%
0.005 L2 regularization parameter, AUC = 66.515027%
0.01 L2 regularization parameter, AUC = 66.549180%
*/
```
Then compute the same metrics on the training set:
```scala
val regResultsTrain = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
  val model = trainWithParams(train, param, numIterations, new SquaredL2Updater, 1.0)
  createMetrics(s"$param L2 regularization parameter", train, model)
}
regResultsTrain.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.6f%%") }
/*
0.0 L2 regularization parameter, AUC = 66.260311%
0.001 L2 regularization parameter, AUC = 66.260311%
0.0025 L2 regularization parameter, AUC = 66.260311%
0.005 L2 regularization parameter, AUC = 66.238294%
0.01 L2 regularization parameter, AUC = 66.238294%
*/
```
A smaller regularization parameter performs better on the training set but is prone to overfitting. In cross-validation, one generally picks the parameter that performs best on the held-out test set and then uses it to make predictions on new data, as sketched below.
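A minimal sketch of that last step, assuming 0.01 (the best test-set AUC above) as the chosen parameter; the names bestRegParam, finalModel, and newPredictions are illustrative:

```scala
// retrain with the regularization parameter chosen on the held-out test set
val bestRegParam = 0.01
val finalModel = trainWithParams(train, bestRegParam, numIterations, new SquaredL2Updater, 1.0)
// apply the final model to previously unseen feature vectors (here, the held-out test points)
val newPredictions = test.map(point => finalModel.predict(point.features))
```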