《深度实践Spark机器学习》 Chapter 8: Building Spark ML Classification Models

8.3 Data Exploration
spark-shell --num-executors 1 --total-executor-cores 3 --executor-memory 512m

val Path = "hdfs://uti04.utidata.com:8020/u01/bigdata/"
val raw_data = sc.textFile(Path + "data/train_noheader.tsv")
// Peek at the first two raw lines
raw_data.take(2)

val records = raw_data.map(line => line.split('\t'))

records.first

records.count

records.first.size

records.first.take(2)
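As a further exploratory check, and reusing the column layout assumed in the preprocessing below (features start at column 4, the last column is the label), one can count how many rows contain the "?" placeholder for missing values. A minimal sketch:

// Count rows with at least one "?" in the feature columns (quotes stripped first)
val numMissing = records.filter(r => r.slice(4, r.size - 1).map(_.replaceAll("\"", "")).contains("?")).count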



8.4 Data Preprocessing
import org.apache.spark.ml.feature.{LabeledPoint, IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.linalg.Vectors

// Note: when pasting this whole block into spark-shell, strip leading spaces or tabs from every line
val data = records.map{r =>
val trimmed = r.map(_.replaceAll("\"", ""))  // strip the quotation marks around each field
val label = trimmed(r.size - 1).toInt  // the last column is the label
val features = trimmed.slice(4, r.size - 1).map(d => if(d=="?") 0.0 else d.toDouble)  // "?" marks a missing value, replaced by 0.0
LabeledPoint(label, Vectors.dense(features))
}

// Note: when pasting this whole block into spark-shell, strip leading spaces or tabs from every line
val nbData = records.map{r =>
val trimmed = r.map(_.replaceAll("\"", ""))
val label = trimmed(r.size - 1).toInt
val features = trimmed.slice(4, r.size - 1).map(d => if(d=="?") 0.0 else d.toDouble).map(d => if(d<0) 0.0 else d)  // clamp negatives to 0.0 for naive Bayes
LabeledPoint(label, Vectors.dense(features))
}

data.take(2)
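Before going further, it can help to check the class balance. A small sketch; countByValue brings only the per-label counts back to the driver:

// Distribution of the two classes across the full dataset
data.map(_.label).countByValue()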

6) Create DataFrames from the RDDs
val df = spark.createDataFrame(data)

val nbDF = spark.createDataFrame(nbData)
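Equivalently, with the session implicits in scope, the RDD of LabeledPoint converts directly; df2 here is just an illustrative name for the alternative:

import spark.implicits._

// Produces the same (label, features) columns as createDataFrame
val df2 = data.toDF()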

7) View the first ten rows of df
scala> df.show(10)
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[0.789131,2.05555...|
|  1.0|[0.574147,3.67796...|
|  1.0|[0.996526,2.38288...|
|  1.0|[0.801248,1.54310...|
|  0.0|[0.719157,2.67647...|
|  0.0|[0.0,119.0,0.7454...|
|  1.0|[0.22111,0.773809...|
|  0.0|[0.0,1.883333333,...|
|  1.0|[0.0,0.471502591,...|
|  1.0|[0.0,2.41011236,0...|
+-----+--------------------+


// View the first row of nbDF
nbDF.head
nbDF.first

8) View the schema of df
scala> df.printSchema
root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)


9) Split the data
val Array(trainData, testData) = df.randomSplit(Array(0.8, 0.2), seed = 1234L)

val Array(nbTrainData, nbTestData) = nbDF.randomSplit(Array(0.8, 0.2), seed = 1234L)


10) Count the rows in the training and test sets
trainData.count
testData.count

11) Cache the datasets
trainData.cache
testData.cache
nbTrainData.cache
nbTestData.cache
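Note that cache is lazy: nothing is stored until the first action touches the data, for example:

// Trigger materialization of the cached training sets
trainData.count
nbTrainData.count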

12) Import the classifier classes
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}

13) Create the naive Bayes model and set its initial parameters
val nb = new NaiveBayes().setLabelCol("label").setFeaturesCol("features")
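NaiveBayes defaults to the multinomial variant with smoothing 1.0, which is also why the negative feature values were clamped to 0.0 in nbData: multinomial naive Bayes requires nonnegative features. The defaults can be made explicit if desired:

// Explicit defaults: additive (Laplace) smoothing and the multinomial model type
val nb = new NaiveBayes().setLabelCol("label").setFeaturesCol("features").setSmoothing(1.0).setModelType("multinomial")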

14) Train the naive Bayes model and use it to predict on the test data
// Train
val nbModel = nb.fit(nbTrainData)

// Predict
val nbPrediction = nbModel.transform(nbTestData)

nbPrediction.show(10)

15) Accuracy statistics
// t1 holds the predictions, t2 the test-set labels, t3 the number of test rows
val (t1, t2, t3) = (nbPrediction.select("prediction").collect, nbTestData.select("label").collect, nbTestData.count.toInt)

// t4 accumulates the number of correct predictions
var t4 = 0

// Loop over the rows, counting correct predictions
for(i <- 0 until t3){if(t1(i)==t2(i)) t4+=1}

// Number of correct predictions
t4

// Compute the accuracy
val nbAccuracy = 1.0 * t4 / t3
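The same accuracy can be computed without collecting rows to the driver by using the built-in evaluator; nbAccuracy2 is an illustrative name:

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// "accuracy" is the fraction of rows where prediction equals label
val accEvaluator = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("accuracy")
val nbAccuracy2 = accEvaluator.evaluate(nbPrediction)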


8.5 Assembling the Pipelines
1) Import the feature indexing classes
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorIndexerModel}

2) Build the feature indexer
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df)
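VectorIndexer treats every feature with at most maxCategories distinct values (20 by default) as categorical and leaves the rest continuous; the fitted model records its choices, which can be checked directly:

// Indices of the features that VectorIndexer decided to treat as categorical
featureIndexer.categoryMaps.keys.toSeq.sorted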
 
3) Build the logistic regression model
val lr = new LogisticRegression().setLabelCol("label").setFeaturesCol("indexedFeatures").setMaxIter(10).setRegParam(0.001)

4) Create the decision tree model
// setMaxBins: the maximum number of bins used to discretize continuous features
val dt = new DecisionTreeClassifier().setLabelCol("label").setFeaturesCol("indexedFeatures").setImpurity("entropy").setMaxBins(100).setMaxDepth(5).setMinInfoGain(0.01)

5) Import the parameter grid builder and cross-validation classes
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

6) Import the pipeline classes
import org.apache.spark.ml.{Pipeline, PipelineModel}

7) Import the evaluator
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

8) Configure two pipelines: a logistic regression pipeline with two stages (featureIndexer and lr), and a decision tree pipeline with two stages (featureIndexer and dt)
val lrPipeline = new Pipeline().setStages(Array(featureIndexer, lr))

val dtPipeline = new Pipeline().setStages(Array(featureIndexer, dt))

8.6 Model Tuning
1) Configure a parameter grid for each model with ParamGridBuilder; as before, strip leading whitespace when pasting
val lrParamGrid = new ParamGridBuilder().addGrid(lr.regParam, Array(0.1, 0.3, 0.5)).addGrid(lr.maxIter, Array(10, 20, 30)).build()

val dtParamGrid = new ParamGridBuilder().addGrid(dt.maxDepth, Array(3, 5, 7)).build()
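The logistic regression grid crosses 3 regParam values with 3 maxIter values, so cross-validation will evaluate 9 candidate parameter combinations (3 for the decision tree). This can be confirmed directly:

lrParamGrid.length  // 9 parameter combinations
dtParamGrid.length  // 3 parameter combinations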

2) Instantiate the cross-validators
val evaluator = new BinaryClassificationEvaluator

val lrCV = new CrossValidator().setEstimator(lrPipeline).setEvaluator(evaluator).setEstimatorParamMaps(lrParamGrid).setNumFolds(2)

val dtCV = new CrossValidator().setEstimator(dtPipeline).setEvaluator(evaluator).setEstimatorParamMaps(dtParamGrid).setNumFolds(2)

3) Fit the cross-validators to obtain the best parameter sets, then test the models
val lrCvModel = lrCV.fit(trainData)

val dtCvModel = dtCV.fit(trainData)

val lrPrediction = lrCvModel.transform(testData)

val dtPrediction = dtCvModel.transform(testData)
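The mean cross-validation metric for each parameter combination, and the test-set metric of the tuned models, are both available; areaUnderROC is the BinaryClassificationEvaluator default:

// Mean cross-validation metric per parameter combination
lrCvModel.avgMetrics
dtCvModel.avgMetrics

// Test-set AUC of the two tuned models
val lrAUC = evaluator.evaluate(lrPrediction)
val dtAUC = evaluator.evaluate(dtPrediction)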



4) Inspect the predictions
lrPrediction.select("label", "prediction").show(10)

dtPrediction.select("label", "prediction").show(10)

5) Inspect the parameters of the best logistic regression model
val lrBestModel = lrCvModel.bestModel.asInstanceOf[PipelineModel]

val lrModel = lrBestModel.stages(1).asInstanceOf[LogisticRegressionModel]

lrModel.getRegParam

lrModel.getMaxIter

6) Inspect the parameters of the best decision tree model
val dtBestModel = dtCvModel.bestModel.asInstanceOf[PipelineModel]

val dtModel = dtBestModel.stages(1).asInstanceOf[DecisionTreeClassificationModel]

dtModel.getMaxDepth

dtModel.numFeatures
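The learned tree itself can also be printed for inspection:

// Full if/else structure of the best decision tree
println(dtModel.toDebugString)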

7) Compute the prediction accuracy of logistic regression and the decision tree
// t_lr holds the logistic regression predictions
// t_dt holds the decision tree predictions
// t_label holds the test-set labels
val (t_lr, t_dt, t_label, t_count) = (lrPrediction.select("prediction").collect, dtPrediction.select("prediction").collect, testData.select("label").collect, testData.count.toInt)

// c_lr accumulates the number of correct logistic regression predictions
// c_dt accumulates the number of correct decision tree predictions
var Array(c_lr, c_dt) = Array(0, 0)

// Loop over the rows, counting correct logistic regression predictions
for(i <- 0 until t_count){if(t_lr(i)==t_label(i)) c_lr+=1}
c_lr

// Accuracy of logistic regression
1.0*c_lr/t_count

// Loop over the rows, counting correct decision tree predictions
for(i <- 0 until t_count){if(t_dt(i)==t_label(i)) c_dt+=1}
c_dt

// Accuracy of the decision tree
1.0*c_dt/t_count
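The same two accuracies follow from a DataFrame filter, which avoids the collect calls and the row-order assumption of the loops above; lrAcc and dtAcc are illustrative names:

// Count matching label/prediction pairs on the executors
val lrAcc = lrPrediction.filter("label = prediction").count.toDouble / t_count
val dtAcc = dtPrediction.filter("label = prediction").count.toDouble / t_count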