8.3 数据探索
spark-shell --num-executors 1 --total-executor-cores 3 --executor-memory 512m
val Path = "hdfs://uti04.utidata.com:8020/u01/bigdata/"
val raw_data = sc.textFile(Path + "data/train_noheader.tsv")
// raw_data.take(2)
val records = raw_data.map(line => line.split('\t'))
records.first
records.count
records.first.size
records.first.take(2)
8.4 数据预处理
import org.apache.spark.ml.feature.{LabeledPoint, IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.linalg.Vectors
// 这里注意:如果要整体复制,所有行之前都要消除空格或tab(Scala 注释须用 //,不能用 #)
// Build LabeledPoint records: the last column is the label, columns 4..n-2
// are numeric features; "?" markers are mapped to 0.0.
val data = records.map { row =>
  val cleaned = row.map(_.replaceAll("\"", ""))
  val label = cleaned.last.toInt
  val features = cleaned.slice(4, cleaned.length - 1).map(v => if (v == "?") 0.0 else v.toDouble)
  LabeledPoint(label, Vectors.dense(features))
}
// 这里注意:如果要整体复制,所有行之前都要消除空格或tab(Scala 注释须用 //,不能用 #)
// Same parsing as `data`, but negative feature values are clamped to 0.0
// because NaiveBayes requires non-negative features.
val nbData = records.map { row =>
  val cleaned = row.map(_.replaceAll("\"", ""))
  val label = cleaned.last.toInt
  val features = cleaned.slice(4, cleaned.length - 1).map { v =>
    val d = if (v == "?") 0.0 else v.toDouble
    if (d < 0) 0.0 else d
  }
  LabeledPoint(label, Vectors.dense(features))
}
data.take(2)
6)通过rdd创建DataFrame
val df = spark.createDataFrame(data)
val nbDF = spark.createDataFrame(nbData)
7)
scala> df.show(10)
+-----+--------------------+
|label| features|
+-----+--------------------+
| 0.0|[0.789131,2.05555...|
| 1.0|[0.574147,3.67796...|
| 1.0|[0.996526,2.38288...|
| 1.0|[0.801248,1.54310...|
| 0.0|[0.719157,2.67647...|
| 0.0|[0.0,119.0,0.7454...|
| 1.0|[0.22111,0.773809...|
| 0.0|[0.0,1.883333333,...|
| 1.0|[0.0,0.471502591,...|
| 1.0|[0.0,2.41011236,0...|
+-----+--------------------+
// 查看nbDF的第一行数据
nbDF.head
nbDF.first
8)
scala> df.printSchema
root
|-- label: double (nullable = false)
|-- features: vector (nullable = true)
9)划分数据
val Array(trainData, testData) = df.randomSplit(Array(0.8, 0.2), seed = 1234L)
val Array(nbTrainData, nbTestData) = nbDF.randomSplit(Array(0.8, 0.2), seed = 1234L)
10)
trainData.count
testData.count
11)
trainData.cache
testData.cache
nbTrainData.cache
nbTestData.cache
12)
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
13)创建贝叶斯模型,设置初始参数
val nb = new NaiveBayes().setLabelCol("label").setFeaturesCol("features")
14)通过朴素贝叶斯训练模型,对数据进行预测
// 训练
val nbModel = nb.fit(nbTrainData)
// 预测
val nbPrediction = nbModel.transform(nbTestData)
nbPrediction.show(10)
15)准确性统计
// Accuracy of the NaiveBayes model.
// IMPROVEMENT: collect "prediction" and "label" from the SAME DataFrame in one
// pass. The original collected predictions from nbPrediction and labels from
// nbTestData separately and matched them by array index — Spark does not
// guarantee that two independent collects return rows in the same order.
val nbPredAndLabel = nbPrediction.select("prediction", "label").collect
// t3: total number of test rows; t4: number of correct predictions
val t3 = nbPredAndLabel.length
val t4 = nbPredAndLabel.count(r => r.getDouble(0) == r.getDouble(1))
// 查看预测正确的个数
t4
// 计算准确率 (correct / total)
val nbAccuracy = 1.0 * t4 / t3
8.5 组装
1)导入特征索引类
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorIndexerModel}
2)建立特征索引
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df)
3)建立逻辑回归模型
val lr = new LogisticRegression().setLabelCol("label").setFeaturesCol("indexedFeatures").setMaxIter(10).setRegParam(0.001)
4)创建决策树模型
// setMaxBins离散化“连续特征”的最大划分数
val dt = new DecisionTreeClassifier().setLabelCol("label").setFeaturesCol("indexedFeatures").setImpurity("entropy").setMaxBins(100).setMaxDepth(5).setMinInfoGain(0.01)
5)导入网格参数和交叉验证
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
6)导入流水线
import org.apache.spark.ml.{Pipeline, PipelineModel}
7)导入评估器
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
8)配置两条流水线:一条是逻辑回归的流水线,包含两个stages(featureIndexer 和lr);一条是决策树的流水线,包含两个stages(featureIndexer 和dt)
val lrPipeline = new Pipeline().setStages(Array(featureIndexer, lr))
val dtPipeline = new Pipeline().setStages(Array(featureIndexer, dt))
8.6 模型优化
1)分别配置网格参数,使用ParamGridBuilder构造一个parameter grid,注意无空格复制
val lrParamGrid = new ParamGridBuilder().addGrid(lr.regParam, Array(0.1, 0.3, 0.5)).addGrid(lr.maxIter, Array(10, 20, 30)).build()
val dtParamGrid = new ParamGridBuilder().addGrid(dt.maxDepth, Array(3, 5, 7)).build()
2)分别实例化交叉验证模型
val evaluator = new BinaryClassificationEvaluator
val lrCV = new CrossValidator().setEstimator(lrPipeline).setEvaluator(evaluator).setEstimatorParamMaps(lrParamGrid).setNumFolds(2)
val dtCV = new CrossValidator().setEstimator(dtPipeline).setEvaluator(evaluator).setEstimatorParamMaps(dtParamGrid).setNumFolds(2)
3)通过交叉验证模型,获取最优参数集,并测试模型
val lrCvModel = lrCV.fit(trainData)
val dtCvModel = dtCV.fit(trainData)
val lrPrediction = lrCvModel.transform(testData)
val dtPrediction = dtCvModel.transform(testData)
4)查看数据
lrPrediction.select("label", "prediction").show(10)
dtPrediction.select("label", "prediction").show(10)
5)查看逻辑回归匹配模型的参数
val lrBestModel = lrCvModel.bestModel.asInstanceOf[PipelineModel]
val lrModel = lrBestModel.stages(1).asInstanceOf[LogisticRegressionModel]
lrModel.getRegParam
lrModel.getMaxIter
6)查看决策树匹配模型的参数
val dtBestModel = dtCvModel.bestModel.asInstanceOf[PipelineModel]
val dtModel = dtBestModel.stages(1).asInstanceOf[DecisionTreeClassificationModel]
dtModel.getMaxDepth
dtModel.numFeatures
7)统计逻辑回归的预测正确性
// Accuracy of the logistic-regression and decision-tree models.
// BUG FIX: the original built t_dt from dtPrediction.select("label") instead of
// select("prediction"), so the decision-tree "accuracy" compared the test labels
// against themselves and always came out as 1.0.
// Also collect prediction+label from the SAME DataFrame so row ordering cannot
// diverge between two independent collects.
val lrPredAndLabel = lrPrediction.select("prediction", "label").collect
val dtPredAndLabel = dtPrediction.select("prediction", "label").collect
// t_count: number of test rows
val t_count = testData.count.toInt
// c_lr / c_dt: number of correct predictions for each model
val c_lr = lrPredAndLabel.count(r => r.getDouble(0) == r.getDouble(1))
val c_dt = dtPredAndLabel.count(r => r.getDouble(0) == r.getDouble(1))
c_lr
// 统计逻辑回归正确性 (LR accuracy)
1.0 * c_lr / t_count
c_dt
// 统计决策树正确性 (DT accuracy)
1.0 * c_dt / t_count
spark-shell --num-executors 1 --total-executor-cores 3 --executor-memory 512m
val Path = "hdfs://uti04.utidata.com:8020/u01/bigdata/"
val raw_data = sc.textFile(Path + "data/train_noheader.tsv")
// raw_data.take(2)
val records = raw_data.map(line => line.split('\t'))
records.first
records.count
records.first.size
records.first.take(2)
8.4 数据预处理
import org.apache.spark.ml.feature.{LabeledPoint, IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.linalg.Vectors
// 这里注意:如果要整体复制,所有行之前都要消除空格或tab(Scala 注释须用 //,不能用 #)
// Build LabeledPoint records: the last column is the label, columns 4..n-2
// are numeric features; "?" markers are mapped to 0.0.
val data = records.map { row =>
  val cleaned = row.map(_.replaceAll("\"", ""))
  val label = cleaned.last.toInt
  val features = cleaned.slice(4, cleaned.length - 1).map(v => if (v == "?") 0.0 else v.toDouble)
  LabeledPoint(label, Vectors.dense(features))
}
// 这里注意:如果要整体复制,所有行之前都要消除空格或tab(Scala 注释须用 //,不能用 #)
// Same parsing as `data`, but negative feature values are clamped to 0.0
// because NaiveBayes requires non-negative features.
val nbData = records.map { row =>
  val cleaned = row.map(_.replaceAll("\"", ""))
  val label = cleaned.last.toInt
  val features = cleaned.slice(4, cleaned.length - 1).map { v =>
    val d = if (v == "?") 0.0 else v.toDouble
    if (d < 0) 0.0 else d
  }
  LabeledPoint(label, Vectors.dense(features))
}
data.take(2)
6)通过rdd创建DataFrame
val df = spark.createDataFrame(data)
val nbDF = spark.createDataFrame(nbData)
7)
scala> df.show(10)
+-----+--------------------+
|label| features|
+-----+--------------------+
| 0.0|[0.789131,2.05555...|
| 1.0|[0.574147,3.67796...|
| 1.0|[0.996526,2.38288...|
| 1.0|[0.801248,1.54310...|
| 0.0|[0.719157,2.67647...|
| 0.0|[0.0,119.0,0.7454...|
| 1.0|[0.22111,0.773809...|
| 0.0|[0.0,1.883333333,...|
| 1.0|[0.0,0.471502591,...|
| 1.0|[0.0,2.41011236,0...|
+-----+--------------------+
// 查看nbDF的第一行数据
nbDF.head
nbDF.first
8)
scala> df.printSchema
root
|-- label: double (nullable = false)
|-- features: vector (nullable = true)
9)划分数据
val Array(trainData, testData) = df.randomSplit(Array(0.8, 0.2), seed = 1234L)
val Array(nbTrainData, nbTestData) = nbDF.randomSplit(Array(0.8, 0.2), seed = 1234L)
10)
trainData.count
testData.count
11)
trainData.cache
testData.cache
nbTrainData.cache
nbTestData.cache
12)
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
13)创建贝叶斯模型,设置初始参数
val nb = new NaiveBayes().setLabelCol("label").setFeaturesCol("features")
14)通过朴素贝叶斯训练模型,对数据进行预测
// 训练
val nbModel = nb.fit(nbTrainData)
// 预测
val nbPrediction = nbModel.transform(nbTestData)
nbPrediction.show(10)
15)准确性统计
// Accuracy of the NaiveBayes model.
// IMPROVEMENT: collect "prediction" and "label" from the SAME DataFrame in one
// pass. The original collected predictions from nbPrediction and labels from
// nbTestData separately and matched them by array index — Spark does not
// guarantee that two independent collects return rows in the same order.
val nbPredAndLabel = nbPrediction.select("prediction", "label").collect
// t3: total number of test rows; t4: number of correct predictions
val t3 = nbPredAndLabel.length
val t4 = nbPredAndLabel.count(r => r.getDouble(0) == r.getDouble(1))
// 查看预测正确的个数
t4
// 计算准确率 (correct / total)
val nbAccuracy = 1.0 * t4 / t3
8.5 组装
1)导入特征索引类
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorIndexerModel}
2)建立特征索引
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df)
3)建立逻辑回归模型
val lr = new LogisticRegression().setLabelCol("label").setFeaturesCol("indexedFeatures").setMaxIter(10).setRegParam(0.001)
4)创建决策树模型
// setMaxBins离散化“连续特征”的最大划分数
val dt = new DecisionTreeClassifier().setLabelCol("label").setFeaturesCol("indexedFeatures").setImpurity("entropy").setMaxBins(100).setMaxDepth(5).setMinInfoGain(0.01)
5)导入网格参数和交叉验证
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
6)导入流水线
import org.apache.spark.ml.{Pipeline, PipelineModel}
7)导入评估器
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
8)配置两条流水线:一条是逻辑回归的流水线,包含两个stages(featureIndexer 和lr);一条是决策树的流水线,包含两个stages(featureIndexer 和dt)
val lrPipeline = new Pipeline().setStages(Array(featureIndexer, lr))
val dtPipeline = new Pipeline().setStages(Array(featureIndexer, dt))
8.6 模型优化
1)分别配置网格参数,使用ParamGridBuilder构造一个parameter grid,注意无空格复制
val lrParamGrid = new ParamGridBuilder().addGrid(lr.regParam, Array(0.1, 0.3, 0.5)).addGrid(lr.maxIter, Array(10, 20, 30)).build()
val dtParamGrid = new ParamGridBuilder().addGrid(dt.maxDepth, Array(3, 5, 7)).build()
2)分别实例化交叉验证模型
val evaluator = new BinaryClassificationEvaluator
val lrCV = new CrossValidator().setEstimator(lrPipeline).setEvaluator(evaluator).setEstimatorParamMaps(lrParamGrid).setNumFolds(2)
val dtCV = new CrossValidator().setEstimator(dtPipeline).setEvaluator(evaluator).setEstimatorParamMaps(dtParamGrid).setNumFolds(2)
3)通过交叉验证模型,获取最优参数集,并测试模型
val lrCvModel = lrCV.fit(trainData)
val dtCvModel = dtCV.fit(trainData)
val lrPrediction = lrCvModel.transform(testData)
val dtPrediction = dtCvModel.transform(testData)
4)查看数据
lrPrediction.select("label", "prediction").show(10)
dtPrediction.select("label", "prediction").show(10)
5)查看逻辑回归匹配模型的参数
val lrBestModel = lrCvModel.bestModel.asInstanceOf[PipelineModel]
val lrModel = lrBestModel.stages(1).asInstanceOf[LogisticRegressionModel]
lrModel.getRegParam
lrModel.getMaxIter
6)查看决策树匹配模型的参数
val dtBestModel = dtCvModel.bestModel.asInstanceOf[PipelineModel]
val dtModel = dtBestModel.stages(1).asInstanceOf[DecisionTreeClassificationModel]
dtModel.getMaxDepth
dtModel.numFeatures
7)统计逻辑回归的预测正确性
// Accuracy of the logistic-regression and decision-tree models.
// BUG FIX: the original built t_dt from dtPrediction.select("label") instead of
// select("prediction"), so the decision-tree "accuracy" compared the test labels
// against themselves and always came out as 1.0.
// Also collect prediction+label from the SAME DataFrame so row ordering cannot
// diverge between two independent collects.
val lrPredAndLabel = lrPrediction.select("prediction", "label").collect
val dtPredAndLabel = dtPrediction.select("prediction", "label").collect
// t_count: number of test rows
val t_count = testData.count.toInt
// c_lr / c_dt: number of correct predictions for each model
val c_lr = lrPredAndLabel.count(r => r.getDouble(0) == r.getDouble(1))
val c_dt = dtPredAndLabel.count(r => r.getDouble(0) == r.getDouble(1))
c_lr
// 统计逻辑回归正确性 (LR accuracy)
1.0 * c_lr / t_count
c_dt
// 统计决策树正确性 (DT accuracy)
1.0 * c_dt / t_count