一、环境配置
1.spark2.1.0-cdh5.7.0(自编译)
2.cdh5.7.0
3.scala2.11.8
4.centos6.4
二、环境准备
参考https://blog.csdn.net/u010886217/article/details/90312617
三、实现代码
1.测试集iris.data描述
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
...
2.PCA代码
package sparktest
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.{DecisionTreeClassifier, NaiveBayes}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{PCA, VectorAssembler}
import org.apache.spark.sql.SparkSession
import scala.util.Random
object pca {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("iris")
val spark = SparkSession.builder().config(conf).getOrCreate()
spark.sparkContext.setLogLevel("WARN") ///日志级别
val file = spark.read.format("csv").load("iris.data")
//file.show()
import spark.implicits._
val random = new Random()
val data = file.map(row =>{
val label = row.getString(4) match {
case "Iris-setosa" => 0
case "Iris-versicolor" => 1
case "Iris-virginica" => 2
}
(row.getString(0).toDouble,
row.getString(1).toDouble,
row.getString(2).toDouble,
row.getString(3).toDouble,
label,
random.nextDouble())
}).toDF("_c0","_c1","_c2","_c3","label","rand").sort("rand")//.where("label = 1 or label = 0")
val assembler = new VectorAssembler().setInputCols(Array("_c0","_c1","_c2","_c3")).setOutputCol("features")
val pca = new PCA()
.setInputCol("features")
.setOutputCol("features2")
.setK(3)
val dataset = assembler.transform(data)
val pcaModel = pca.fit(dataset)
val dataset2 = pcaModel.transform(dataset)
val Array(train,test) = dataset2.randomSplit(Array(0.8,0.2))
val dt = new DecisionTreeClassifier().setFeaturesCol("features2").setLabelCol("label")
val model = dt.fit(train)
val result = model.transform(test)
result.show(false)
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val accuracy = evaluator.evaluate(result)
println(s"""accuracy is $accuracy""")
}
}