Data Mining with Spark MLlib

The official Spark site has many machine-learning examples with detailed explanations. Link: MLlib: Main Guide - Spark 3.2.1 Documentation (apache.org)

K-means (command line)

cd spark-3.1.1-bin-hadoop2.7/bin/
spark-shell

Then enter the following line by line (spark-shell already provides a ready-made SparkSession bound to the variable spark):

import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator

// Load the sample data (libsvm format: label index1:value1 index2:value2 ...).
val dataset = spark.read.format("libsvm").load("file:///home/ZQ/spark-3.1.1-bin-hadoop2.7/data/mllib/sample_kmeans_data.txt")

// Train a k-means model with k = 2 clusters and a fixed seed for reproducibility.
val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(dataset)

// Assign each point to its nearest cluster center.
val predictions = model.transform(dataset)

// Evaluate with the silhouette score (ranges from -1 to 1; closer to 1 means
// better-separated clusters).
val evaluator = new ClusteringEvaluator()
val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")

println("Cluster Centers: ")
model.clusterCenters.foreach(println)
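
To see which cluster each point landed in, you can inspect the predictions DataFrame directly. A small follow-up sketch ("features" and "prediction" are MLlib's default column names; the save path below is just an example):

predictions.select("features", "prediction").show(false)

// Optionally persist the fitted model for later reuse (the path is illustrative).
model.write.overwrite().save("file:///tmp/kmeans-model")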

Multinomial logistic regression (command line)

Enter the following code line by line. The example fits a multinomial logistic regression with elastic-net regularization: setRegParam sets the overall penalty strength, and setElasticNetParam(0.8) sets the L1/L2 mix (0 = pure L2, 1 = pure L1):

import org.apache.spark.ml.classification.LogisticRegression
val training = spark.read.format("libsvm").load("file:///home/ZQ/spark-3.1.1-bin-hadoop2.7/data/mllib/sample_multiclass_classification_data.txt")
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
val lrModel = lr.fit(training)
println(s"Coefficients: \n${lrModel.coefficientMatrix}")
println(s"Intercepts: \n${lrModel.interceptVector}")
val trainingSummary = lrModel.summary
val objectiveHistory = trainingSummary.objectiveHistory
println("objectiveHistory:")
objectiveHistory.foreach(println)
println("False positive rate by label:")
trainingSummary.falsePositiveRateByLabel.zipWithIndex.foreach { case (rate, label) =>
  println(s"label $label: $rate")
}
println("True positive rate by label:")
trainingSummary.truePositiveRateByLabel.zipWithIndex.foreach { case (rate, label) =>
  println(s"label $label: $rate")
}
println("Precision by label:")
trainingSummary.precisionByLabel.zipWithIndex.foreach { case (prec, label) =>
  println(s"label $label: $prec")
}
println("Recall by label:")
trainingSummary.recallByLabel.zipWithIndex.foreach { case (rec, label) =>
  println(s"label $label: $rec")
}
println("F-measure by label:")
trainingSummary.fMeasureByLabel.zipWithIndex.foreach { case (f, label) =>
  println(s"label $label: $f")
}
val accuracy = trainingSummary.accuracy
val falsePositiveRate = trainingSummary.weightedFalsePositiveRate
val truePositiveRate = trainingSummary.weightedTruePositiveRate
val fMeasure = trainingSummary.weightedFMeasure
val precision = trainingSummary.weightedPrecision
val recall = trainingSummary.weightedRecall
println(s"Accuracy: $accuracy\nFPR: $falsePositiveRate\nTPR: $truePositiveRate\n" + s"F-measure: $fMeasure\nPrecision: $precision\nRecall: $recall")

Random forest (built and packaged with sbt)

1. Create a randomforest folder

cd spark-3.1.1-bin-hadoop2.7/Test/sparkmllib/
mkdir randomforest

2. Under the randomforest folder, recursively create the directory src/main/scala:

cd randomforest/
mkdir -p src/main/scala

3. Create simple.sbt under randomforest as follows

vim simple.sbt

Add the following content to simple.sbt (the scalaVersion must match the one your Spark build uses; Spark 3.1.1 ships with Scala 2.12):

name := "Simple Project"
version := "1.6.1"
scalaVersion := "2.12.10"
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.1.1"
libraryDependencies += "org.apache.spark" %% "spark-mllib" % "3.1.1"

4. Create a randomforest.scala file in the scala directory

cd src/main/scala/
vim randomforest.scala

Add the following content to the file:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.sql.SparkSession

object RandomForestClassifierExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("RandomForestClassifierExample")
      .getOrCreate()

    // Load and parse the data file, converting it to a DataFrame.
    val data = spark.read.format("libsvm").load("file:///home/ZQ/spark-3.1.1-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")

    // Index labels, adding metadata to the label column.
    // Fit on the whole dataset to include all labels in the index.
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)

    // Automatically identify categorical features, and index them.
    // Set maxCategories so features with > 4 distinct values are treated as continuous.
    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(data)

    // Split the data into training and test sets (30% held out for testing).
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a RandomForest model.
    val rf = new RandomForestClassifier()
      .setLabelCol("indexedLabel")
      .setFeaturesCol("indexedFeatures")
      .setNumTrees(10)

    // Convert indexed labels back to original labels.
    val labelConverter = new IndexToString()
      .setInputCol("prediction")
      .setOutputCol("predictedLabel")
      .setLabels(labelIndexer.labelsArray(0))

    // Chain indexers and forest in a Pipeline.
    val pipeline = new Pipeline()
      .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))

    // Train model. This also runs the indexers.
    val model = pipeline.fit(trainingData)

    // Make predictions.
    val predictions = model.transform(testData)

    // Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println(s"Test Error = ${(1.0 - accuracy)}")

    // The forest is stage 2 of the fitted pipeline.
    val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel]
    println(s"Learned classification forest model:\n ${rfModel.toDebugString}")

    spark.stop()
  }
}
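
If you also want to see how much each feature contributed to the forest's splits, the fitted model exposes featureImportances. A small optional addition (not part of the original example) that could go just before spark.stop():

// Importance of each feature, aggregated over all trees and normalized to sum to 1.
println(s"Feature importances:\n ${rfModel.featureImportances}")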

5. Package (in the randomforest directory):

cd ../../..
/home/ZQ/sbt/sbt package

6. Run (in the randomforest directory):

/home/ZQ/spark-3.1.1-bin-hadoop2.7/bin/spark-submit --class "RandomForestClassifierExample" ./target/scala-2.12/simple-project_2.12-1.6.1.jar
