Support Vector Machine (SVM):
package com.spark.milib
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
object SVMTest {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("svm").setMaster("local[4]")
val sc = new SparkContext(conf)
// Read the sample data, stored in LIBSVM format
val data: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
// Split the data into training and test sets
val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0).cache()
val test = splits(1)
// Create and train the SVM model
val numIterations = 100 // number of iterations
val model = SVMWithSGD.train(training, numIterations)
// Score the test samples
val predictionAndLabel = test.map { point =>
  val score = model.predict(point.features)
  (score, point.label)
}
val print_predict = predictionAndLabel.take(20)
// Print the first 20 predictions alongside the true labels
println("prediction" + "\t" + "label")
for ((score, label) <- print_predict) {
  println(score + "\t" + label)
}
// Compute the classification accuracy on the test set
val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
println("Accuracy = " + accuracy)
// Save the model
val modelPath = "/home/hadoop/test/svm/svm_model"
model.save(sc, modelPath)
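// A minimal sketch of reloading the saved model (assumes the save above succeeded);
// this uses the SVMModel import at the top of the file.
val sameModel = SVMModel.load(sc, modelPath)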
}
}
Decision Tree:
package com.spark.milib
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.sql.SparkSession
/**
 * Decision tree
 */
object DecisionTreeClassificationExample {
def main(args: Array[String]): Unit = {
val sparkSession: SparkSession = SparkSession.builder().appName("test").master("local[4]").getOrCreate()
// Load the data stored in LIBSVM format as a DataFrame.
val data = sparkSession.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
.fit(data)
// Automatically identify categorical features, and index them.
val featureIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexedFeatures")
.setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
.fit(data)
// Split the data into training and test sets (30% held out for testing).
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// Train a DecisionTree model.
val dt = new DecisionTreeClassifier()
.setLabelCol("indexedLabel")
.setFeaturesCol("indexedFeatures")
// Convert indexed labels back to original labels.
val labelConverter = new IndexToString()
.setInputCol("prediction")
.setOutputCol("predictedLabel")
.setLabels(labelIndexer.labels)
// Chain indexers and tree in a Pipeline.
val pipeline = new Pipeline()
.setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))
// Train model. This also runs the indexers.
val model = pipeline.fit(trainingData)
// Make predictions.
val predictions = model.transform(testData)
// Select example rows to display.
predictions.select("predictedLabel", "label", "features").show(5)
// Select (prediction, true label) and compute test error.
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("indexedLabel")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
println("Test Error = " + (1.0 - accuracy))
val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel]
println("Learned classification tree model:\n" + treeModel.toDebugString)
}
}
Random Forest:
package com.spark.milib
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
object RandomForestClassificationExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("RandomForestClassificationExample").setMaster("local[4]")
val sc = new SparkContext(conf)
// Load and parse the data file.
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
// Split the data into training and test sets (30% held out for testing)
val splits = data.randomSplit(Array(0.7, 0.3))
val (trainingData, testData) = (splits(0), splits(1))
// Train a RandomForest model.
// Empty categoricalFeaturesInfo indicates all features are continuous.
val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val numTrees = 3 // Use more in practice.
val featureSubsetStrategy = "auto" // Let the algorithm choose.
val impurity = "gini"
val maxDepth = 4
val maxBins = 32
val model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)
// Evaluate model on test instances and compute test error
val labelAndPreds = testData.map { point =>
val prediction = model.predict(point.features)
(point.label, prediction)
}
val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
println("Test Error = " + testErr)
println("Learned classification forest model:\n" + model.toDebugString)
// Save and load model
model.save(sc, "target/tmp/myRandomForestClassificationModel")
val sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
}
}
Naive Bayes:
package com.spark.milib
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.SparkSession
/**
 * Naive Bayes
 */
object NaiveBayesExample {
def main(args: Array[String]): Unit = {
val sparkSession: SparkSession = SparkSession.builder().master("local[4]").appName("test").getOrCreate()
// Load the data stored in LIBSVM format as a DataFrame.
val data = sparkSession.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Split the data into training and test sets (30% held out for testing)
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)
// Train a NaiveBayes model.
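// NaiveBayes defaults to modelType = "multinomial" and smoothing = 1.0;
// both can be tuned via setModelType and setSmoothing.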
val model = new NaiveBayes()
.fit(trainingData)
// Select example rows to display.
val predictions = model.transform(testData)
predictions.show()
// Select (prediction, true label) and compute test error
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
println("Test set accuracy = " + accuracy)
}
}
Linear Regression:
package com.spark.milib
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.SparkSession
/**
 * Linear regression
 */
object LinearRegressionExample {
def main(args: Array[String]): Unit = {
val sparkSession: SparkSession = SparkSession.builder().appName("test").master("local[4]").getOrCreate()
// Load training data
val training = sparkSession.read.format("libsvm")
.load("data/mllib/sample_linear_regression_data.txt")
val lr = new LinearRegression()
.setMaxIter(10)
.setRegParam(0.3)
.setElasticNetParam(0.8)
// Fit the model
val lrModel = lr.fit(training)
// Print the coefficients and intercept for linear regression
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
// Summarize the model over the training set and print out some metrics
val trainingSummary = lrModel.summary
println(s"numIterations: ${trainingSummary.totalIterations}")
println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
trainingSummary.residuals.show()
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"r2: ${trainingSummary.r2}")
}
}
Logistic Regression:
package com.spark.milib
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.SparkSession
/**
 * Logistic regression
 */
object LogisticRegressionExample {
def main(args: Array[String]): Unit = {
val sparkSession: SparkSession = SparkSession.builder().master("local[4]").appName("test").getOrCreate()
// Load training data
val training = sparkSession.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(0.3)
.setElasticNetParam(0.8)
// Fit the model
val lrModel = lr.fit(training)
// Print the coefficients and intercept for logistic regression
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
// We can also use the multinomial family for binary classification
val mlr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(0.3)
.setElasticNetParam(0.8)
.setFamily("multinomial")
val mlrModel = mlr.fit(training)
// Print the coefficients and intercepts for logistic regression with multinomial family
println(s"Multinomial coefficients: ${mlrModel.coefficientMatrix}")
println(s"Multinomial intercepts: ${mlrModel.interceptVector}")
}
}
K-Means Clustering:
package com.spark.milib
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.sql.SparkSession
/**
 * K-means clustering
 */
object KMeansExample {
def main(args: Array[String]): Unit = {
val sparkSession: SparkSession = SparkSession.builder().master("local[4]").appName("test").getOrCreate()
// Loads data.
val dataset = sparkSession.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
// Trains a k-means model.
val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(dataset)
// Evaluate clustering by computing Within Set Sum of Squared Errors.
val WSSSE = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $WSSSE")
// Shows the result.
println("Cluster Centers: ")
model.clusterCenters.foreach(println)
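// The fitted model can also assign each point to a cluster:
// transform adds a "prediction" column holding the cluster index.
model.transform(dataset).select("features", "prediction").show(5)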
}
}
LDA (Latent Dirichlet Allocation):
package com.spark.milib
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.sql.SparkSession
/**
 * LDA (Latent Dirichlet Allocation)
 */
object LDAExample {
def main(args: Array[String]): Unit = {
val sparkSession: SparkSession = SparkSession.builder().appName("test").master("local[4]").getOrCreate()
// Loads data.
val dataset = sparkSession.read.format("libsvm")
.load("data/mllib/sample_lda_libsvm_data.txt")
// Trains an LDA model.
val lda = new LDA().setK(10).setMaxIter(10)
val model = lda.fit(dataset)
val ll = model.logLikelihood(dataset)
val lp = model.logPerplexity(dataset)
println(s"The lower bound on the log likelihood of the entire corpus: $ll")
println(s"The upper bound bound on perplexity: $lp")
// Describe topics.
val topics = model.describeTopics(3)
println("The topics described by their top-weighted terms:")
topics.show(false)
// Shows the result.
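// transform adds a "topicDistribution" column with each document's topic mixture.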
val transformed = model.transform(dataset)
transformed.show(false)
}
}
Note:
All of the data files used above are located in the data directory of the Spark source tree.