package com.qh
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
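// Used below for a quick AUROC sanity check on the (score, label) pairs
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics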
/**
* Created by hadoop on 8/29/16.
* Spark 2.0.0
* Scala 2.11.8
* Classification algorithm: Linear Support Vector Machines (SVM)
*/
object MLlib_SVM {
private val path = "hdfs://master:9000/Spark/MLlib/SVM"
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("MLlib SVM")
.setMaster("spark://master:7077")
val sc = new SparkContext(conf)
/*
The data is the sample data that ships with Spark:
scala> LIBSVMData.collect()
res0: Array[org.apache.spark.mllib.regression.LabeledPoint] = Array((0.0,(692,[127,128,129,
130,131,154,155,156,157,158,159,181,182,183,184,185,186,187,188,189,207,208,209,210,211,212,
213,214,215,216,217,235,236,237,238,239,240,241,242,243,244,245,262,263,264,265,266,267,268,
269,270,271,272,273,289,290,291,292,293,294,295,296,297,300,301,302,316,317,318,319,320,321,
328,329,330,343,344,345,346,347,348,349,356,357,358,371,372,373,374,384,385,386,399,400,401,
412,413,414,426,427,428,429,440,441,442,454,455,456,457,466,467,468,469,470,482,483,484,493,
494,495,496,497,510,511,512,520,521,522,523,538,539,540,547,548,549,550,566,567,568,569,570,
571,572,573,574,575,576,577,578,594,595,596,597,598,599,600,601,602,603,604,622,623,624,625,
626,627,628,629,630,651,652,653,654,655,656,657],[51.0,159.0,2...
scala> SVMData.collect()
res1: Array[org.apache.spark.mllib.regression.LabeledPoint] = Array((1.0,[0.0,2.52078447201548,
0.0,0.0,0.0,2.004684436494304,2.000347299268466,0.0,2.228387042742021,2.228387042742023,0.0,0.0,
0.0,0.0,0.0,0.0]), (0.0,[2.857738033247042,0.0,0.0,2.619965104088255,0.0,2.004684436494304,
2.000347299268466,0.0,2.228387042742021,2.228387042742023,0.0,0.0,0.0,0.0,0.0,0.0]), (0.0,
[2.857738033247042,0.0,2.061393766919624,0.0,0.0,2.004684436494304,0.0,0.0,2.228387042742021,
2.228387042742023,0.0,0.0,0.0,0.0,0.0,0.0]), (1.0,[0.0,0.0,2.061393766919624,2.619965104088255,0.0,
2.004684436494304,2.000347299268466,0.0,0.0,0.0,0.0,2.055002875864414,0.0,0.0,0.0,0.0]), (1.0,
[2.857738033247042,0.0,2.061393766919624,2.619965104088255,0.0,2.004684436494304,0.0,0.0,0.0,0.0,0.0,
2.055002875864414,0.0,0.0,0.0,0.0]), (...
*/
val LIBSVMData = MLUtils.loadLibSVMFile(sc, path + "/LIBSVMData.txt")
val data = sc.textFile(path + "/SVMData.txt")
val SVMData = data.map { line =>
val parts = line.split("\\s+")
LabeledPoint(parts(0).toDouble, Vectors.dense(parts.tail.map(_.toDouble)))
}
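/*
Assumed input formats:
- LIBSVMData.txt is in LIBSVM format ("<label> <index1>:<value1> <index2>:<value2> ..."),
which loadLibSVMFile parses into LabeledPoints with sparse feature vectors.
- SVMData.txt is assumed to hold whitespace-separated dense rows
("<label> <f1> <f2> ..."), matching the parser above and the sample output shown earlier.
*/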
/*
Build the model on 60% of the data; test on the remaining 40%.
def randomSplit(weights: Array[Double], seed: Long = Utils.random.nextLong): Array[RDD[T]]
Splits an RDD into multiple RDDs according to the given weights.
One of Spark's key features is the ability to persist (or cache) a dataset
in memory across operations; cache and persist implement exactly that:
1) RDD's cache() simply calls persist() with the MEMORY_ONLY storage level.
2) persist() accepts an explicit StorageLevel when the job needs a different
storage level (see the commented sketch after the splits below).
3) Neither cache nor persist is an action.
*/
val parsedDataLib = LIBSVMData.randomSplit(Array(0.6, 0.4))
val TrainDataLib = parsedDataLib(0).cache()
val TestDataLib = parsedDataLib(1)
val parsedData = SVMData.randomSplit(Array(0.6, 0.4))
val TrainData = parsedData(0).cache()
val TestData = parsedData(1)
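/*
A commented sketch of setting a storage level explicitly (MEMORY_AND_DISK is
only an illustrative choice; note the cached RDDs above cannot have their
storage level changed once set):
import org.apache.spark.storage.StorageLevel
TestData.persist(StorageLevel.MEMORY_AND_DISK)
*/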
/*
Type of a labeled point:
LabeledPoint(label: Double, features: Vector)
def train(input: RDD[LabeledPoint], numIterations: Int, stepSize: Double, regParam: Double,
miniBatchFraction: Double, initialWeights: Vector): SVMModel
input: training samples; the class label must be 0.0 or 1.0, and features are doubles
numIterations: number of gradient-descent iterations (default 100)
stepSize: step size for each gradient-descent iteration (default 1.0)
regParam: regularization parameter (default 0.01)
miniBatchFraction: fraction of the data used in each iteration (default 1.0)
initialWeights: initial set of weights to use; the array size must equal the
number of features in the data (see the optimizer sketch below for setting
these parameters explicitly)
*/
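/*
A sketch of tuning beyond the static train() helper: SVMWithSGD exposes its
optimizer, so the parameters above (and the updater, e.g. L1 regularization)
can be set directly. The values here are illustrative, not tuned:
import org.apache.spark.mllib.optimization.L1Updater
val svmAlg = new SVMWithSGD()
svmAlg.optimizer.setNumIterations(200).setRegParam(0.1).setUpdater(new L1Updater)
val modelL1 = svmAlg.run(TrainDataLib)
*/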
var model = SVMWithSGD.train(TrainDataLib, 100)
model.clearThreshold()
var scoreAndLabels = TestDataLib.map(x => (model.predict(x.features), x.label))
scoreAndLabels.saveAsTextFile(path + "/LIBSVMData")
/*
Save the model so later runs can load it directly for prediction instead of retraining.
*/
model.save(sc, path + "/LIBSVMDataModel")
// Load the saved model
var sameModel = SVMModel.load(sc, path + "/LIBSVMDataModel")
// Verify the loaded model: these predictions should match the LIBSVMData output above
scoreAndLabels = TestDataLib.map(x => (sameModel.predict(x.features), x.label))
scoreAndLabels.saveAsTextFile(path + "/LIBSVMDataCom")
model = SVMWithSGD.train(TrainData, 100)
model.clearThreshold()
scoreAndLabels = TestData.map(x => (model.predict(x.features), x.label))
scoreAndLabels.saveAsTextFile(path + "/SVMData")
model.save(sc, path + "/SVMDataModel")
sameModel = SVMModel.load(sc, path + "/SVMDataModel")
// Verify the reloaded model against TestData (not TestDataLib), matching the SVMData run above
scoreAndLabels = TestData.map(x => (sameModel.predict(x.features), x.label))
scoreAndLabels.saveAsTextFile(path + "/SVMDataCom")
sc.stop()
}
}