离线轻量级大数据平台Spark之MLlib机器学习库SVM实例

支持向量机,因其英文名为support vector machine,故一般简称SVM,通俗来讲,它是一种二类分类模型,其基本模型定义为特征空间上的间隔最大的线性分类器,其学习策略便是间隔最大化,最终可转化为一个凸二次规划问题的求解。
http://www.dataguru.cn/thread-371987-1-1.html
参考该网站理解SVM基础数学原理。

SVM依据样本点到分类超平面的距离(间隔)进行分类,这一思想适用于任意维度的特征空间。

具体代码如下:

package sk.mlib;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

import scala.Tuple2;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.SVMModel;
import org.apache.spark.mllib.classification.SVMWithSGD;
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;

public class SVMWithSGDDemo {
	public static void main(String[] args) {
	    SparkConf conf = new SparkConf().setAppName("JavaSVMWithSGDExample");
	    SparkContext sc = new SparkContext(conf);
	    // $example on$
	    String path = "/tmp/svmdata.txt";
	    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

	    // Split initial RDD into two... [60% training data, 40% testing data].
	    JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L);
	    training.cache();
	    JavaRDD<LabeledPoint> test = data.subtract(training);

	    // Run training algorithm to build the model.
	    int numIterations = 100;
	    final SVMModel model = SVMWithSGD.train(training.rdd(), numIterations);

	    // Clear the default threshold.
	    model.clearThreshold();

	    // Compute raw scores on the test set.
	    JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
	      new Function<LabeledPoint, Tuple2<Object, Object>>() {
	        public Tuple2<Object, Object> call(LabeledPoint p) {
	          Double score = model.predict(p.features());
	          return new Tuple2<Object, Object>(score, p.label());
	        }
	      }
	    );

	    // Get evaluation metrics.
	    BinaryClassificationMetrics metrics =new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels));
	    double auROC = metrics.areaUnderROC();

	    System.out.println("Area under ROC = " + auROC);

	    // Save and load model
	    model.save(sc, "/tmp/javaSVMWithSGDModel");
	    SVMModel sameModel = SVMModel.load(sc, "/tmp/javaSVMWithSGDModel");
	    
	    //应用模型分类
	    System.out.println("Prediction of (-0.857554,0.555556,1,1,0.555556,0.333333,1,0.777778,0.333333,-1 ):"+sameModel.predict(Vectors.dense(-0.857554,0.555556,1,1,0.555556,0.333333,1,0.777778,0.333333,-1 )));
	    sc.stop();
	  }
}
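/*
 执行结果:
 Area under ROC = 0.9017094017094017
 Prediction of (-0.857554,0.555556,1,1,0.555556,0.333333,1,0.777778,0.333333,-1 ):3.238535993736797
 说明:由于代码中调用了model.clearThreshold(),predict返回的是原始得分(决策函数值),而不是0/1类别标签。
 该特征向量即下方数据集中标签为1的一条样本,得分为正,与其标签一致。
 */ 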
输入的数据集(LIBSVM格式),每行为:标签 特征索引1:特征值1 特征索引2:特征值2 ... 特征索引n:特征值n

0 1:-0.860107 2:-0.111111 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 
0 1:-0.859671 2:-0.111111 3:-0.333333 4:-0.333333 5:-0.111111 6:0.333333 7:1 8:-0.555556 9:-0.777778 10:-1 
0 1:-0.857807 2:-0.555556 3:-1 4:-1 5:-1 6:-0.777778 7:-0.777778 8:-0.555556 9:-1 10:-1 
0 1:-0.85768 2:0.111111 3:0.555556 4:0.555556 5:-1 6:-0.555556 7:-0.333333 8:-0.555556 9:0.333333 10:-1 
0 1:-0.857569 2:-0.333333 3:-1 4:-1 5:-0.555556 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 
1 1:-0.857554 2:0.555556 3:1 4:1 5:0.555556 6:0.333333 7:1 8:0.777778 9:0.333333 10:-1 
0 1:-0.857408 2:-1 3:-1 4:-1 5:-1 6:-0.777778 7:1 8:-0.555556 9:-1 10:-1 
0 1:-0.857339 2:-0.777778 3:-1 4:-0.777778 5:-1 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 
1 1:-0.855171 2:-0.777778 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-1 9:-1 10:-0.111111 
0 1:-0.855171 2:-0.333333 3:-0.777778 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
0 1:-0.854841 2:-1 3:-1 4:-1 5:-1 6:-1 7:-1 8:-0.555556 9:-1 10:-1 
0 1:-0.854709 2:-0.777778 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
1 1:-0.853868 2:-0.111111 3:-0.555556 4:-0.555556 5:-0.555556 6:-0.777778 7:-0.555556 8:-0.333333 9:-0.333333 10:-1 
0 1:-0.85354 2:-1 3:-1 4:-1 5:-1 6:-0.777778 7:-0.555556 8:-0.555556 9:-1 10:-1 
1 1:-0.853454 2:0.555556 3:0.333333 4:-0.111111 5:1 6:0.333333 7:0.777778 8:-0.111111 9:-0.111111 10:-0.333333 
1 1:-0.852997 2:0.333333 3:-0.333333 4:0.111111 5:-0.333333 6:0.111111 7:-1 8:-0.333333 9:-0.555556 10:-1 
0 1:-0.852842 2:-0.333333 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
1 1:-0.852671 2:-0.333333 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 
0 1:-0.852543 2:1 3:0.333333 4:0.333333 5:0.111111 6:-0.333333 7:1 8:-0.333333 9:-1 10:-0.777778 
0 1:-0.852536 2:0.111111 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 
1 1:-0.851958 2:0.333333 3:-0.555556 4:-0.777778 5:1 6:-0.111111 7:1 8:-0.111111 9:-0.333333 10:-0.333333 
1 1:-0.851957 2:1 3:-0.111111 4:-0.111111 5:-0.555556 6:0.111111 7:0.333333 8:0.333333 9:1 10:-1 
0 1:-0.85163 2:-0.555556 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
0 1:-0.851217 2:-1 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 
1 1:-0.850295 2:-0.111111 3:-0.777778 4:-0.555556 5:-0.333333 6:-0.777778 7:0.333333 8:-0.555556 9:0.111111 10:-1 
0 1:-0.850198 2:-0.555556 3:-0.777778 4:-1 5:-1 6:-1 7:-1 8:-0.777778 9:-1 10:-1 
0 1:-0.850107 2:-0.111111 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
0 1:-0.850038 2:-0.777778 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
0 1:-0.849517 2:-1 3:-1 4:-0.555556 5:-1 6:-0.777778 7:-1 8:-1 9:-1 10:-1 
0 1:-0.849517 2:-0.555556 3:-1 4:-1 5:-1 6:-1 7:-1 8:-0.777778 9:-1 10:-1 
0 1:-0.849393 2:-0.777778 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 
1 1:-0.849331 2:1 3:0.333333 4:0.333333 5:-0.555556 6:0.555556 7:-0.111111 8:0.333333 9:-0.333333 10:-0.555556 
0 1:-0.848968 2:-0.777778 3:-1 4:-1 5:-0.777778 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 
0 1:-0.848891 2:-0.555556 3:-1 4:-0.777778 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
0 1:-0.848267 2:-0.777778 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
1 1:-0.848135 2:1 3:1 4:1 5:0.555556 6:0.111111 7:-1 8:0.555556 9:0.777778 10:-1 
0 1:-0.847895 2:0.111111 3:-0.777778 4:-1 5:-1 6:-1 7:-1 8:0.333333 9:-1 10:-1 
1 1:-0.847478 2:-0.111111 3:-0.333333 4:-0.333333 5:0.777778 6:-0.777778 7:1 8:-0.111111 9:0.111111 10:-1 
1 1:-0.846481 2:-0.777778 3:-0.111111 4:-0.555556 5:-0.555556 6:0.111111 7:0.333333 8:0.333333 9:-0.111111 10:-1 
1 1:-0.845249 2:1 3:-0.333333 4:-0.555556 5:-1 6:-0.555556 7:-0.555556 8:0.111111 9:-0.111111 10:-0.777778 
1 1:-0.845097 2:0.111111 3:1 4:1 5:-0.777778 6:0.555556 7:1 8:0.333333 9:-0.555556 10:-0.555556 
1 1:-0.844791 2:-0.111111 3:0.111111 4:-0.111111 5:0.111111 6:1 7:-1 8:-0.555556 9:-1 10:-1 
1 1:-0.844637 2:1 3:1 4:1 5:-0.333333 6:0.555556 7:-1 8:0.555556 9:1 10:-1 
0 1:-0.84462 2:-1 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-0.777778 
1 1:-0.84439 2:-0.555556 3:0.333333 4:0.333333 5:-0.333333 6:-0.333333 7:0.777778 8:-0.333333 9:0.555556 10:-1 
0 1:-0.844351 2:-1 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
0 1:-0.844265 2:-0.333333 3:-1 4:-1 5:-0.555556 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 
1 1:-0.844156 2:0.333333 3:0.555556 4:0.333333 5:-0.777778 6:-0.333333 7:0.555556 8:-0.555556 9:0.555556 10:-0.777778 
1 1:-0.843926 2:0.777778 3:-0.111111 4:0.555556 5:-1 6:-0.777778 7:-0.555556 8:-0.777778 9:-1 10:-0.111111 
1 1:-0.843914 2:-0.111111 3:-0.555556 4:-0.555556 5:-0.333333 6:-0.777778 7:-0.333333 8:-0.555556 9:-0.333333 10:-1 
1 1:-0.843667 2:1 3:-0.555556 4:0.111111 5:-0.777778 6:-0.555556 7:-0.111111 8:-0.333333 9:1 10:-0.777778 
1 1:-0.843607 2:-0.111111 3:-0.111111 4:-0.111111 5:0.555556 6:1 7:0.555556 8:0.333333 9:-0.555556 10:0.333333 
1 1:-0.843604 2:1 3:-0.111111 4:-0.111111 5:0.111111 6:0.555556 7:0.555556 8:0.333333 9:-1 10:-1 
1 1:-0.843496 2:1 3:0.111111 4:0.111111 5:-0.555556 6:-0.333333 7:-0.111111 8:-0.555556 9:0.111111 10:-1 
1 1:-0.843352 2:0.555556 3:1 4:1 5:-1 6:-0.555556 7:0.111111 8:-0.555556 9:0.777778 10:-1 
1 1:-0.843228 2:0.555556 3:-0.777778 4:-0.333333 5:-1 6:-0.111111 7:-1 8:-0.111111 9:-0.333333 10:-0.333333 
1 1:-0.843162 2:-0.111111 3:-0.777778 4:-0.555556 5:-1 6:0.111111 7:1 8:-0.111111 9:-1 10:-1 
1 1:-0.843099 2:0.777778 3:-0.111111 4:-0.111111 5:-0.777778 6:-0.777778 7:-0.777778 8:-0.111111 9:-1 10:-1 
1 1:-0.842893 2:-0.111111 3:-0.555556 4:-0.111111 5:-0.111111 6:-0.555556 7:-0.555556 8:-0.333333 9:1 10:-1 
0 1:-0.842892 2:-1 3:-1 4:-1 5:-1 6:-0.777778 7:-0.777778 8:-0.777778 9:-1 10:-1 
1 1:-0.842769 2:0.777778 3:1 4:1 5:-1 6:1 7:0.555556 8:-0.555556 9:-0.555556 10:-1 
1 1:-0.842766 2:0.111111 3:-0.555556 4:-0.333333 5:-1 6:-0.111111 7:-0.777778 8:-0.555556 9:0.777778 10:-1 
0 1:-0.842757 2:-1 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.777778 9:-1 10:-1 
1 1:-0.842637 2:1 3:-0.333333 4:-0.777778 5:-1 6:-0.555556 7:-0.777778 8:-0.333333 9:-0.555556 10:1 
0 1:-0.842614 2:-0.333333 3:-1 4:-1 5:-1 6:-0.777778 7:-1 8:-0.555556 9:-1 10:-1 

执行结果和样本集有关系。

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值