Spark MLlib之线性回归

13 篇文章 0 订阅
8 篇文章 0 订阅

Spark MLlib 线性回归实例:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.evaluation.RegressionMetrics;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.regression.LinearRegressionModel;
import org.apache.spark.mllib.regression.LinearRegressionWithSGD;

import scala.Tuple2;

public class SparkMLlibLinearRegression {

	public static void main(String[] args) {
		
		String path = "file:///data/hadoop/spark-2.0.0-bin-hadoop2.7/data/mllib/ridge-data/lpsa.data";
		SparkConf conf = new SparkConf();
	    JavaSparkContext sc = new JavaSparkContext(args[0], "Spark", conf);   
		 
	    JavaRDD<String> data = sc.textFile(path);
		JavaRDD<LabeledPoint> parsedData = data.map(new Function<String, LabeledPoint>() {
			@Override
			public LabeledPoint call(String line) throws Exception {
				String[] parts = line.split(",");
				String[] features = parts[1].split(" ");
				double[] v = new double[features.length];
				for (int i = 0; i < v.length; i++) {
					v[i] =  Double.parseDouble(features[i]);
				}
				return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
			}
		});
		parsedData.cache();
	
		// Building the model
		int numIterations = 100;
		double stepSize = 0.00000001;
		final LinearRegressionModel model =
		  LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize);

		// Evaluate model on training examples and compute training error
		JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(new Function<LabeledPoint, Tuple2<Double, Double>>(){
		@Override
		public Tuple2<Double, Double> call(LabeledPoint point)
					throws Exception {
			double prediction = model.predict(point.features());
			return new Tuple2<Double, Double>(prediction, point.label());
		}
			
		});
		 
		double MSE = new JavaDoubleRDD(valuesAndPreds.map(
		  new Function<Tuple2<Double, Double>, Object>() {
		    public Object call(Tuple2<Double, Double> pair) {
		      return Math.pow(pair._1() - pair._2(), 2.0);
		    }
		  }
		).rdd()).mean();
		
		System.out.println("training Mean Squared Error = " + MSE);
      
		//模型评测
		JavaRDD<Tuple2<Object, Object>>  valuesAndPreds2= parsedData.map(new Function<LabeledPoint, Tuple2<Object, Object>>(){
			@Override
			public Tuple2<Object, Object> call(LabeledPoint point)
						throws Exception {
				double prediction = model.predict(point.features());
				return new Tuple2<Object, Object>(prediction, point.label());
			}
				
			});
		 RegressionMetrics metrics = new RegressionMetrics(JavaRDD.toRDD(valuesAndPreds2));
	   System.out.println("R2(决定系数)= "+metrics.r2()); 
         System.out.println("MSE(均方差、方差) = "+metrics.meanSquaredError());
         System.out.println("RMSE(均方根、标准差) "+metrics.rootMeanSquaredError());
         System.out.println("MAE(平均绝对差值)= "+metrics.meanAbsoluteError());
         
         
		 
		// Save and load model
		model.save(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
		LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(),
		  "target/tmp/javaLinearRegressionWithSGDModel");
		
         double result = sameModel.predict(Vectors.dense(-0.132431544081234,2.68769877553723,1.09253092365124,1.53428167116843,-0.522940888712441,-0.442797990776478,0.342627053981254,-0.687186906466865));
		 System.out.println(sameModel.weights());
         System.out.println("save predict result="+ result);
         
         result = model.predict(Vectors.dense(-0.132431544081234,2.68769877553723,1.09253092365124,1.53428167116843,-0.522940888712441,-0.442797990776478,0.342627053981254,-0.687186906466865));
		 System.out.println(model.weights());
         System.out.println("predict result="+ result);
	}

}


运行:spark-submit --class com.test.hadoop.SparkMLlibLinearRegression --master yarn --executor-memory 1024M --total-executor-cores 1 ./MRTest-1.0-jar-with-dependencies.jar yarn

其中最后一个参数 yarn 即程序中的 args[0],会作为 master 传给 JavaSparkContext。


训练出来的 model 保存在了 HDFS 上的 target/tmp/javaLinearRegressionWithSGDModel 目录中,下次使用时直接 load 即可用来做预测。如上面的代码所示,sameModel 和 model 对同一个样本的预测结果相同。

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值