package mlib.demo.linearegression
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LinearRegressionWithSGD, LabeledPoint}
import org.apache.spark.{SparkContext, SparkConf}
/**
* Created by master on 3/1/18.
*/
/**
 * Demo entry point: trains a linear-regression model with Stochastic
 * Gradient Descent (SGD) on a local Spark context and reports the
 * training Mean Squared Error plus the learned weight vector.
 *
 * Expected input line format: "<label>,<f1> <f2> ... <fn>"
 * (label, a comma, then space-separated feature values).
 */
object LineaRegression {
  def main(args: Array[String]): Unit = {
    // Suppress Spark's INFO/WARN chatter so only errors reach the terminal.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

    // Run environment: local in-process master, single executor.
    val conf = new SparkConf().setAppName("LineaRegression").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Load and parse the data. Sample rows:
      //   -0.4307829,-1.63735562648104 -2.00621178480549 -1.86242597251066 ...
      //   -0.1625189,-1.98898046126935 -0.722008756122123 -0.787896192088153 ...
      val data = sc.textFile("/opt/data/data4linearegression")
      val parsedData = data.map { line =>
        val parts = line.split(",") // `val`: never reassigned (was `var`)
        // Field 0 is the label; field 1 holds space-separated features
        // that become a dense vector.
        LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
      }.cache() // reused for both training and evaluation — avoid re-reading the file

      // Construct the model. For parameter estimation, maximum likelihood is
      // common; here Stochastic Gradient Descent (SGD, also known as the
      // Incremental Gradient Method, IGM) is used. SGD mitigates two
      // weaknesses of plain gradient descent: slow convergence and getting
      // trapped in local optima.
      val numIterations = 100
      val model = LinearRegressionWithSGD.train(parsedData, numIterations)

      // Evaluate: predict on the training set and pair each label with its prediction.
      val valuesAndPreds = parsedData.map { point =>
        val prediction = model.predict(point.features)
        (point.label, prediction)
      }
      // mean() replaces the hand-rolled reduce(_ + _) / count().
      val mse = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.mean()
      println(s"training Mean Squared Error = $mse") // fixed typo "trainning"
      println(s"${model.weights}-------------------")
    } finally {
      // Always release the SparkContext, even if training/evaluation throws.
      sc.stop()
    }
  }
}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LinearRegressionWithSGD, LabeledPoint}
import org.apache.spark.{SparkContext, SparkConf}
/**
* Created by master on 3/1/18.
*/
// NOTE(review): This object is a byte-for-byte duplicate of the
// LineaRegression object defined earlier in this same file (only the
// package clause is missing above it). Two top-level objects with the
// same name in one compilation unit will not compile — one of the two
// copies should be deleted.
/**
 * Trains a linear-regression model with SGD on a local Spark context,
 * then prints the training Mean Squared Error and the learned weights.
 */
object LineaRegression {
// Entry point. Input lines are "<label>,<space-separated features>".
def main(args: Array[String]) {
// shield unnecessary Spark INFO output on the terminal
Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
// set run environment: local, in-process master
val conf = new SparkConf().setAppName("LineaRegression").setMaster("local")
val sc = new SparkContext(conf)
// load data and parse; sample rows:
/*
-0.4307829,-1.63735562648104 -2.00621178480549 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.86317118542594 5 -1.04215728919298 -0.864466507337306
-0.1625189,-1.98898046126935 -0.722008756122123 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425 945 -1.04215728919298 -0.864466507337306
*/
val data = sc.textFile("/opt/data/data4linearegression")
val parsedData = data.map { line =>
var parts = line.split(",")
// field 0 is the label; field 1's space-separated values become a dense vector
LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
}
// construct model
val numIeterations = 100
// For parameter estimation, maximum likelihood is common; here Stochastic
// Gradient Descent (SGD, also known as the Incremental Gradient Method,
// IGM) is used.
// SGD addresses two problems of plain gradient descent: slow convergence
// and getting trapped in local optima.
val model = LinearRegressionWithSGD.train(parsedData, numIeterations)
// judge model and compute errors
// NOTE(review): parsedData is traversed again here after training; a
// .cache() on parsedData would avoid re-reading the input file.
val valuesAndPreds = parsedData.map {
point =>
val prediction = model.predict(point.features)
(point.label, prediction)
}
// mean squared error over the training set: sum of squared residuals / n
val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.reduce(_ + _) / valuesAndPreds.count()
println("trainning Mean Squared Error = " + MSE)
println(model.weights + "-------------------")
// release the SparkContext (not exception-safe: skipped if anything above throws)
sc.stop()
}
}