:源码下载地址
https://www.packtpub.com/big-data-and-business-intelligence/mastering-machine-learning-scikit-learn
:启动ipython notebook
cd E:\DM\bookcode\mastering-machine-learning-scikit-learn
ipython notebook
:python版预测pizza直径与价格的关系
# Fit a simple linear regression of pizza price (dollars) on diameter (inches)
# and print the predicted price of a 12" pizza.
from sklearn.linear_model import LinearRegression

# Training data: one row per pizza; X holds the diameters, y the prices.
X = [[6], [8], [10], [14], [18]]
y = [[7], [9], [13], [17.5], [18]]

# Create and fit the model
model = LinearRegression()
model.fit(X, y)

# predict() expects a 2-D array of shape (n_samples, n_features), so the single
# query is wrapped as [[12]]. y was supplied as a column, so the prediction is
# 2-D as well; [0][0] extracts the scalar for the %.2f format.
# The parenthesised print works as both a Python 2 statement and a Python 3 call.
print('A 12" pizza should cost: $%.2f' % model.predict([[12]])[0][0])
:spark版预测pizza直径与价格的关系
package com.bbw5.ml.spark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.{ LinearRegressionModel => LRModel }
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.PolynomialExpansion
/**
 * Pizza price plotted against diameter, fitted with Spark linear regression.
 *
 * X = diameter in inches (the later experiments add a second feature column),
 * Y = price in dollars.
 *
 * `main` runs three experiments through the DataFrame-based `ml` API: simple,
 * multiple and polynomial linear regression. `train4ML2`, `train4ML3` and
 * `train4MLLib` are further experiments that `main` does not call.
 */
object SparkLinearRegresionSample1 {

  /**
   * Entry point: builds the training/testing DataFrames and fits the three models.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkLinearRegresionSample1")
    val sc = new SparkContext(sparkConf)
    try {
      val sqlContext = new SQLContext(sc)
      import sqlContext.implicits._

      // Turns (features, label) rows into a DataFrame whose feature column is
      // named `featuresCol` and whose label column is always "label".
      def toFrame(
        rows: Seq[(org.apache.spark.mllib.linalg.Vector, Double)],
        featuresCol: String): DataFrame =
        sc.makeRDD(rows).toDF(featuresCol, "label")

      // Prices are shared by every experiment below.
      val prices = Seq(7.0, 9.0, 13.0, 17.5, 18.0)
      val testPrices = Seq(11.0, 8.5, 15.0, 18.0, 11.0)

      // simple linear regression: one feature (diameter)
      val diameters = Seq(6.0, 8.0, 10.0, 14.0, 18.0)
      val testDiameters = Seq(8.0, 9.0, 11.0, 16.0, 12.0)
      val training = toFrame(
        diameters.zip(prices).map { case (d, p) => (Vectors.dense(d), p) }, "features")
      val testing = toFrame(
        testDiameters.zip(testPrices).map { case (d, p) => (Vectors.dense(d), p) }, "features")
      train4ML(training, testing)

      // Two features per pizza: diameter plus a second value.
      // NOTE(review): the second value is presumably the number of toppings,
      // as in the book this sample follows - confirm.
      val features2 = Seq((6.0, 2.0), (8.0, 1.0), (10.0, 0.0), (14.0, 2.0), (18.0, 0.0))
      val testFeatures2 = Seq((8.0, 2.0), (9.0, 0.0), (11.0, 2.0), (16.0, 2.0), (12.0, 0.0))
      val rows2 = features2.zip(prices).map { case ((a, b), p) => (Vectors.dense(a, b), p) }
      val testRows2 =
        testFeatures2.zip(testPrices).map { case ((a, b), p) => (Vectors.dense(a, b), p) }

      // multiple linear regression
      train4ML(toFrame(rows2, "features"), toFrame(testRows2, "features"))

      // polynomial linear regression: the raw features go into "lastFeatures"
      // and PolynomialExpansion writes the expanded vector to "features",
      // which is the column train4ML's estimator reads.
      val degree = 5
      val polynomialExpansion = new PolynomialExpansion()
        .setInputCol("lastFeatures")
        .setOutputCol("features")
        .setDegree(degree)
      val polyTraining = polynomialExpansion.transform(toFrame(rows2, "lastFeatures"))
      val polyTesting = polynomialExpansion.transform(toFrame(testRows2, "lastFeatures"))
      train4ML(polyTraining, polyTesting)
    } finally {
      // Release the context even if one of the experiments throws.
      sc.stop()
    }
  }

  /**
   * Fits a linear regression with the `ml` API (maxIter = 10, regParam = 0.3),
   * prints the training summary, then scores `testing` and prints test metrics.
   *
   * @param training DataFrame with "features" and "label" columns used for fitting
   * @param testing  DataFrame with "features" and "label" columns used for evaluation
   */
  def train4ML(training: DataFrame, testing: DataFrame): Unit = {
    val lr = new LinearRegression().setMaxIter(10).setRegParam(0.3)
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept, then summarize the model over the
    // training set and print out some metrics.
    val trainingSummary = lrModel.summary
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"train rmse: ${trainingSummary.rootMeanSquaredError}")
    println(s"train r2: ${trainingSummary.r2}")
    //println(s"train pValues: ${trainingSummary.pValues}")
    //println(s"train tValues: ${trainingSummary.tValues}")

    // predict
    val predictDF = lrModel.transform(testing)
    // The explicit type argument matters: a bare getAs("prediction") lets the
    // compiler infer Nothing, which fails with a ClassCastException at runtime.
    // The first test row built by main is always the 8" pizza.
    val firstPrediction = predictDF.first().getAs[Double]("prediction")
    println("A 8\" pizza should cost: " + firstPrediction)

    // Same four metrics, in the same order, each from a fresh evaluator.
    for (metric <- Seq("rmse", "mse", "r2", "mae")) {
      val value = new RegressionEvaluator().setMetricName(metric).evaluate(predictDF)
      println(s"test ${metric}:" + value)
    }
  }

  /**
   * Trains with the newer ML API over a small grid of (maxIter, regParam)
   * settings and prints each setting with its training RMSE and r2, ordered
   * from highest RMSE to lowest.
   *
   * @param training DataFrame with "features" and "label" columns
   */
  def train4ML2(training: DataFrame): Unit = {
    val evaluations =
      for {
        iter <- Array(1, 5, 10)
        lambda <- Array(0.0001, 0.01, 1.0)
      } yield {
        val lrModel = new LinearRegression().setMaxIter(iter).setRegParam(lambda).fit(training)
        val trainingSummary = lrModel.summary
        ((iter, lambda), trainingSummary.rootMeanSquaredError, trainingSummary.r2)
      }
    evaluations.sortBy(_._2).reverse.foreach(println)
  }

  /**
   * Placeholder for a further experiment; intentionally does nothing.
   *
   * @param training unused
   * @param testing  unused
   */
  def train4ML3(training: DataFrame, testing: DataFrame): Unit = {}

  /**
   * Mean squared error of `model` over the given labeled points.
   *
   * @param testing labeled points to score (callers may pass any data set,
   *                including the training data)
   * @param model   fitted MLlib linear regression model
   * @return mean of (label - prediction)^2 over `testing`
   */
  def calculateMSE(testing: RDD[LabeledPoint], model: LRModel): Double = {
    val squaredErrors = testing.map { point =>
      val prediction = model.predict(point.features)
      math.pow(point.label - prediction, 2)
    }
    squaredErrors.mean()
  }

  /**
   * Trains and predicts with the MLlib (RDD-based) API.
   *
   * The price of a 12" pizza, roughly $13.68, is the yardstick used to judge
   * how good a parameter setting is.
   *
   * @param sc active SparkContext used to build the training RDD
   */
  def train4MLLib(sc: SparkContext): Unit = {
    val diameters = Array(6.0, 8.0, 10.0, 14.0, 18.0)
    val prices = Array(7.0, 9.0, 13.0, 17.5, 18.0)
    val points = prices.zip(diameters).map { case (p, d) => LabeledPoint(p, Vectors.dense(d)) }
    val training = sc.makeRDD(points.toSeq)

    // Train with the default parameters (stepSize = 1).
    val defaultModel = LinearRegressionWithSGD.train(training, 100)
    println(s"weight:${defaultModel.weights},intercept:${defaultModel.intercept}")
    // Very poor result observed: A 12" pizza should cost: -8.001325844886018E135
    println("A 12\" pizza should cost: " + defaultModel.predict(Vectors.dense(12)))

    // Grid search for the setting with the smallest MSE on the training data.
    val evaluations =
      for {
        numIterations <- Array(1, 5, 10, 100)
        stepSize <- Array(0.0001, 0.01, 1.0)
        miniBatchFraction <- Array(0.1, 0.5, 1.0)
      } yield {
        val candidate =
          LinearRegressionWithSGD.train(training, numIterations, stepSize, miniBatchFraction)
        ((numIterations, stepSize, miniBatchFraction), calculateMSE(training, candidate))
      }

    // Ascending by MSE; sorted once and reused for both printing and selection.
    val ranked = evaluations.sortBy(_._2)
    ranked.foreach(println)

    // Retrain with the best setting and predict. The grid above is never empty,
    // so head is safe here.
    val (bestIterations, bestStepSize, bestFraction) = ranked.head._1
    val bestModel =
      LinearRegressionWithSGD.train(training, bestIterations, bestStepSize, bestFraction)
    println(s"weight:${bestModel.weights},intercept:${bestModel.intercept}")
    // Observed: A 12" pizza should cost: 13.544867556960533
    println("A 12\" pizza should cost: " + bestModel.predict(Vectors.dense(12)))
  }
}