【Mastering Machine Learning with scikit-learn (python+spark版)】Chapter2 Linear Regression

:源码下载地址

https://www.packtpub.com/big-data-and-business-intelligence/mastering-machine-learning-scikit-learn

:启动ipython notebook

cd E:\DM\bookcode\mastering-machine-learning-scikit-learn

ipython notebook

:python版预测pizza直径与价格的关系

from sklearn.linear_model import LinearRegression

# Training data: each row of X is one sample with a single feature (the pizza
# diameter) and each row of y is the matching price.
X = [[6], [8], [10], [14],   [18]]
y = [[7], [9], [13], [17.5], [18]]

# Create and fit the model
model = LinearRegression()
model.fit(X, y)

# predict() expects a 2-D input of shape (n_samples, n_features), so the single
# query is wrapped as [[12]]. Because y was given as a column (2-D), the
# prediction is 2-D as well; [0][0] extracts the scalar before formatting.
predicted_price = model.predict([[12]])[0][0]

# print(...) with one argument behaves the same on Python 2 and Python 3.
print('A 12" pizza should cost: $%.2f' % predicted_price)

:spark版预测pizza直径与价格的关系

package com.bbw5.ml.spark


import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.{ LinearRegressionModel => LRModel }
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.PolynomialExpansion


/**
 * Pizza price plotted against diameter
 * X=Diameter in inches
 * Y=Price in dollars
 */
object SparkLinearRegresionSample1 {

  /**
   * Entry point: fits and evaluates three linear-regression variants
   * (one feature, two features, polynomial expansion of two features)
   * on small in-memory data sets using the DataFrame-based ML API.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkLinearRegresionSample1")
    val sc = new SparkContext(sparkConf)

    // Always release the context, even when one of the training runs throws.
    try {
      val sqlContext = new SQLContext(sc)
      import sqlContext.implicits._

      // Turns (vector, label) rows into a DataFrame whose vector column is
      // named `featuresCol` and whose label column is named "label".
      def toFrame(
        rows: Seq[(org.apache.spark.mllib.linalg.Vector, Double)],
        featuresCol: String): DataFrame =
        sc.makeRDD(rows).toDF(featuresCol, "label")

      // Pairs every single-valued sample with its label.
      def oneFeatureRows(xs: Array[Int], ys: Array[Double]) =
        xs.zip(ys).map { case (x, label) => (Vectors.dense(x.toDouble), label) }.toSeq

      // Pairs every two-valued sample with its label.
      def twoFeatureRows(xs: Array[(Int, Int)], ys: Array[Double]) =
        xs.zip(ys).map { case ((a, b), label) => (Vectors.dense(a.toDouble, b.toDouble), label) }.toSeq

      // X = diameter in inches, y = price in dollars.
      val X = Array(6, 8, 10, 14, 18)
      val y = Array(7, 9, 13, 17.5, 18)
      val X_test = Array(8, 9, 11, 16, 12)
      val y_test = Array(11, 8.5, 15, 18, 11)

      // simple linear regression
      train4ML(
        toFrame(oneFeatureRows(X, y), "features"),
        toFrame(oneFeatureRows(X_test, y_test), "features"))

      // Two explanatory variables per sample; the labels are the same prices as above.
      val X2 = Array((6, 2), (8, 1), (10, 0), (14, 2), (18, 0))
      val y2 = Array(7, 9, 13, 17.5, 18)
      val X2_test = Array((8, 2), (9, 0), (11, 2), (16, 2), (12, 0))
      val y2_test = Array(11, 8.5, 15, 18, 11)

      // multiple linear regression
      train4ML(
        toFrame(twoFeatureRows(X2, y2), "features"),
        toFrame(twoFeatureRows(X2_test, y2_test), "features"))

      // polynomial linear regression: the raw vectors go into "lastFeatures"
      // and the expansion writes the column named "features" that train4ML fits on.
      val degree = 5
      val polynomialExpansion = new PolynomialExpansion()
        .setInputCol("lastFeatures")
        .setOutputCol("features")
        .setDegree(degree)
      val polyTraining = polynomialExpansion.transform(toFrame(twoFeatureRows(X2, y2), "lastFeatures"))
      val polyTesting = polynomialExpansion.transform(toFrame(twoFeatureRows(X2_test, y2_test), "lastFeatures"))
      train4ML(polyTraining, polyTesting)
    } finally {
      sc.stop()
    }
  }

  /**
   * Fits a linear regression with the ML (DataFrame) API, prints the model and
   * its training summary, then scores `testing` and prints test metrics.
   *
   * @param training DataFrame with a "features" vector column and a "label" column
   * @param testing  DataFrame with the same columns, used for prediction and evaluation
   */
  def train4ML(training: DataFrame, testing: DataFrame): Unit = {
    val lr = new LinearRegression().setMaxIter(10).setRegParam(0.3)
    val lrModel = lr.fit(training)

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary

    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"train rmse: ${trainingSummary.rootMeanSquaredError}")
    println(s"train r2: ${trainingSummary.r2}")
    //println(s"train pValues: ${trainingSummary.pValues}")
    //println(s"train tValues: ${trainingSummary.tValues}")

    // predict
    val predictDF = lrModel.transform(testing) //.select("features", "prediction")

    // The type argument is required: without it getAs is inferred as
    // getAs[Nothing], which is not safe to evaluate at runtime.
    // The message is hard-coded for the first test row.
    println("A 8\" pizza should cost: " + predictDF.first().getAs[Double]("prediction"))

    // One evaluator per metric, printed in the same order and format as before.
    Seq("rmse", "mse", "r2", "mae").foreach { metric =>
      println("test " + metric + ":" + new RegressionEvaluator().setMetricName(metric).evaluate(predictDF))
    }
  }

  /**
   * Trains with the ML API over a small grid of (maxIter, regParam) values and
   * prints ((maxIter, regParam), training RMSE, training r2) for every
   * combination, ordered by RMSE from highest to lowest.
   *
   * @param training DataFrame with a "features" vector column and a "label" column
   */
  def train4ML2(training: DataFrame): Unit = {
    val evaluations =
      for {
        iter <- Array(1, 5, 10)
        lambda <- Array(0.0001, 0.01, 1.0)
      } yield {
        val lrModel = new LinearRegression().setMaxIter(iter).setRegParam(lambda).fit(training)
        val trainingSummary = lrModel.summary
        ((iter, lambda), trainingSummary.rootMeanSquaredError, trainingSummary.r2)
      }
    evaluations.sortBy(_._2).reverse.foreach(println)
  }

  /**
   * Placeholder with no implementation yet; kept so existing callers still compile.
   */
  def train4ML3(training: DataFrame, testing: DataFrame): Unit = {
  }

  /**
   * Mean squared error of `model` over the given labeled points.
   *
   * @param testing labeled points to score
   * @param model   MLlib linear regression model used for prediction
   * @return the mean of (label - prediction)^2 over `testing`
   */
  def calculateMSE(testing: RDD[LabeledPoint], model: LRModel): Double = {
    // Pair each true label with the model's prediction for that point.
    val valuesAndPreds = testing.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
  }

  /**
   * Trains and predicts with the MLlib (RDD) API.
   * A 12" pizza costing approximately $13.68 is the yardstick used to judge
   * how good a set of training parameters is.
   *
   * @param sc Spark context used to build the training RDD
   */
  def train4MLLib(sc: SparkContext): Unit = {
    val diameters = Array(6, 8, 10, 14, 18.toDouble)
    val prices = Array(7, 9, 13, 17.5, 18)
    val training = sc.makeRDD(0 until diameters.size).map(i => LabeledPoint(prices(i), Vectors.dense(diameters(i))))

    // Train with the default parameters (stepSize = 1)
    val defaultModel = LinearRegressionWithSGD.train(training, 100)
    println(s"weight:${defaultModel.weights},intercept:${defaultModel.intercept}")
    // Original author's observation - very poor result: A 12" pizza should cost: -8.001325844886018E135
    println("A 12\" pizza should cost: " + defaultModel.predict(Vectors.dense(12.0)))

    // Score every parameter combination by its MSE on the training data
    val evaluations =
      for {
        numIterations <- Array(1, 5, 10, 100)
        stepSize <- Array(0.0001, 0.01, 1.0)
        miniBatchFraction <- Array(0.1, 0.5, 1.0)
      } yield {
        val candidate = LinearRegressionWithSGD.train(training, numIterations, stepSize, miniBatchFraction)
        ((numIterations, stepSize, miniBatchFraction), calculateMSE(training, candidate))
      }

    // Ascending by MSE; sorted once and reused for both printing and selection.
    val ranked = evaluations.sortBy(_._2)
    ranked.foreach(println)

    // Retrain with the best-scoring parameters and predict
    val (bestIterations, bestStepSize, bestFraction) = ranked.head._1
    val bestModel = LinearRegressionWithSGD.train(training, bestIterations, bestStepSize, bestFraction)
    println(s"weight:${bestModel.weights},intercept:${bestModel.intercept}")
    // Original author's observation: A 12" pizza should cost: 13.544867556960533
    println("A 12\" pizza should cost: " + bestModel.predict(Vectors.dense(12.0)))
  }

}



Mastering Machine Learning with scikit-learn - Second Edition by Gavin Hackeling English | 24 July 2017 | ASIN: B06ZYRPFMZ | ISBN: 1783988363 | 254 Pages | AZW3 | 5.17 MB Key Features Master popular machine learning models including k-nearest neighbors, random forests, logistic regression, k-means, naive Bayes, and artificial neural networks Learn how to build and evaluate performance of efficient models using scikit-learn Practical guide to master your basics and learn from real life applications of machine learning Book Description Machine learning is the buzzword bringing computer science and statistics together to build smart and efficient models. Using powerful algorithms and techniques offered by machine learning you can automate any analytical model. This book examines a variety of machine learning models including popular machine learning algorithms such as k-nearest neighbors, logistic regression, naive Bayes, k-means, decision trees, and artificial neural networks. It discusses data preprocessing, hyperparameter optimization, and ensemble methods. You will build systems that classify documents, recognize images, detect ads, and more. You will learn to use scikit-learn's API to extract features from categorical variables, text and images; evaluate model performance, and develop an intuition for how to improve your model's performance. By the end of this book, you will master all required concepts of scikit-learn to build efficient models at work to carry out advanced tasks with the practical approach. 
What you will learn Review fundamental concepts such as bias and variance Extract features from categorical variables, text, and images Predict the values of continuous variables using linear regression and K Nearest Neighbors Classify documents and images using logistic regression and support vector machines Create ensembles of estimators using bagging and boosting techniques Discover hidden structures in data using K-Means clustering Evaluate the performance of machine learning systems in common tasks About the Author Gavin Hackeling is a data scientist and author. He has worked on a variety of machine learning problems, including automatic speech recognition, document classification, object recognition, and semantic segmentation. An alumnus of the University of North Carolina and New York University, he lives in Brooklyn with his wife and cat. Table of Contents The Fundamentals of Machine Learning Simple linear regression Classification and Regression with K Nearest Neighbors Feature Extraction and Preprocessing From Simple Regression to Multiple Regression From Linear Regression to Logistic Regression Naive Bayes Nonlinear Classification and Regression with Decision Trees From Decision Trees to Random Forests, and other Ensemble Methods The Perceptron From the Perceptron to Support Vector Machines From the Perceptron to Artificial Neural Networks Clustering with K-Means Dimensionality Reduction with Principal Component Analysis
Big data – that was our motivation to explore the world of machine learning with Spark a couple of years ago. We wanted to build machine learning applications that would leverage models trained on large amounts of data, but the beginning was not easy. Spark was still evolving, it did not contain a powerful machine learning library, and we were still trying to figure out what it means to build a machine learning application. But, step by step, we started to explore different corners of the Spark ecosystem and followed Spark’s evolution. For us, the crucial part was a powerful machine learning library, which would provide features such as R or Python libraries did. This was an easy task for us, since we are actively involved in the development of H2O’s machine learning library and its branch called Sparkling Water, which enables the use of the H2O library from Spark applications. However, model training is just the tip of the machine learning iceberg. We still had to explore how to connect Sparkling Water to Spark RDDs, DataFrames, and DataSets, how to connect Spark to different data sources and read data, or how to export models and reuse them in different applications. During our journey, Spark evolved as well. Originally, being a pure Scala project, it started to expose Python and, later, R interfaces. It also took its Spark API on a long journey from low-level RDDs to a high-level DataSet, exposing a SQL-like interface. Furthermore, Spark also introduced the concept of machine learning pipelines, adopted from the scikit-learn library known from Python. All these improvements made Spark a great tool for data transformation and data processing. Based on this experience, we decided to share our knowledge with the rest of the world via this book. Its intention is simple: to demonstrate different aspects of building Spark machine learning applications on examples, and show how to use not only the latest Spark features, but also low-level Spark interfaces. On our journey
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值