数据集: UCI Machine Learning Repository 的 Wine Quality（葡萄酒质量）数据集
https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
数据集内容（属性列表）/ The attributes are:
1 - fixed acidity
2 - volatile acidity
3 - citric acid
4 - residual sugar
5 - chlorides
6 - free sulfur dioxide
7 - total sulfur dioxide
8 - density
9 - pH
10 - sulphates
11 - alcohol
Output variable (based on sensory data):
12 - quality (score between 0 and 10)
:maven项目中添加spark csv(https://github.com/databricks/spark-csv)依赖
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-csv_2.10</artifactId>
<version>1.4.0</version>
</dependency>
:spark shell启动命令添加spark csv依赖
bin/spark-shell --packages com.databricks:spark-csv_2.10:1.4.0
: 代码
package com.bbw5.ml.spark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.regression.LinearRegressionModel
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SQLContext
/**
 * Trains and evaluates a linear-regression model that predicts wine quality
 * from 11 physico-chemical attributes of the UCI wine-quality dataset.
 *
 * Pipeline: load CSV -> basic stats -> feature-vector assembly -> standard
 * scaling -> 4-fold cross-validated grid search over regularization params ->
 * persist best model -> report training and test metrics.
 *
 * Usage: LinearRegression4Wine [inputCsvPath] [modelOutputDir]
 * (both arguments optional; defaults preserve the original hard-coded paths)
 *
 * author:baibaw5
 */
object LinearRegression4Wine {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("LinearRegression4Wine")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Paths are configurable via command-line args; defaults match the
    // original local Windows paths for backward compatibility.
    val filename = if (args.length > 0) args(0) else "I:/DM-dataset/wine/winequality-red.csv"
    val modelDir = if (args.length > 1) args(1) else "D:/Develop/Model/"

    // Load the semicolon-delimited CSV with a header row; inferSchema makes
    // the 11 feature columns numeric and "quality" an integer column.
    val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").option("delimiter", ";").load(filename)

    // Summary statistics for a few representative columns.
    df.describe("fixed acidity", "pH", "alcohol", "quality").show

    // Pearson correlation between pH and alcohol (printed; previously the
    // result was computed and silently discarded).
    println("corr(pH, alcohol): " + df.stat.corr("pH", "alcohol"))

    // inferSchema may type a feature column as Int or Double depending on the
    // data; normalize every feature cell to Double.
    def convert(value: Any): Double = value match {
      case d: Double => d
      case i: Int    => i.toDouble
    }

    // Row -> (feature vector of the first 11 columns, label = quality at index 11).
    val rdd = df.map { row => (row.toSeq.take(11).map { x => convert(x) }, row.getInt(11))
    }.map(a => Vectors.dense(a._1.toArray) -> a._2)
    rdd.toDF("features", "label").show

    // Scale features to zero mean / unit variance.
    // (LinearRegression.setStandardization would also do this internally.)
    val scaler = new StandardScaler(withMean = true, withStd = true).fit(rdd.map(dp => dp._1))
    val data = rdd.map(dp => scaler.transform(dp._1) -> dp._2.toDouble).toDF("features", "label")
    data.show

    // Cache before the iterative solver; split 80/20 train/test with a fixed
    // seed so runs are reproducible.
    data.cache()
    val Array(training, testing) = data.randomSplit(Array(0.8, 0.2), 1234L)

    // Grid search over regularization strength and elastic-net mixing
    // (0.1 ~ mostly L2 ... 1.0 = pure L1), 4-fold cross validation,
    // RMSE (RegressionEvaluator default) as the selection metric.
    val lr = new LinearRegression().setSolver("l-bfgs")
    val paramGrid = new ParamGridBuilder().
      addGrid(lr.regParam, Array(0.0001, 0.01, 1.0)).
      addGrid(lr.maxIter, Array(100)).
      addGrid(lr.elasticNetParam, Array(0.1, 0.5, 1.0)).build()
    val cv = new CrossValidator().
      setEstimator(lr).
      setEvaluator(new RegressionEvaluator).
      setEstimatorParamMaps(paramGrid).
      setNumFolds(4)

    // Fit on the training split; CrossValidator refits the best model on the
    // full training data.
    val model = cv.fit(training)

    // Persist the fitted model under a timestamped directory.
    model.save(modelDir + "WINE-LIR-" + System.currentTimeMillis())

    // Inspect the winning LinearRegressionModel and its training summary.
    val bestModel = model.bestModel.asInstanceOf[LinearRegressionModel]
    println("bestModel.params:" + bestModel.extractParamMap)
    val trainingSummary = bestModel.summary
    println(s"Coefficients: ${bestModel.coefficients} Intercept: ${bestModel.intercept}")
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"train rmse: ${trainingSummary.rootMeanSquaredError}")
    println(s"train r2: ${trainingSummary.r2}")

    // Score the held-out split and report standard regression metrics.
    val testDF = model.transform(testing)
    println("test rmse:" + new RegressionEvaluator().setMetricName("rmse").evaluate(testDF))
    println("test mse:" + new RegressionEvaluator().setMetricName("mse").evaluate(testDF))
    println("test r2:" + new RegressionEvaluator().setMetricName("r2").evaluate(testDF))
    println("test mae:" + new RegressionEvaluator().setMetricName("mae").evaluate(testDF))
  }
}
:参考资料
《Mastering Machine Learning with scikit-learn》
《Scala_Data_Analysis_Cookbook》