package com.dream.ml.regression
import org.apache.spark.ml.linalg
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.storage.StorageLevel
/**
* @title: LrBostonRegression
* @projectName SparkStudy
* @description:
* 使用线性回归算法对波士顿房价数据集构建回归模型,评估模型性能
* @author MXH
* @date 2023/9/4 11:35
*/
/**
 * Trains a linear-regression model on the Boston housing data set,
 * evaluates it (RMSE / R2), saves the model to disk and reloads it to
 * verify that predictions survive a save/load round trip.
 */
object LrBostonRegression {

  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession runtime (local mode, 4 cores).
    val spark: SparkSession = {
      SparkSession
        .builder()
        .appName(this.getClass.getSimpleName.stripSuffix("$"))
        .master("local[4]")
        .config("spark.sql.shuffle.partitions", "4")
        .getOrCreate()
    }
    // Implicit conversions: encoders, toDF, etc.
    import spark.implicits._

    // 2. Load the raw data: one whitespace-separated record per line.
    val bostonOriginDF: Dataset[String] = spark.read.textFile(path = "datas/housing/housing.data")

    // Data cleaning: keep only non-null lines with exactly 14 fields
    // (13 features + 1 label).
    val bostonPriceDF: Dataset[String] = bostonOriginDF.filter { line =>
      null != line && line.trim.split("\\s+").length == 14
    }

    // 3. Extract the feature vector and the label (last column = house price).
    val bostonDF: DataFrame = bostonPriceDF.mapPartitions { iter =>
      iter.map { line =>
        val parts: Array[String] = line.trim.split("\\s+")
        // The last column is the label.
        val label: Double = parts(parts.length - 1).toDouble
        // The remaining 13 columns are the features.
        val values: Array[Double] = parts.dropRight(1).map(_.toDouble)
        val features: linalg.Vector = Vectors.dense(values)
        // Return a (features, label) pair.
        (features, label)
      }
    }
      .toDF("features", "label")
    /*
    root
     |-- features: vector (nullable = true)
     |-- label: double (nullable = false)
    +----------------------------------------------------------------------------+-----+
    |features                                                                    |label|
    +----------------------------------------------------------------------------+-----+
    |[0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]    |24.0 |
    */
    bostonDF.printSchema()
    bostonDF.show(10, truncate = false)

    // 4. Feature scaling (normalization / standardization): LinearRegression
    //    standardizes features itself (setStandardization(true) below), so no
    //    explicit scaler stage is needed here.
    // Split the data set into training (80%) and testing (20%) subsets.
    val Array(trainingDF, testingDF) = bostonDF.randomSplit(weights = Array(0.8, 0.2), seed = 123L)

    // Cache the training set and trigger materialization with count().
    // BUG FIX: the original code persisted testingDF although the comment said
    // "training dataset"; fit() scans the training set once per iteration
    // (up to maxIter = 20 times), so it is the one that must be cached.
    trainingDF.persist(StorageLevel.MEMORY_AND_DISK).count()
    // The testing set is scanned twice below (two transform/show calls),
    // so caching it is worthwhile as well.
    testingDF.persist(StorageLevel.MEMORY_AND_DISK).count()

    // 5. Build the estimator.
    val lr: LinearRegression = new LinearRegression()
      // Label column name.
      .setLabelCol("label")
      // Features column name.
      .setFeaturesCol("features")
      // Hyper-parameters:
      .setStandardization(true) // standardize features before fitting, default true
      .setMaxIter(20)           // maximum iterations, default 100
      .setSolver("auto")        // optimization solver: "l-bfgs", "normal" or "auto" (default)
      .setRegParam(0.0)         // regularization parameter, default 0.0
      //.setRegParam(0.5)
      .setElasticNetParam(0.0)  // ElasticNet mixing parameter; 0.0 = pure L2 penalty (default)
      //.setElasticNetParam(0.5)

    // 6. Train the model.
    val lrModel: LinearRegressionModel = lr.fit(trainingDF)

    // 7. Evaluate the model.
    // Print the coefficients (slope, k) and intercept (b) of the fitted line.
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
    val trainingSummary = lrModel.summary
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")

    // 8. Predict on the held-out test set.
    lrModel.transform(testingDF).show(10, truncate = false)

    // 9. Save the model, then reload it and verify it still predicts.
    val modelPath = s"datas/housing/lr-model-${System.nanoTime()}"
    lrModel.save(modelPath)
    val model: LinearRegressionModel = LinearRegressionModel.load(modelPath)
    model.transform(testingDF).show(10, truncate = false)

    // Release cached data and shut down the session.
    trainingDF.unpersist()
    testingDF.unpersist()
    spark.stop()
  }
}
// Spark线性回归算法-波士顿房价预测 (Spark linear regression: Boston housing price prediction)
// 最新推荐文章于 2023-10-09 17:28:22 发布 — stray blog footer from the source article; kept as a comment so the file compiles