Spark ML: DataFrame-based Linear Regression
Official docs: https://spark.apache.org/docs/2.4.5/ml-guide.html
1. Predicting Boston house prices with linear regression
package com.yyds.tags.ml.regression
import org.apache.spark.ml.linalg
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.storage.StorageLevel
/**
* Predict Boston house prices with linear regression.
*/
object BostonRegression {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkSession.builder()
.appName(this.getClass.getSimpleName.stripSuffix("$"))
.master("local[4]")
.config("spark.sql.shuffle.partitions",4)
.getOrCreate()
import spark.implicits._
// TODO step1 -> read the data
val houseDS: Dataset[String] = spark.read
.textFile("datas/housing/housing.data")
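// housing.data is whitespace-separated with 14 numeric columns:
// 13 features (CRIM ... LSTAT) followed by the target MEDV (median house value)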
.filter(line => null != line && line.trim.split("\\s+").length == 14)
val frame: DataFrame = houseDS.mapPartitions {
iter =>
iter.map {
line =>
val parts = line.trim.split("\\s+")
// extract the label (the last column, MEDV)
val label: Double = parts(parts.length - 1).toDouble
// extract the features (the remaining 13 columns)
val features: linalg.Vector = Vectors.dense(parts.dropRight(1).map(_.toDouble))
(features, label)
}
}.toDF("features", "label")
frame.printSchema()
frame.show(10,truncate = false)
// TODO step2 -> split into training and test sets (80/20)
val Array(trainingDF,testingDF) = frame.randomSplit(Array(0.8,0.2))
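// note: randomSplit is nondeterministic; pass a seed, e.g.
// frame.randomSplit(Array(0.8, 0.2), seed = 123L), if the split must be reproducible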
trainingDF.persist(StorageLevel.MEMORY_AND_DISK).count() // trigger caching
// TODO step3 -> train the model
val lr = new LinearRegression()
// set the feature and label column names
.setFeaturesCol("features")
.setLabelCol("label")
.setStandardization(true) // standardize the features before fitting
// set hyperparameters
.setMaxIter(20) // maximum number of iterations
.setSolver("auto") // "auto" chooses between the normal equation and L-BFGS
val lrModel = lr.fit(trainingDF)
// TODO step4 -> evaluate the model (training summary)
println("coefficients = " + lrModel.coefficients)
println("intercept = " + lrModel.intercept)
val trainingSummary = lrModel.summary
val rootMeanSquaredError = trainingSummary.rootMeanSquaredError
println("rootMeanSquaredError = " + rootMeanSquaredError)
// TODO step5 -> predict on the test set
lrModel.transform(testingDF).show(10,truncate = false)
// keep the application alive so the Spark web UI can be inspected
Thread.sleep(10000000L)
spark.stop()
}
}
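The RMSE above is computed on the training data only. Below is a minimal sketch of scoring the held-out test set with RegressionEvaluator, reusing lrModel and testingDF from the listing above (the column names and metric follow the Spark 2.4.5 API):
import org.apache.spark.ml.evaluation.RegressionEvaluator
// score the held-out test set
val predictions = lrModel.transform(testingDF)
// compare the "prediction" column against the "label" column
val evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("rmse")
println("test RMSE = " + evaluator.evaluate(predictions))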
2. Reading the iris dataset: assembling the features vector and indexing the label
package com.yyds.tags.ml.features
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{StandardScaler, StringIndexer, VectorAssembler}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructType}
/**
* Read the iris dataset, assemble the feature vector (features) and index the label (label).
*/
object IrisFeaturesDemo {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkSession.builder()
.appName(this.getClass.getSimpleName.stripSuffix("$"))
.master("local[4]")
.config("spark.sql.shuffle.partitions",4)
.getOrCreate()
import spark.implicits._
// TODO step1 -> read the data
val irisSchema: StructType = new StructType()
.add("sepal_length",DoubleType,nullable = true)
.add("sepal_width",DoubleType,nullable = true)
.add("petal_length",DoubleType,nullable = true)
.add("petal_width",DoubleType,nullable = true)
.add("category",StringType, nullable = true)
val irisDF: DataFrame = spark.read
.option("sep", ",")
// the file has no header row, so supply the schema explicitly instead of inferring it
.option("header", "false")
.option("inferSchema", "false")
.schema(irisSchema)
.csv("datas/iris/iris.data")
irisDF.printSchema()
irisDF.show(10, truncate = false)
// TODO step2 -> assemble the four feature columns into a single vector
val assembler: VectorAssembler = new VectorAssembler()
.setInputCols(irisDF.columns.dropRight(1))
.setOutputCol("features")
val df = assembler.transform(irisDF)
// df.printSchema()
// df.show(10, truncate = false)
// TODO step3 -> index the string label as a numeric column
val indexer = new StringIndexer()
.setInputCol("category") // column to index
.setOutputCol("label") // output column holding the numeric label
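// StringIndexer maps each distinct category string to a double index,
// ordered by frequency (the most frequent label becomes 0.0)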
val indexedDF: DataFrame = indexer
.fit(df)
.transform(df)
indexedDF.printSchema()
indexedDF.show(10,truncate = false)
// TODO step4 -> standardize the features with StandardScaler
val scaler = new StandardScaler()
.setInputCol("features")
.setOutputCol("scaled_features")
.setWithStd(true)
.setWithMean(false)
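// withMean(false) scales to unit standard deviation without centering,
// which avoids densifying sparse vectors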
// Compute summary statistics by fitting the StandardScaler.
val scalerModel = scaler.fit(indexedDF)
// Normalize each feature to have unit standard deviation.
val scaledDataDF: DataFrame = scalerModel.transform(indexedDF)
scaledDataDF.printSchema()
scaledDataDF.show(100,truncate = false)
// TODO step5 -> choose a classification algorithm and build the model
val lr: LogisticRegression = new LogisticRegression()
// set the feature and label column names
.setFeaturesCol("scaled_features")
.setLabelCol("label")
// set hyperparameters
.setMaxIter(20) // maximum number of iterations
.setStandardization(false) // the features were already standardized in step4
.setFamily("multinomial") // binomial vs. multinomial; iris has three classes
// TODO step6 -> train the model
val lrModel = lr.fit(scaledDataDF)
// TODO step7 -> inspect the fitted model
// note: coefficientMatrix holds the model coefficients; it is not a confusion matrix
println(s"multinomial coefficient matrix:\n${lrModel.coefficientMatrix}")
// keep the application alive so the Spark web UI can be inspected
Thread.sleep(10000000L)
spark.stop()
}
}
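The four stages above (assembler, indexer, scaler, lr) can also be chained into a single Pipeline, which runs fit and transform in the right order and makes the whole flow reusable. A minimal sketch, reusing the stage variables and irisDF from the listing above:
import org.apache.spark.ml.Pipeline
// chain the feature stages and the classifier into one estimator
val pipeline = new Pipeline()
.setStages(Array(assembler, indexer, scaler, lr))
// fit applies each stage in order to the raw iris DataFrame
val pipelineModel = pipeline.fit(irisDF)
pipelineModel.transform(irisDF)
.select("features", "label", "prediction")
.show(10, truncate = false)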