// Ad-hoc spark-shell snippet: Pearson correlation matrix over 20 transaction features.
val df = spark.sql("select * from table")

// Before computing correlations, nulls must be filled and the features assembled
// into a single vector column. Single source of truth for the 20 feature columns
// (the same list was previously pasted three times: fill list, select list, and
// the assembler input array).
val feaFillZeroSeq = Seq(
  "cnt_trs_cash", "cnt_trs_cash_prop", "sum_trs_cash", "sum_trs_cash_prop",
  "cnt_trs_atm", "cnt_trs_atm_prop", "sum_trs_atm", "sum_trs_atm_prop",
  "cnt_trs_kj", "cnt_trs_kj_prop", "sum_trs_kj", "sum_trs_kj_prop",
  "cnt_trs_auto", "cnt_trs_auto_prop", "sum_trs_auto", "sum_trs_auto_prop",
  "cnt_trs_inter_bank", "cnt_trs_inter_bank_prop", "sum_trs_inter_bank", "sum_trs_inter_bank_prop"
)

// Keep only the feature columns and replace nulls with 0.0 in all of them.
val dfFill = df.select(feaFillZeroSeq.head, feaFillZeroSeq.tail: _*).na.fill(0.0, feaFillZeroSeq)

// Assemble all feature columns into one vector column for Correlation.corr.
val arr = feaFillZeroSeq.toArray
val assembler = new VectorAssembler().setInputCols(arr).setOutputCol("features")
val output = assembler.transform(dfFill)

// Pearson correlation matrix. (Originally ".head" and the rowHeaders definition
// were fused onto one line, which does not compile.)
val Row(coeff1: Matrix) = Correlation.corr(output, "features").head
val rowHeaders = arr
val colHeaders = arr

println("相关性矩阵:")
// Column header row, each label right-padded to 7 characters.
print("\t")
for (j <- colHeaders.indices) {
  print(f"${colHeaders(j)}%7s")
}
println()
// One row per feature: row label, then each correlation formatted to 2 decimals.
for (i <- 0 until coeff1.numRows) {
  print(f"${rowHeaders(i)}%5s")
  for (j <- 0 until coeff1.numCols) {
    val formattedValue = "%.2f".format(coeff1(i, j))
    print(f"$formattedValue%7s")
  }
  println()
}
// Paste the printed matrix into Excel: Data → Text to Columns to split it,
// then Home → Conditional Formatting → Color Scales to visualize it.
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg._

/**
 * Local Spark job: builds a small in-memory dataset (caffeine content, service
 * time, coffee price, customer satisfaction) and prints its Pearson correlation
 * matrix as a tab-aligned table.
 */
object spark_task1 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).appName("task1").getOrCreate()
    import spark.implicits._

    // Sample rows: (caffeine mg, service minutes, price yuan, satisfaction score).
    val data = Seq((80, 5, 15, 8), (50, 4, 12, 7), (70, 6, 14, 8), (90, 5, 16, 9), (60, 4, 11, 7), (85, 6, 18, 9))

    // Single definition of the feature column names (previously duplicated three
    // times: toDF, assembler inputs, and the row/column headers).
    val featureCols = Array("咖啡因含量(mg)", "服务时间(分钟)", "咖啡价格(元)", "顾客满意度评分")

    val df = data.toDF(featureCols: _*)

    // Assemble the feature columns into a single vector column for Correlation.corr.
    val assembler = new VectorAssembler()
      .setInputCols(featureCols)
      .setOutputCol("features")
    val output = assembler.transform(df)

    // Pearson correlation matrix over the assembled vector column.
    val Row(coeff1: Matrix) = Correlation.corr(output, "features").head
    val rowHeaders = featureCols
    val colHeaders = featureCols

    println("相关性矩阵:")
    // Column header row, each label right-padded to 7 characters.
    print("\t")
    for (j <- colHeaders.indices) {
      print(f"${colHeaders(j)}%7s")
    }
    println()
    // One row per feature: row label, then each correlation formatted to 2 decimals.
    for (i <- 0 until coeff1.numRows) {
      print(f"${rowHeaders(i)}%5s")
      for (j <- 0 until coeff1.numCols) {
        val formattedValue = "%.2f".format(coeff1(i, j))
        print(f"$formattedValue%7s")
      }
      println()
    }

    // Release Spark resources.
    spark.stop()
  }
}