使用 Scala(Spark ML)计算相关系数矩阵

// Spark-shell snippet: prints the correlation matrix of a set of numeric feature columns.
// Expects a SparkSession named `spark` to be in scope (as spark-shell provides).
// The imports are repeated here so the snippet can be pasted on its own.
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

val df = spark.sql("select * from table")

// Before computing correlations the missing values must be filled and the
// columns assembled into a single vector column. The feature columns are
// listed exactly once and reused for select / fill / assemble / headers.
val feaFillZeroSeq = Seq(
  "cnt_trs_cash", "cnt_trs_cash_prop", "sum_trs_cash", "sum_trs_cash_prop",
  "cnt_trs_atm", "cnt_trs_atm_prop", "sum_trs_atm", "sum_trs_atm_prop",
  "cnt_trs_kj", "cnt_trs_kj_prop", "sum_trs_kj", "sum_trs_kj_prop",
  "cnt_trs_auto", "cnt_trs_auto_prop", "sum_trs_auto", "sum_trs_auto_prop",
  "cnt_trs_inter_bank", "cnt_trs_inter_bank_prop",
  "sum_trs_inter_bank", "sum_trs_inter_bank_prop"
)
val arr = feaFillZeroSeq.toArray

// Keep only the feature columns and replace nulls/NaNs in them with 0.0.
val dfFill = df.select(arr.head, arr.tail: _*).na.fill(0.0, feaFillZeroSeq)

// Kept on one line so it also works when pasted line-by-line into the REPL.
val assembler = new VectorAssembler().setInputCols(arr).setOutputCol("features")
val output = assembler.transform(dfFill)

// The first row of the result is destructured into the correlation matrix.
val Row(coeff1: Matrix) = Correlation.corr(output, "features").head

val rowHeaders = arr
val colHeaders = arr
println("相关性矩阵:")

// Emit tab-delimited rows: a fixed-width pad such as %7s is only a minimum
// width, so longer column names would otherwise be printed with no separator.
// The leading empty cell is the top-left corner above the row headers.
println(("" +: colHeaders.toSeq).mkString("\t"))
for (i <- 0 until coeff1.numRows) {
  // Each coefficient is rendered with two decimal places.
  val cells = (0 until coeff1.numCols).map(j => "%.2f".format(coeff1(i, j)))
  println((rowHeaders(i) +: cells).mkString("\t"))
}

将打印出的矩阵复制到 Excel:如果各列没有自动分开,先用「数据 → 分列」拆分成列;再用「开始 → 条件格式 → 色阶」按数值着色,便于直观查看相关性强弱。

import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg._

/** Small local Spark job that prints the correlation matrix of a toy coffee-shop dataset.
  *
  * Builds a four-column DataFrame, assembles the columns into a `features` vector,
  * computes the correlation matrix with `Correlation.corr`, and prints it as a
  * tab-delimited table with row and column headers.
  */
object spark_task1 {

  /** Entry point.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).appName("task1").getOrCreate()
    import spark.implicits._

    try {
      // Column names, defined once and reused as DataFrame columns,
      // assembler inputs, and row/column headers of the printed matrix.
      val featureNames = Array("咖啡因含量(mg)", "服务时间(分钟)", "咖啡价格(元)", "顾客满意度评分")

      // Build the dataset.
      val data = Seq(
        (80, 5, 15, 8),
        (50, 4, 12, 7),
        (70, 6, 14, 8),
        (90, 5, 16, 9),
        (60, 4, 11, 7),
        (85, 6, 18, 9)
      )

      // Convert the data to a DataFrame.
      val df = data.toDF(featureNames: _*)

      // Use VectorAssembler to turn the feature columns into one vector column.
      val assembler = new VectorAssembler()
        .setInputCols(featureNames)
        .setOutputCol("features")
      val output = assembler.transform(df)

      // Compute the correlation matrix; the first row of the result holds it.
      val Row(coeff1: Matrix) = Correlation.corr(output, "features").head

      // Row and column titles.
      val rowHeaders = featureNames
      val colHeaders = featureNames

      // Print the correlation matrix.
      println("相关性矩阵:")

      // Column titles. Rows are tab-delimited because a fixed-width pad such as
      // %7s is only a minimum width and lets longer headers run together.
      println(("" +: colHeaders.toSeq).mkString("\t"))

      // Row title followed by that row's coefficients, two decimal places each.
      for (i <- 0 until coeff1.numRows) {
        val cells = (0 until coeff1.numCols).map(j => "%.2f".format(coeff1(i, j)))
        println((rowHeaders(i) +: cells).mkString("\t"))
      }
    } finally {
      // Stop the SparkSession even if the job above throws.
      spark.stop()
    }
  }
}
  • 8
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值