// -------------------------------------------------------------------------------------
// 笔者追求算法实现,不喜欢大篇幅叙述原理;有关 LDA(线性判别分析)的理论,推荐查看这篇博客:
// https://www.cnblogs.com/pinard/p/6244265.html
// -------------------------------------------------------------------------------------
import breeze.linalg.DenseMatrix
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{LabeledPoint,VectorAssembler}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ArrayBuffer
/**
 * Linear discriminant analysis (LDA), used here to lower the dimension of a
 * labelled data set.
 *
 * Data source: http://archive.ics.uci.edu/ml/datasets/Wine
 *
 * @author XiaoTangBao
 * @date 2019/4/24 10:32
 * @version 1.0
 */
object LDA {

  /** Input file that is read when no path is given on the command line. */
  private val DefaultDataPath = "G:\\mldata\\wine.data"

  /** Names of the 13 feature columns, in the order they follow the label in each input line. */
  private val FeatureNames = Array("Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium",
    "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins", "Color intensity",
    "Hue", "OD280/OD315 of diluted wines", "Proline")

  /**
   * Reads the comma-separated input file, assembles the feature columns into a
   * vector column, projects the data onto two discriminant axes with [[run]] and
   * prints every first coordinate, an empty line, then every second coordinate.
   *
   * @param args optional; when present, `args(0)` replaces the default input path
   */
  def main(args: Array[String]): Unit = {
    // Only show Spark log messages at ERROR level.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    val spark = SparkSession.builder().master("local[4]").appName("LDA").getOrCreate()
    try {
      val inputPath = args.headOption.getOrElse(DefaultDataPath)

      // The schema is derived from the feature names so the two cannot drift apart.
      val columnNames = "label" +: FeatureNames
      val columnCount = columnNames.length
      val schema = StructType(columnNames.map(name => StructField(name, DoubleType, nullable = true)))

      val rows = spark.sparkContext.textFile(inputPath)
        .map(_.trim)
        // A blank (e.g. trailing) line would otherwise fail in toDouble.
        .filter(_.nonEmpty)
        .map { line =>
          val values = line.split(",").map(_.toDouble)
          require(values.length >= columnCount,
            s"expected at least $columnCount values but found ${values.length}: $line")
          Row.fromSeq(values.take(columnCount).toSeq)
        }
      val rawData = spark.createDataFrame(rows, schema)

      // Pack the feature columns into a single "features" vector column, as run expects.
      val assembler = new VectorAssembler().setInputCols(FeatureNames).setOutputCol("features")
      val labelled = assembler.transform(rawData).select("label", "features")

      val projected = run(labelled, 2)
      val points = (0 until projected.cols).map(i => (projected(0, i), projected(1, i)))
      points.foreach(point => println(point._1))
      println()
      points.foreach(point => println(point._2))
    } finally {
      // Release the local Spark resources even when loading or training fails.
      spark.stop()
    }
  }

  /**
   * Entry point of model training: computes the LDA projection and applies it to
   * the training data.
   *
   * The rows are collected to the driver, the within-class scatter `Sw` and the
   * between-class scatter `Sb` are accumulated per label, and the eigenvectors of
   * `inv(Sw) * Sb` belonging to the `nb` largest eigenvalues form the projection.
   * The projection is applied to the samples as collected (they are not centred).
   *
   * @param df training data with a numeric `label` column and a Spark ML vector
   *           column `features`
   * @param nb number of dimensions to keep; must be positive
   * @return matrix with one column per sample and one row per kept dimension
   *         (at most `nb` rows, fewer if the data has fewer features)
   * @throws IllegalArgumentException if `nb` is not positive, `df` has no rows,
   *                                  or `features` does not hold ML vectors
   */
  def run(df: DataFrame, nb: Int): DenseMatrix[Double] = {
    import breeze.linalg._

    require(nb > 0, s"nb must be positive, got $nb")

    // Optional standardisation, left disabled exactly as before:
    // val stdf = new StandardScaler().setInputCol("features").setOutputCol("Scaledfeatures")
    //   .setWithMean(true).setWithStd(true).fit(df).transform(df)
    //   .select("label","Scaledfeatures")
    //   .withColumnRenamed("Scaledfeatures","features")

    // Read label and features together and through the typed Row API, so each
    // label stays attached to its own sample and sparse vectors are handled too.
    val samples: Array[(Double, Array[Double])] = df.select("label", "features").rdd.map { row =>
      val label = row.get(0) match {
        case number: java.lang.Number => number.doubleValue()
        case other => other.toString.toDouble
      }
      val features = row.get(1) match {
        case vector: org.apache.spark.ml.linalg.Vector => vector.toArray
        case other =>
          throw new IllegalArgumentException(s"features column must hold ML vectors, found: $other")
      }
      (label, features)
    }.collect()
    require(samples.nonEmpty, "training data must contain at least one row")

    val featureCount = samples.head._2.length
    val sampleCount = samples.length

    // Breeze matrices are column-major, so concatenating the samples gives a
    // featureCount x sampleCount matrix with one sample per column.
    val dataMatrix = new DenseMatrix[Double](featureCount, sampleCount, Array.concat(samples.map(_._2): _*))
    // Mean of every feature over all samples.
    val globalMean = sum(dataMatrix, Axis._1) * (1.0 / sampleCount)

    // One (within-class, between-class) scatter pair per distinct label.
    val classLabels = samples.map(_._1).distinct
    val scatters = classLabels.map { label =>
      val members = samples.collect { case (l, features) if l == label => features }
      val classCount = members.length
      val centered = new DenseMatrix[Double](featureCount, classCount, Array.concat(members: _*))
      val classMean = sum(centered, Axis._1) * (1.0 / classCount)
      // Centre the samples of this class on the class mean.
      for (j <- 0 until classCount) centered(::, j) := centered(::, j) - classMean
      val within = centered * centered.t
      // Offset of the class mean from the global mean, as a column matrix.
      val offset = (classMean - globalMean).toDenseMatrix.t
      val between = (offset * classCount.toDouble) * offset.t
      (within, between)
    }
    val totalWithin = scatters.foldLeft(DenseMatrix.zeros[Double](featureCount, featureCount))(
      (acc, pair) => acc + pair._1)
    val totalBetween = scatters.foldLeft(DenseMatrix.zeros[Double](featureCount, featureCount))(
      (acc, pair) => acc + pair._2)

    // Decompose inv(Sw) * Sb once; each COLUMN of eigenvectors is one eigenvector.
    val decomposition = eig(inv(totalWithin) * totalBetween)
    val eigenvalues = decomposition.eigenvalues
    val eigenvectors = decomposition.eigenvectors

    // Indices of the largest eigenvalues, largest first (ascending sort, then
    // reversed, which also fixes the order among equal eigenvalues).
    val ranked = (0 until eigenvalues.length).sortBy(i => eigenvalues(i)).reverse.take(nb)

    // Each row of the projection matrix is one selected eigenvector.
    val projection = DenseMatrix.tabulate[Double](ranked.length, featureCount) { (r, c) =>
      eigenvectors(c, ranked(r))
    }

    // Data set after dimensionality reduction.
    projection * dataMatrix
  }
}
// 根据实验结果数据绘制的图像如下图所示:
// 未做标准化时,该结果与 Python 直接调用 LDA 方法的结果相差较大(横纵坐标不一致)。
// 对数据做标准化处理后(即启用 run 中被注释掉的 StandardScaler 代码),两者结果(横纵坐标 + 形状)完全一致。
// 实验结果表明,笔者的算法实现是正确的。