In an earlier article we used PCA (principal component analysis), an unsupervised dimensionality-reduction method. Here we read through how Spark implements PCA. Spark is written mainly in Scala, and Scala is a rather peculiar language: the RDD-based code is still fairly readable, but Spark SQL (and the DataFrame-based spark.ml API built on it) is far more abstract and harder to follow. Fortunately, the spark.ml PCA is itself implemented on top of the RDD-based code.
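Before diving into the source, here is a minimal usage sketch of the DataFrame-based API; it assumes an existing SparkSession named spark, and the data is just a toy example:

import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors

// Toy data: three 5-dimensional feature vectors.
val data = Seq(
  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0),
  Vectors.dense(6.0, 1.0, 8.0, 0.0, 8.0)
)
val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

// Project the 5-dimensional vectors onto the top 3 principal components.
val pcaModel = new PCA()
  .setInputCol("features")
  .setOutputCol("pcaFeatures")
  .setK(3)
  .fit(df)

pcaModel.transform(df).select("pcaFeatures").show(false)

The fit call is where the real work happens, and its implementation in org.apache.spark.ml.feature.PCA is short: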
import org.apache.spark.mllib.feature

override def fit(dataset: Dataset[_]): PCAModel = {
  transformSchema(dataset.schema, logging = true)
  // Pull the input column out as an RDD of old-style mllib vectors.
  val input = dataset.select($(inputCol)).rdd.map {
    case Row(v: Vector) => OldVectors.fromML(v)
  }
  // Delegate the actual computation to the RDD-based mllib implementation.
  val pca = new feature.PCA(k = $(k))
  val pcaModel = pca.fit(input)
  // Wrap the mllib result back into an ml PCAModel.
  copyValues(new PCAModel(uid, pcaModel.pc.asML, pcaModel.explainedVariance.asML)
    .setParent(this))
}
So fit simply converts the input column to an RDD of vectors and calls mllib.feature.PCA.
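For reference, that RDD-based class can also be used directly; a minimal sketch, assuming a live SparkContext named sc:

import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

val rows: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0),
  Vectors.dense(6.0, 1.0, 8.0, 0.0, 8.0)
))

// Here k is passed to the constructor rather than through a setter.
val pcaModel = new PCA(3).fit(rows)
val projected = pcaModel.transform(rows) // RDD of 3-dimensional vectors

Inside mllib.feature.PCA.fit, the interesting part is how the RowMatrix is built before the SVD: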
val mat = if (numFeatures > 65535) {
  // More than 65535 features: mean-center each row explicitly
  // (BLAS.axpy updates the row in place) and wrap the centered
  // rows in a distributed RowMatrix.
  val summary = Statistics.colStats(sources.map((_, 1.0)), Seq("mean"))
  val mean = Vectors.fromML(summary.mean)
  val meanCenteredRdd = sources.map { row =>
    BLAS.axpy(-1, mean, row)
    row
  }
  new RowMatrix(meanCenteredRdd)
} else {
  // Otherwise, check up front that k and numFeatures are small enough
  // for the SVD computation to fit in memory.
  require(PCAUtil.memoryCost(k, numFeatures) < Int.MaxValue,
    "The param k and numFeatures is too large for SVD computation. " +
    "Try reducing the parameter k for PCA, or reduce the inpu