import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, functions}
import org.apache.spark.sql.types.{DoubleType, FloatType, StringType}
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.ml.feature.VectorAssembler
import scopt.OptionParser
import scala.collection.mutable
......
部分程序
......
// Assemble the comma-separated feature column names into a single "features" vector column.
val vectorAssembler = new VectorAssembler().setInputCols(featuresToModel.split("\\,",-1)).setOutputCol("features")
// data2 with the assembled "features" column appended; input to every model branch below.
var data3 = vectorAssembler.transform(data2)
// Per-feature importance scores; filled in by whichever model branch is selected.
var importances = Array[Double]()
1 logisticRegression
LR模型公式:
方法:
model.coefficients 生成每个特征的系数 即公式中θi
model.intercept 生成截距 即公式中θ0
import org.apache.spark.ml.classification.{LogisticRegression, RandomForestClassifier}
// Feature importances via logistic regression: each coefficient θ_i is taken as
// the importance of feature i (the intercept θ_0 is not used).
// NOTE(review): this branch tests bare `model_select` while the RandomForest and
// XGBoost branches test `p.model_select` — confirm which variable is intended.
if (model_select == "LogisticRegression") {
  val maxIter: Int = 1000       // upper bound on optimizer iterations
  val regParam: Double = 0.001  // regularization strength
  val lr = new LogisticRegression()
    .setMaxIter(maxIter)
    .setRegParam(regParam)
    .setFamily("binomial")
    .setProbabilityCol("predict_vector")
  // Keep only the assembled feature vector and the label for training.
  val data4 = data3.select("features", "label")
  val model = lr.fit(data4)
  // model.coefficients -> θ_i per feature (model.intercept would be θ_0).
  importances = model.coefficients.toArray
}
2 randomforest
import org.apache.spark.ml.classification.{LogisticRegression, RandomForestClassifier}
// Feature importances from a random forest: Spark accumulates impurity-based
// importance per feature across all trees (featureImportances sums to 1).
if (p.model_select == "RandomForest") {
  val rf = new RandomForestClassifier()
    .setFeaturesCol("features")
    .setLabelCol("label")
    .setNumTrees(100) // ensemble size
    .setMaxDepth(5)   // shallow trees to curb overfitting
    .setMaxBins(100)  // max bins when discretizing continuous features
  val rfModel = rf.fit(data3)
  importances = rfModel.featureImportances.toArray
}
3 xgboost
在xgboost4j 0.8版本 以前 我的版本是0.72
getFeatureScore()方法
将返回特征重要性映射（类型：Map[String, Integer]），其中键是特征索引（例如：f0, f1, f2 …，特征索引与您用来训练模型的数据集中的特征顺序相同），整数是特征分数（通过在训练模型中累积此特征的信息增益来计算）。
参考链接xgboost4j_0.72 API | xgboost4j_0.72 readthedocs
0.8版本以后
xgbModel.nativeBooster.getScore("", "gain")
支持: ["gain", "cover", "total_gain", "total_cover"] 四种计算方式
参考链接xgboost4j_0.82 API | xgboost4j_0.82 readthedocs
import ml.dmlc.xgboost4j.scala.spark.{XGBoost}
// Feature importances via XGBoost's per-feature score (accumulated information
// gain, keyed "f0", "f1", ... in training-column order).
// NOTE(review): trainWithDataFrame is the pre-0.8 xgboost4j API; 0.8+ replaced it
// with XGBoostClassifier and nativeBooster.getScore("", "gain").
if (p.model_select == "XGBoost") {
  // Hyper-parameters for a binary logistic-objective booster.
  val xgbParam = Map(
    "eta" -> 0.1,                     // learning rate
    "max_depth" -> 3,
    "objective" -> "binary:logistic",
    "colsample_bytree" -> 0.8,        // column subsample per tree
    "colsample_bylevel" -> 0.8,       // column subsample per level
    "subsample" -> 0.8                // row subsample per boosting round
  )
  val numRound = 100 // number of boosting rounds
  val model = XGBoost.trainWithDataFrame(data3, xgbParam, round = numRound,
    nWorkers = 3, featureCol = "features", labelCol = "label")
  // Feature column names in assembly order, excluding the label and the vector column.
  val dataname = data3.columns.filter(c => c != "label" && c != "features")
  // Sparse map "f<i>" -> score; features never used in a split are absent.
  val featureScoreMap = model.booster.getFeatureScore()
  importances = mapToFeature(dataname, featureScoreMap)
}
/**
 * Aligns XGBoost's sparse feature-score map with the full, ordered feature list.
 *
 * XGBoost reports scores keyed by positional names ("f0", "f1", ...) and omits
 * features that were never used in a split; those receive an importance of 0.0.
 *
 * @param dataname        feature column names, in training order
 * @param featureScoreMap scores keyed by "f<index>", index matching `dataname` order
 * @return one importance per entry of `dataname`, in the same order
 */
def mapToFeature(dataname: Array[String], featureScoreMap: mutable.Map[String, Integer]): Array[Double] = {
  // Look each positional key up directly in the map (O(1) per feature) instead of
  // scanning a materialized key array as before (which was O(n) per feature, O(n²) total).
  dataname.indices.map { i =>
    featureScoreMap.get(s"f$i").fold(0.0)(_.toDouble)
  }.toArray
}