找了一下,没有找到现成的代码,于是自己写了一个;速度还可以接受,可以用来做简单的特征选择(信息熵、条件熵、信息增益、信息增益率)。
import org.apache.spark.rdd.RDD
/**
 * Entropy-based feature-selection metrics computed on Spark RDDs.
 *
 * All logarithms are base 2, so results are expressed in bits. Every method
 * runs several Spark jobs over its input (a `count()` plus at least one
 * aggregation), so callers should `cache()`/`persist()` the input RDD when it
 * is expensive to recompute.
 *
 * The closures below only capture local values (never `this`), because this
 * class is not `Serializable`.
 */
class FeatureSelect {

  /**
   * Information entropy H(V) of the empirical distribution of the values in `data`.
   *
   * @param data one value per record
   * @return the entropy in bits; 0.0 for an empty RDD
   */
  def entropyRDD(data: RDD[String]): Double = {
    val total = data.count()
    if (total == 0L) 0.0
    else {
      // Hoisted so the constant is not recomputed for every distinct value.
      val ln2 = Math.log(2)
      data
        .map(value => (value, 1L)) // Long counts: a single value may occur more than Int.MaxValue times
        .reduceByKey(_ + _)
        .map { case (_, count) =>
          val p = count.toDouble / total
          -p * (Math.log(p) / ln2)
        }
        .sum()
    }
  }

  /**
   * Conditional entropy H(label | feature).
   *
   * Computed as the sum over feature values x of P(x) * H(label | feature = x).
   *
   * @param data records of (label, feature)
   * @return the conditional entropy in bits; 0.0 for an empty RDD
   */
  def conditionalEntropy(data: RDD[(String, String)]): Double = {
    val total = data.count()
    if (total == 0L) 0.0
    else {
      val ln2 = Math.log(2)
      data
        .map { case (label, feature) => ((feature, label), 1L) }
        .reduceByKey(_ + _)
        // Only the per-(feature, label) counts are needed from here on; the label itself is not.
        .map { case ((feature, _), count) => (feature, count) }
        // One group per feature value; each group holds one count per distinct label.
        .groupByKey()
        .map { case (_, labelCounts) =>
          val featureTotal = labelCounts.sum
          val entropyGivenFeature = labelCounts.iterator.map { count =>
            val p = count.toDouble / featureTotal
            -p * (Math.log(p) / ln2)
          }.sum
          // Weight the per-feature-value entropy by P(feature = x).
          featureTotal.toDouble / total * entropyGivenFeature
        }
        .sum()
    }
  }

  /**
   * Information gain of the feature with respect to the label:
   * H(label) - H(label | feature).
   *
   * @param data records of (label, feature)
   * @return the information gain in bits
   */
  def infoGain(data: RDD[(String, String)]): Double =
    entropyRDD(data.map(_._1)) - conditionalEntropy(data)

  /**
   * Information gain ratio: infoGain divided by the entropy of the feature itself.
   *
   * When the feature entropy is zero (empty input, or a feature with a single
   * distinct value) the ratio is mathematically undefined (0 / 0); 0.0 is
   * returned instead of NaN, since such a feature carries no information.
   *
   * @param data records of (label, feature)
   * @return the gain ratio, or 0.0 when the feature entropy is zero
   */
  def infoRatio(data: RDD[(String, String)]): Double = {
    val featureEntropy = entropyRDD(data.map(_._2))
    if (featureEntropy > 0.0) infoGain(data) / featureEntropy else 0.0
  }
}