###### 计算信息增益（Information Gain），考虑交叉feature


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

/**
*
* @author qibaoyuan
*
*/
public class InformationGain {

/**
* calculate the info(entrophy from a list of classes)
*
* @param classes
*            字符类型的分类信息
* @return info entropy
*/
static Double calculateEntrophy(List<String> classes) {
Double info = 0.0;
try {
// 总的个数
int size = classes.size();

// map to store the count of each unique class
Map<String, Integer> counter = new HashMap<String, Integer>();

// iter all the class
for (String key : classes) {
if (counter.containsKey(key.trim()))
counter.put(key.trim(), counter.get(key.trim()) + 1);
else
// set 1
counter.put(key.trim(), 1);
}

// iter the map
for (Entry<String, Integer> entry : counter.entrySet()) {
Double ratio = Double.parseDouble(Integer.toString((entry
.getValue()))) / size;
info -= ratio * (Math.log(ratio) / Math.log(2));
}
} catch (Exception e) {
e.printStackTrace();
}
return info;
}

/**
*
* @param records
*            输入记录 example:{[我 n 1 0 0 0 0 0 YES],[是 n 0 0 0 0 0 0 NO]}
* @return
*/
static Map<Integer, Double> calculateIG(List<String[]> records,
Boolean isSingleFeature) {
Map<Integer, Double> index4select = new HashMap<Integer, Double>();
try {
// 1.计算总的info
List<String> labels = new ArrayList<String>();
int feature_size = 0;
for (String[] arr : records) {
String label = arr[arr.length - 1];
feature_size = arr.length - 1;

}

Map<Integer, List<Object>> features = PermutationTest.genPerLess(
feature_size, 3);

Double total = calculateEntrophy(labels);
System.out.print("label的熵信息：");
System.out.println(total);

// 2.计算每个feature的entrophy
// int i=0;
for (Entry<Integer, List<Object>> entry1 : features.entrySet()) {

Double info_i = 0.0;

Map<String, List<String>> featureMap = new HashMap<String, List<String>>();

// divide the records according to the feature
for (String[] arr : records) {

// get the feature
String feature = "";
if (entry1.getValue().size() > 1 && isSingleFeature)
continue;
for (Object obj : entry1.getValue()) {
if (obj instanceof Integer)
feature += arr[(Integer) obj];
}

// check whether if it's counted
if (featureMap.containsKey(feature)) {
List<String> featureList = featureMap.get(feature);
featureMap.put(feature, featureList);
} else {
List<String> featureList = new ArrayList<String>();
featureMap.put(feature, featureList);
}

}

// calculate entrophy of each value of the feature
for (Entry<String, List<String>> entry : featureMap.entrySet()) {

Double score = calculateEntrophy(entry.getValue());

info_i += (Double.parseDouble(Integer.toString(entry
.getValue().size())) / records.size()) * score;
}

System.out.print("feature " + entry1.getKey() + " ig:");
System.out.println(total - info_i);

// ig=f-total
index4select.put(entry1.getKey(), total - info_i);
}

// ///sort by the value
ArrayList<Integer> keys = new ArrayList<Integer>(
index4select.keySet());// 得到key集合
final Map<Integer, Double> scoreMap_temp = index4select;
Collections.sort(keys, new Comparator<Object>() {

public int compare(Object o1, Object o2) {

if (Double.parseDouble(scoreMap_temp.get(o1).toString()) < Double
.parseDouble(scoreMap_temp.get(o2).toString()))
return 1;

if (Double.parseDouble(scoreMap_temp.get(o1).toString()) == Double
.parseDouble(scoreMap_temp.get(o2).toString()))
return 0;

else
return -1;
}
});

int y = 0;

for (Integer key : keys) {
System.out.println(key + "" + features.get(key) + "= "
+ scoreMap_temp.get(key));
}
// //////////////////////

} catch (Exception e) {
e.printStackTrace();
}
return index4select;
}

/**
* 从文件读入输入，计算每个feature的ig，最後一列是手工標註的label
*
* @param file
*            存放手工标注语料的路径
*/
static void calculateIG(String file) {
try {
String line = null;
List<String[]> lists = new ArrayList<String[]>();
while ((line = br.readLine()) != null) {
if (line.trim().length() == 0)
continue;
}
System.out.print(calculateIG(lists,false));
} catch (Exception e) {
e.printStackTrace();
}
}

/**
* @param args
*/
public static void main(String[] args) {
calculateIG("/home/qibaoyuan/qibaoyuan/lexo/cv/all.txt");
}

}


#### Information Gain 计算 python实现

2012-06-10 16:06:38

#### 什么是信息增益(Information Gain)

2016-03-03 21:03:56

#### 信息增益，信息增益率，Gini

2014-03-22 12:11:35

#### 特征选择方法-信息增益

2015-10-27 16:01:08

#### 特征选择之信息增益法

2015-11-18 14:11:58

#### 机器学习中信息增益的计算方法

2018-03-14 14:43:32

#### 决策树之信息增益

2014-09-15 16:51:07

#### 数据挖掘笔记-特征选择-算法实现-1

2014-07-17 18:32:07

#### 关于信息增益和熵

2015-12-14 21:15:25

#### 信息增益（information gain）

2014-03-10 20:35:54