Jianxin Wu在他的文章《Compact Representation for Image Classification: To Choose or to Compress?》中提出了一种用于特征筛选的方法,即互信息(Mutual Information)法。
其Publication主页 http://cs.nju.edu.cn/wujx/publication.htm 上给出了具体的Matlab实现代码和论文PDF。
博主由于最近需要利用MI方法来筛选特征,而Matlab版本的代码运行速度太慢,故特意编写了C++的mex代码,并给出了Matlab中对应位置的注释。经验证无错误。
Jianxin Wu编写的Matlab代码如下:
(本博客系原创,转载请注明出处:http://blog.csdn.net/xuexiyanjiusheng/article/details/46882447)
function [entX,entXY] = entropy(dinsts, labels, bins)
% Written by Jianxin Wu (Jianxin Wu, wujx2001@gmail.com)
% % Given the data in 'dinsts', where each row is one example
% and the 'labels' (integers 1, 2, ..., C, where C is the number of categories)
% One parameter 'bins' is needed:
% case '-2': quantize dinst into 2-bits, then compute MI
% case '-1': quantize dinst into 1-bit, then compute MI
% else: be a positive integer, uniform quantization
% Output:
% entX: the entropy of each feature dimension
% entXY: the mutual information between each dimension and the labels
% % You may want to change the thresholds for bins = -1 (1-bit) or -2 (2-bits)
% e.g., if your data is non-negative, current threshold values in X will not work
% they are designed for FV / VLAD features (which are [-1 +1])
% % Even if your data are [-1 +1], you may want to change the threshold +/- 0.0125 to
% a value that fits your dataset, e.g., 0.1- and 0.9-quantile分位数
[~, dim] = size(dinsts);%number of columns
nr_class = max(labels); % we assume labels are 1, 2, 3, ...
if bins == -2 % this is the case for quantized 2-bit version
X = [-1 -0.0125 0 0.0125 1];
elseif bins == -1 % 1-bit version
X = [-1 0 1];
else
vmin = min(min(dinsts));
vmax = max(max(dinsts));
X = vmin:((vmax-vmin)/bins):vmax+1e-6;
end
bins = length(X)-1;
probX = zeros(bins,dim);
probXY = zeros(bins*nr_class,dim);
for i = 1:dim
if mod(i,1000)==0
fprintf('.');
end
if mod(i,100000)==0
fprintf('\n');
end
temp = dinsts(:,i);%i-th column
frequency = histc(temp, X);
frequency = frequency / sum(frequency);
probX(:,i) = frequency(1:end-1);
this = zeros(1,nr_class*bins);
for j = 1:nr_class
frequency = histc(temp(labels==j), X);
this((j-1)*bins+1:j*bins) = frequency(1:end-1);
end
this = this / sum(this);
probXY(:,i) = this;
end
%fprintf('\n');
entX = -sum(probX.*log2(probX+1.0e-6));
entXY = -sum(probXY.*log2(probXY+1.0e-6));
博主编写的对应C++代码如下:
//written by Panfeng Li(Panfeng Li,panfengli@hust.edu.en)
//This code is just used for study
#include <math.h>
#include <string.h>
#include "mex.h"
void histc_c(double *input, double *edges, int len_input, int bins, double *outfreq, int *count)//just deal with one dimension
{
int i, j;
for (i = 0; i<len_input; i++)//i-th input
{
for (j = 0; j< bins; j++)
{
if (input[i] >= edges[j] && input[i]<edges[j + 1])
{
outfreq[j]++;
(*count)++;
}
}
if (input[i] == edges[bins])
{
outfreq[bins]++;
(*count)++;
}
}
}
static void ent(double *dinsts, double *labels, double *X, int M, int N, int bins, int nr_class, double *entX, double *entXY)
{
//dinsts([M N])
//probX = zeros(bins,dim);
//probXY = zeros(bins*nr_class,dim);
double *temp, *temp_;
double *frequency;
double *this_;
int i, j, k;
int count, count_;
double tem, tem_;
double *probX, *probXY;
temp = new double[M];//memory one column of dinsts
temp_ = new double[M];
frequency = new double[bins + 1];//bins = length(X)-1
this_ = new double[nr_class*bins];//nr_class = max(labels)
//this = zeros(1,nr_class*bins);
probX = new double[bins*N];
probXY = new double[bins*nr_class * N];
memset(probX, 0, (bins*N)*sizeof(double));
memset(probXY, 0, (bins*nr_class * N)*sizeof(double));
//calculate probX,probXY
for (i = 0; i < N; i++)//i-th column of dinsts
{
//calculate probX
memset(frequency, 0, (bins + 1)*sizeof(double));//each column gets its frequency
for (j = 0; j < M; j++)//j-th row of dinsts
{
temp[j] = dinsts[j + i * M];//temp = dinsts(:,i)
}
count = 0;
histc_c(temp, X, M, bins, frequency, &count);//frequency = histc(temp, X);
for (j = 0; j < bins + 1; j++)
{
//frequency[j] = frequency[j] / (count + 1.0e-6);//frequency = frequency / sum(frequency);
frequency[j] = frequency[j] / (count );
}
//probX = zeros(bins,dim);
for (j = 0; j < bins; j++)
{
probX[j + i * bins] = frequency[j];//probX(:,i) = frequency(1:end-1);
}
//calculate probXY
memset(this_, 0, (nr_class*bins)*sizeof(double));//this = zeros(1,nr_class*bins);
tem = 0;
tem_ = 0;
for (j = 1; j <= nr_class; j++)//for j = 1:nr_class
{
count_ = 0;//for temp_
memset(frequency, 0, (bins + 1)*sizeof(double));
for (k = 0; k < M; k++)
{
if (labels[k] == j)//labels = 1 or 2
{
temp_[count_] = temp[k];//temp(labels==j)
count_++;
}
}
count = 0;//for frequency
histc_c(temp_, X, count_, bins, frequency, &count);//frequency = histc(temp(labels==j), X);
for (k = (j - 1)*bins; k < j*bins; k++)
{
this_[k] = frequency[k - (j - 1)*bins];//this((j-1)*bins+1:j*bins) = frequency(1:end-1);
//!!!note that 2+1:4 = 3:4;
tem += this_[k];
tem_++;
}
}
for (j = 0; j < tem_; j++)
{
//this_[j] = this_[j] / (tem + 1.0e-6);//this = this / sum(this);
this_[j] = this_[j] / (tem);
}
//probXY = zeros(bins*nr_class,dim)
for (j = 0; j < bins*nr_class; j++)
{
probXY[j + i * (bins*nr_class)] = this_[j];//probXY(:,i) = this;
}
}
//calculate entX,entXY
for (i = 0; i < N; i++)
{
entX[i] = - probX[i * 2] * (log(probX[i * 2] + 1.0e-6) / log(2.0)) - probX[i * 2 + 1] * (log(probX[i * 2 + 1] + 1.0e-6) / log(2.0));//entX = -sum(probX.*log2(probX + 1.0e-6))
entXY[i] = - probXY[i * 4] * (log(probXY[i * 4] + 1.0e-6) / log(2.0)) - probXY[i * 4 + 1] * (log(probXY[i * 4 + 1] + 1.0e-6) / log(2.0)) - probXY[i * 4 + 2] * (log(probXY[i * 4 + 2] + 1.0e-6) / log(2.0)) - probXY[i * 4 + 3] * (log(probXY[i * 4 + 3] + 1.0e-6) / log(2.0));//entXY = -sum(probXY.*log2(probXY + 1.0e-6))
}
delete[] temp;
delete[] temp_;
delete[] frequency;
delete[] this_;
delete[] probX;
delete[] probXY;
}
//dinsts, labels, X, row, dim, bins, nr_class
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray*prhs[])
{
double *dinsts;
double *labels;
double *X;
int M, N, bins, nr_class;
double *entX, *entXY;
dinsts = mxGetPr(prhs[0]);
labels = mxGetPr(prhs[1]);
X = mxGetPr(prhs[2]);
M = (int)mxGetScalar(prhs[3]);
N = (int)mxGetScalar(prhs[4]);
bins = (int)mxGetScalar(prhs[5]);
nr_class = (int)mxGetScalar(prhs[6]);
//entX = [1,dim];
//entXY = [1,dim];
plhs[0] = mxCreateDoubleMatrix(1,(mwSize)N, mxREAL);
plhs[1] = mxCreateDoubleMatrix(1,(mwSize)N, mxREAL);
entX = mxGetPr(plhs[0]);
entXY = mxGetPr(plhs[1]);
ent(dinsts, labels, X, M, N, bins, nr_class, entX, entXY);
}
直接mex ent.cpp就可以使用了。只需要注意输进来的参数要设置好。