/*
 * PlayTennis: the 14-row weather training set used by the macros below.
 * All five columns are read as character values with list input
 * (blank-delimited); Play is the class label, the other four are the
 * candidate split attributes.
 */
data PlayTennis;
    input Outlook     $
          Temperature $
          Humidity    $
          Windy       $
          Play        $;
    datalines;
sunny hot high false No
sunny hot high true No
overcast hot high false Yes
rain mild high false Yes
rain cool normal false Yes
rain cool normal true No
overcast cool normal true Yes
sunny mild high false No
sunny cool normal false Yes
rain mild normal false Yes
sunny mild normal true Yes
overcast mild high true Yes
overcast hot normal false Yes
rain mild high true No
;
run;
/*
 * calEntropy: entropy (in bits, base-2 log) of the distribution of one
 * variable.
 *
 *   data=   input data set
 *   target= variable whose level distribution is measured
 *   out=    output data set; receives a single observation holding the
 *           accumulated value in ENTROPY (plus the target value of the
 *           last frequency row, which is kept but carries no meaning)
 *
 * Side effects: PROC FREQ prints its one-way table (no NOPRINT is given)
 * and a data set named FREQS is created/overwritten.
 */
%macro calEntropy(data=, target=, out=);
    /* One row per level of the target; PERCENT is on a 0-100 scale. */
    proc freq data=&data;
        tables &target / out=freqs;
    run;

    data &out(keep=&target entropy);
        set freqs end=isLastLevel;
        /* Sum statement: ENTROPY starts at 0 and is retained across rows,
           accumulating -p*log2(p) with p = percent/100. */
        entropy + (-1 * (percent / 100) * log2(percent / 100));
        /* Only the final row, carrying the full sum, is written out. */
        if isLastLevel;
    run;
%mend calEntropy;
/*
 * calPosteriorEntropy: for each split variable, the conditional entropy of
 * the target given that variable, i.e. the sum over the variable's levels
 * of P(level) * H(target | level), in bits.
 *
 *   data=     input data set (it is sorted in place by the target)
 *   target=   class variable; compared against '' below, so it must be
 *             character
 *   splitVar= blank-separated list of candidate split variables; they are
 *             placed in a character array, so they must be character too
 *   out=      output data set with one row per split variable:
 *             VARNAME and POSTENTROPY
 *
 * Side effects: creates/overwrites the data sets CROSSTABFREQS, FREQS,
 * ENTROPY and INNERGRPPERCENT, and PROC FREQ prints its tables.
 */
%macro calPosteriorEntropy(data=, target=, splitVar=, out=);
    /* The BY statement here had been corrupted into "by" followed by a
       single position-indicator glyph: the text "&target;" was decoded as
       an HTML entity, which removed both the variable and the semicolon.
       PROC FREQ does not require sorted input; the sort is kept so the
       input data set ends up ordered exactly as the original intended. */
    proc sort data=&data;
        by &target;
    run;

    /* Capture every (split variable x target) crosstab as one stacked
       data set. */
    ods output crosstabfreqs=crosstabfreqs;
    proc freq data=&data;
        tables (&splitVar)*&target / missing;
    run; quit;

    /* Rows with a non-blank target: individual cells, carrying ROWPERCENT
       (the share of the target value within one level of the split
       variable). */
    data freqs;
        length varname $ 32;
        length level $ 100;
        set crosstabfreqs(keep=&target &splitVar frequency rowpercent);
        where &target ne '';
        array varlist{*} $100 &splitVar;
        /* In the stacked table only the split variable of the current
           crosstab is populated; find it and record its name and level. */
        do i=1 to dim(varlist);
            if varlist(i) ne '' then do;
                varname = scan("&splitVar", i);
                level=strip(varlist(i));
                /* RETURN writes the observation and moves to the next row. */
                return;
            end;
        end;
        /* Falling out of the loop leaves VARNAME blank; such rows are
           removed by the WHERE in the sort that follows. */
        keep &target varname level frequency rowpercent;
    run;

    proc sort data=freqs;
        by varname level;
        where varname ne '';
    run;

    /* H(target | varname = level): one output row per level. */
    data entropy;
        set freqs;
        by varname level;
        retain h 0;
        if NOT MISSING(rowpercent);
        /* Skip zero cells: 0 * log2(0) is taken as 0 and log2(0) is
           undefined. */
        if rowpercent ne 0 then
            h + -1 * (rowpercent / 100) * log2(rowpercent / 100);
        if last.level then do;
            output;
            h = 0;
        end;
        keep varname level h;
    run;

    /* Rows with a blank target: treated here as the per-level totals, whose
       PERCENT gives the weight of each level within its split variable. */
    data innergrppercent;
        length varname $ 32;
        length level $ 100;
        set crosstabfreqs(keep=&target &splitVar frequency percent);
        where &target eq '';
        array varlist{*} $100 &splitVar;
        do i=1 to dim(varlist);
            if varlist(i) ne '' then do;
                varname = scan("&splitVar", i);
                level=strip(varlist(i));
                return;
            end;
        end;
        keep varname level frequency percent;
    run;

    proc sort data=innergrppercent;
        by varname level;
        where varname ne '';
    run;

    /* Weight each level's entropy by the level's share and total the
       products per split variable. */
    data &out;
        merge entropy innergrppercent;
        by varname level;
        retain postEntropy 0;
        postEntropy + (percent / 100) * h;
        if last.varname then do;
            output;
            postEntropy = 0;
        end;
        keep varname postEntropy;
    run;
%mend calPosteriorEntropy;
/*
 * calInformationGains: information gain of each candidate split variable
 * with respect to the target.
 *
 *   data=     input data set
 *   target=   class variable
 *   splitVar= blank-separated list of candidate split variables
 *   out=      output data set with VARNAME, ENTROPY, POSTENTROPY and
 *             INFORGAINS (= ENTROPY - POSTENTROPY), sorted by descending
 *             INFORGAINS
 *
 * Side effects: creates/overwrites the data sets PRITERIORENTROPY and
 * POSTERIORENTROPY, in addition to whatever the two called macros create.
 */
%macro calInformationGains(data=, target=, splitVar=, out=);
    /* The published listing had a blank between the percent sign and each
       macro name (a workaround for the blog platform), which stops SAS from
       recognising the calls. The blank is removed here so they execute. */
    %calEntropy(data=&data, target=&target, out=priteriorEntropy);
    %calPosteriorEntropy(data=&data, target=&target, splitVar=&splitVar,
                         out=posteriorEntropy);

    data &out;
        /* Read the single entropy row once; its value then stays in the
           program data vector for every posterior-entropy row. */
        if _n_=1 then set priteriorEntropy;
        retain entropy;
        set posteriorEntropy;
        inforgains = entropy-postEntropy;
        keep varname entropy postEntropy inforgains;
    run;

    /* Best split first. */
    proc sort data=&out;
        by descending inforgains;
    run;
%mend calInformationGains;
/* Rank the four weather attributes by information gain for predicting Play.
   The published listing had a blank between the percent sign and the macro
   name (a workaround for the blog platform); it is removed here so the call
   is recognised as a macro invocation. */
%calInformationGains(data=PlayTennis, target=play,
                     splitVar=Outlook Temperature Humidity Windy,
                     out=informationGains);
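计算结果：

| entropy | varname     | postEntropy | inforgains |
|---------|-------------|-------------|------------|
| 0.94029 | Outlook     | 0.69354     | 0.24675    |
| 0.94029 | Humidity    | 0.78845     | 0.15184    |
| 0.94029 | Windy       | 0.89216     | 0.04813    |
| 0.94029 | Temperature | 0.91106     | 0.02922    |
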
一旦亲自动手编码实现计算部分,就会发现有些资料里的小问题。