最大信息熵增益_决策树:信息熵和信息增益(一)

* The classic "Play Tennis" training sample (14 observations) used to
  illustrate entropy and information-gain calculations.  All five
  variables are read as character values.;
data PlayTennis;

input Outlook $ Temperature $ Humidity $ Windy $ Play $;

* NOTE: no comments may appear between CARDS; and the terminating
  semicolon -- they would be read as data lines.;
cards;

sunny hot high false No

sunny hot high true No

overcast hot high false Yes

rain mild high false Yes

rain cool normal false Yes

rain cool normal true No

overcast cool normal true Yes

sunny mild high false No

sunny cool normal false Yes

rain mild normal false Yes

sunny mild normal true Yes

overcast mild high true Yes

overcast hot normal false Yes

rain mild high true No

;

run;

/* Prior (unconditional) entropy of &target.
   data=   : input dataset
   target= : categorical variable whose entropy is computed
   out=    : output dataset, one row holding variable ENTROPY
   Formula: H = -sum over levels of p * log2(p), with p taken from the
   PERCENT column of PROC FREQ (zero-count levels are never emitted, so
   log2(0) cannot occur). */
%macro calEntropy(data=, target=, out=);

  /* Level frequencies and percents of the target variable */
  proc freq data=&data;
    tables &target / out=freqs;
  run;

  /* Accumulate -p*log2(p) across the level rows; emit the total once */
  data &out;
    set freqs end=_lastrow;
    retain entropy 0;
    entropy + -1 * (percent / 100) * log2(percent / 100);
    if _lastrow then output;
    keep &target entropy;
  run;

%mend calEntropy;

/* Conditional (posterior) entropy H(&target | var) for every variable in
   the space-separated list &splitVar.
   data=     : input dataset
   target=   : categorical target variable
   splitVar= : candidate split variables (character)
   out=      : output dataset, one row per split variable
               (varname, postEntropy)
   Fix: the BY statement of the first PROC SORT had been mangled by the
   blog engine (the macro reference was rendered as an HTML entity); it
   is restored to "by &target;". */
%macro calPosteriorEntropy(data=, target=, splitVar=, out=);

  proc sort data=&data;
    by &target;
  run;

  /* One var*target cross tabulation per split variable; ODS stacks all
     CrossTabFreqs tables into a single dataset. */
  ods output crosstabfreqs=crosstabfreqs;
  proc freq data=&data;
    tables (&splitVar) * &target / missing;
  run;

  /* Cell rows (target non-missing): determine which split variable each
     row belongs to and capture its level value. */
  data freqs;
    length varname $ 32;
    length level $ 100;
    set crosstabfreqs(keep=&target &splitVar frequency rowpercent);
    where &target ne '';
    array varlist{*} $100 &splitVar;
    do i = 1 to dim(varlist);
      if varlist(i) ne '' then do;
        varname = scan("&splitVar", i);
        level = strip(varlist(i));
        /* RETURN ends this iteration; with no explicit OUTPUT in the
           step it triggers the implicit output of the row. */
        return;
      end;
    end;
    keep &target varname level frequency rowpercent;
  run;

  proc sort data=freqs;
    by varname level;
    where varname ne '';
  run;

  /* Entropy within each (variable, level) group from the row percents:
     h = -sum p*log2(p); zero cells are skipped to avoid log2(0). */
  data entropy;
    set freqs;
    by varname level;
    retain h 0;
    if NOT MISSING(rowpercent);
    if rowpercent ne 0 then
      h + -1 * (rowpercent / 100) * log2(rowpercent / 100);
    if last.level then do;
      output;
      h = 0;
    end;
    keep varname level h;
  run;

  /* Marginal percent of each level: the row-total rows of CrossTabFreqs,
     identified by a missing target value (MISSING option above). */
  data innergrppercent;
    length varname $ 32;
    length level $ 100;
    set crosstabfreqs(keep=&target &splitVar frequency percent);
    where &target eq '';
    array varlist{*} $100 &splitVar;
    do i = 1 to dim(varlist);
      if varlist(i) ne '' then do;
        varname = scan("&splitVar", i);
        level = strip(varlist(i));
        return;
      end;
    end;
    keep varname level frequency percent;
  run;

  proc sort data=innergrppercent;
    by varname level;
    where varname ne '';
  run;

  /* H(target|var) = sum over levels of P(level) * H(target|level) */
  data &out;
    merge entropy innergrppercent;
    by varname level;
    retain postEntropy 0;
    postEntropy + (percent / 100) * h;
    if last.varname then do;
      output;
      postEntropy = 0;
    end;
    keep varname postEntropy;
  run;

%mend calPosteriorEntropy;

/* Information gain IG(var) = H(target) - H(target|var) for every variable
   in &splitVar, written to &out sorted best-split-first.
   data=     : input dataset
   target=   : categorical target variable
   splitVar= : candidate split variables
   out=      : result dataset (varname, entropy, postEntropy, inforgains)
   Fix: removed the space between % and the macro names (a workaround for
   the blog platform that made the calls invalid SAS). */
%macro calInformationGains(data=, target=, splitVar=, out=);

  /* Prior entropy H(target) */
  %calEntropy(data=&data, target=&target, out=priorEntropy);

  /* Conditional entropy H(target|var) per candidate variable */
  %calPosteriorEntropy(data=&data, target=&target, splitVar=&splitVar,
                       out=posteriorEntropy);

  data &out;
    /* Load the single prior-entropy row once and carry it across rows */
    if _n_ = 1 then set priorEntropy;
    retain entropy;
    set posteriorEntropy;
    inforgains = entropy - postEntropy;
    keep varname entropy postEntropy inforgains;
  run;

  /* Highest information gain (best split variable) first */
  proc sort data=&out;
    by descending inforgains;
  run;

%mend calInformationGains;

* Rank the four attributes of PlayTennis by information gain for
  predicting Play.  (The space between % and the macro name -- a blog
  workaround -- has been removed so the call is valid SAS.);
%calInformationGains(data=PlayTennis, target=play,
                     splitVar=Outlook Temperature Humidity Windy,
                     out=informationGains);

计算结果:

    entropy   varname       postEntropy   inforgains
    0.94029   Outlook       0.69354       0.24675
    0.94029   Humidity      0.78845       0.15184
    0.94029   Windy         0.89216       0.04813
    0.94029   Temperature   0.91106       0.02922

一旦亲自动手编码实现计算部分,就会发现有些资料里的小问题。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值