最大信息熵增益_决策树：信息熵和信息增益（一）

最新推荐文章于 2022-03-19 15:49:01 发布

gzq3

最新推荐文章于 2022-03-19 15:49:01 发布

阅读量147

点赞数

文章标签：最大信息熵增益

本文链接：https://blog.csdn.net/weixin_33642448/article/details/111958427

版权

/* Build the 14-row PlayTennis sample: four character attributes
   (Outlook, Temperature, Humidity, Windy) plus the class label Play.
   List input with "$" reads each field as a blank-delimited character value. */
data PlayTennis;
    input Outlook $ Temperature $ Humidity $ Windy $ Play $;
    datalines;
sunny hot high false No
sunny hot high true No
overcast hot high false Yes
rain mild high false Yes
rain cool normal false Yes
rain cool normal true No
overcast cool normal true Yes
sunny mild high false No
sunny cool normal false Yes
rain mild normal false Yes
sunny mild normal true Yes
overcast mild high true Yes
overcast hot normal false Yes
rain mild high true No
;
run;

%macro calEntropy(data=, target=, out=);
    /*
     * Entropy (base 2) of the class variable.
     *
     *   data=    input data set
     *   target=  class variable whose distribution is measured
     *   out=     output data set; a single row holding ENTROPY (the target
     *            variable of the last frequency row is kept alongside it)
     *
     * Overwrites the work data set FREQS.
     */

    /* One row per target level; PERCENT is that level's share in percent. */
    proc freq data=&data;
        tables &target / out=freqs;
    run;
    quit;

    /* Accumulate -p*log2(p) over all levels and emit only the final total. */
    data &out;
        set freqs end=eof;
        retain entropy 0;
        /* sum() skips a missing term, exactly like the sum statement would. */
        entropy = sum(entropy, -1 * (percent / 100) * log2(percent / 100));
        if eof then output;
        keep &target entropy;
    run;
%mend calEntropy;

%macro calPosteriorEntropy(data=, target=, splitVar=, out=);
    /*
     * Conditional entropy of the target given each candidate split variable:
     *     postEntropy(X) = sum over levels v of X of  P(X = v) * H(target | X = v)
     *
     *   data=      input data set (read only; it is not sorted or modified)
     *   target=    class variable; it is compared with a blank string below,
     *              so it must be a character variable
     *   splitVar=  blank-separated list of candidate split variables; they are
     *              placed in one character array below, so all must be character
     *   out=       output data set, one row per split variable:
     *              VARNAME, POSTENTROPY
     *
     * Overwrites the work data sets CROSSTABFREQS, FREQS, ENTROPY and
     * INNERGRPPERCENT.
     * NOTE(review): total rows are recognised by a blank target value, which
     * assumes the target itself has no missing values - confirm for new data.
     */

    /* One split-variable-by-target table per candidate; ODS stacks all of them
       into CROSSTABFREQS. PROC FREQ does not require sorted input, so the
       former in-place PROC SORT of the input data set has been dropped. */
    ods output crosstabfreqs=crosstabfreqs;
    proc freq data=&data;
        tables (&splitVar)*&target / missing;
    run;
    quit;

    /* Cell rows (target not blank): tag each row with the split variable it
       belongs to and with that variable's level. */
    data freqs;
        length varname $ 32;
        length level $ 100;
        set crosstabfreqs(keep=&target &splitVar frequency rowpercent);
        where &target ne '';
        array varlist{*} $100 &splitVar;
        do i = 1 to dim(varlist);
            /* In the stacked table only the owning split variable is filled in. */
            if varlist(i) ne '' then do;
                varname = scan("&splitVar", i);
                level = strip(varlist(i));
                leave;
            end;
        end; /* closes the DO i loop; this END was missing in the original */
        keep &target varname level frequency rowpercent;
    run;

    /* Rows that matched no split variable keep a blank VARNAME and are dropped. */
    proc sort data=freqs;
        by varname level;
        where varname ne '';
    run;

    /* H(target | X = level): entropy of the row percentages within each level. */
    data entropy;
        set freqs;
        by varname level;
        retain h 0;
        if not missing(rowpercent);
        /* A zero share contributes nothing; skipping it avoids log2(0). */
        if rowpercent ne 0 then
            h + -1 * (rowpercent / 100) * log2(rowpercent / 100);
        if last.level then do;
            output;
            h = 0;
        end;
        keep varname level h;
    run;

    /* Total rows (target blank): PERCENT is the share of observations that
       fall in each level of the split variable, i.e. the weight P(X = level). */
    data innergrppercent;
        length varname $ 32;
        length level $ 100;
        set crosstabfreqs(keep=&target &splitVar frequency percent);
        where &target eq '';
        array varlist{*} $100 &splitVar;
        do i = 1 to dim(varlist);
            if varlist(i) ne '' then do;
                varname = scan("&splitVar", i);
                level = strip(varlist(i));
                leave;
            end;
        end; /* closes the DO i loop; this END was missing in the original */
        keep varname level frequency percent;
    run;

    proc sort data=innergrppercent;
        by varname level;
        where varname ne '';
    run;

    /* Weight each level's entropy by its share and total per split variable. */
    data &out;
        merge entropy innergrppercent;
        by varname level;
        retain postEntropy 0;
        postEntropy + (percent / 100) * h;
        if last.varname then do;
            output;
            postEntropy = 0;
        end;
        keep varname postEntropy;
    run;
%mend calPosteriorEntropy;

%macro calInformationGains(data=, target=, splitVar=, out=);
    /*
     * Information gain of every candidate split variable:
     *     inforgains = entropy(target) - postEntropy(split variable)
     *
     *   data=      input data set
     *   target=    class variable
     *   splitVar=  blank-separated list of candidate split variables
     *   out=       output data set with VARNAME, ENTROPY, POSTENTROPY and
     *              INFORGAINS, sorted by descending INFORGAINS
     *
     * Writes the work data sets PRITERIORENTROPY and POSTERIORENTROPY.
     *
     * The published copy of this code had a blank between the percent sign and
     * each macro name (a workaround for a blog-platform bug), which stops the
     * calls from being recognised as macro invocations. The blanks are removed
     * here so the macro runs as written.
     */
    %calEntropy(data=&data, target=&target, out=priteriorEntropy);

    %calPosteriorEntropy(data=&data, target=&target, splitVar=&splitVar,
                         out=posteriorEntropy);

    data &out;
        /* Read the single prior-entropy row once; its value is then carried
           onto every split-variable row read below. */
        if _n_ = 1 then set priteriorEntropy;
        retain entropy;
        set posteriorEntropy;
        inforgains = entropy - postEntropy;
        keep varname entropy postEntropy inforgains;
    run;

    /* Best split first. */
    proc sort data=&out;
        by descending inforgains;
    run;
%mend calInformationGains;

/* Rank the four weather attributes by information gain with respect to Play.
   The published copy had a blank between the percent sign and the macro name
   (a workaround for a blog-platform bug); it is removed here so the call is
   recognised as a macro invocation. */
%calInformationGains(data=PlayTennis, target=play,
                     splitVar=Outlook Temperature Humidity Windy,
                     out=informationGains);

计算结果：

| entropy | varname | postEntropy | inforgains |
| --- | --- | --- | --- |
| 0.94029 | Outlook | 0.69354 | 0.24675 |
| 0.94029 | Humidity | 0.78845 | 0.15184 |
| 0.94029 | Windy | 0.89216 | 0.04813 |
| 0.94029 | Temperature | 0.91106 | 0.02922 |

一旦亲自动手编码实现计算部分，就会发现有些资料里的小问题。

gzq3

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
最大信息熵增益_决策树：信息熵和信息增益（一）

data PlayTennis;input Outlook $ Temperature $ Humidity $ Windy $ Play $;cards;sunny hot high false Nosunny hot high true Noovercast hot high false Yesrain mild high false Yesrain cool normal false Yes...
复制链接

扫一扫