%macro bagging(data = , y = , numx = , catx = , ntrees = 10);
***********************************************************;
* THIS SAS MACRO IS AN ATTEMPT TO IMPLEMENT BAGGING       *;
* PROPOSED BY LEO BREIMAN (1996)                          *;
* ======================================================= *;
* PARAMETERS:                                             *;
*  DATA   : INPUT SAS DATA TABLE                          *;
*  Y      : RESPONSE VARIABLE WITH 0/1 VALUE              *;
*  NUMX   : A LIST OF NUMERIC ATTRIBUTES                  *;
*  CATX   : A LIST OF CATEGORICAL ATTRIBUTES              *;
*  NTREES : # OF TREES TO DO THE BAGGING                  *;
* ======================================================= *;
* NOTES:                                                  *;
*  ROWS WHOSE Y IS NOT 0 OR 1 ARE EXCLUDED FROM SAMPLING  *;
*  AND FROM SCORING                                       *;
*  EITHER NUMX OR CATX MAY BE LEFT EMPTY BUT NOT BOTH     *;
* ======================================================= *;
* OUTPUTS:                                                *;
*  1. A SAS CATALOG FILE NAMED "TREEFILES" IN THE WORKING *;
*     DIRECTORY CONTAINING ALL SCORING FILES IN BAGGING   *;
*  2. A LST FILE SHOWING ks STATISTICS OF THE BAGGING     *;
*     CLASSIFIER AND EACH TREE CLASSIFIER                 *;
* ======================================================= *;
* CONTACT:                                                *;
*  WENSUI.LIU@53.COM, LOSS FORECASTING & RISK MODELING    *;
***********************************************************;
/* keep the working macro variables out of the caller symbol tables */
%local i seed nobs max_ks min_ks bag_ks;

/* validate the parameters before any step is generated */
%if %length(&data) = 0 or %length(&y) = 0 %then %do;
  %put ERROR: BAGGING requires DATA= and Y= to be specified.;
  %return;
%end;
%if %length(&numx) = 0 and %length(&catx) = 0 %then %do;
  %put ERROR: BAGGING requires at least one attribute in NUMX= or CATX=.;
  %return;
%end;
%if %length(&ntrees) = 0 %then %let ntrees = 10;
%if %sysevalf(&ntrees < 1) %then %do;
  %put ERROR: BAGGING requires NTREES= to be at least 1.;
  %return;
%end;

options mprint mlogic nocenter nodate nonumber;
*** a random seed value subject to change ***;
%let seed = 20110613;
*** assign a library to the working folder ***;
libname _path '';
*** generate a series of random seeds, one per tree ***;
data _null_;
  do i = 1 to &ntrees;
    /* rounded to an integer as the 8. format did before, and floored at 1 */
    /* because a seed of 0 would make the sampling step use a clock seed   */
    random = max(1, round(ranuni(&seed) * (10 ** 8)));
    /* symputx strips blanks and the L flag keeps the seeds local */
    call symputx(cats('random', i), random, 'L');
  end;
run;
*** clean up catalog files in the library ***;
proc datasets library = _path nolist;
  delete TreeFiles tmp / memtype = catalog;
run;
quit;
*** keep only rows with a valid 0 or 1 response and tag each row with an id ***;
data _tmp1 (keep = &y &numx &catx _id_);
  set &data;
  if &y in (1, 0);
  _id_ + 1;
run;
*** the bootstrap sample size equals the number of usable rows ***;
%let nobs = 0;
proc sql noprint;
  select count(*) into :nobs from _tmp1;
quit;
%if &nobs = 0 %then %do;
  %put ERROR: BAGGING found no rows in &data with &y equal to 0 or 1.;
  %return;
%end;
%do i = 1 %to &ntrees;
  %put &&random&i;
  *** generate bootstrap samples for bagging ***;
  proc surveyselect data = _tmp1 method = urs n = &nobs seed = &&random&i
    out = sample&i(rename = (NumberHits = _hits)) noprint;
  run;
  *** generate data mining datasets for sas e-miner ***;
  proc dmdb data = sample&i out = db_sample&i dmdbcat = cl_sample&i;
    class &y &catx;
    %if %length(&numx) > 0 %then %do;
    var &numx;
    %end;
    target &y;
    freq _hits;
  run;
  *** create a sas temporary catalog to contain sas output ***;
  filename out_tree catalog "_path.tmp.out_tree.source";
  *** create decision tree mimicking CART ***;
  proc split data = db_sample&i dmdbcat = cl_sample&i
    criterion = gini
    assess = impurity
    maxbranch = 2
    splitsize = 100
    subtree = assessment
    exhaustive = 0
    nsurrs = 0;
    code file = out_tree;
    /* an empty attribute list must not emit an INPUT statement */
    %if %length(&numx) > 0 %then %do;
    input &numx / level = interval;
    %end;
    %if %length(&catx) > 0 %then %do;
    input &catx / level = nominal;
    %end;
    target &y / level = binary;
    freq _hits;
  run;
  *** create a permanent sas catalog to contain all tree outputs ***;
  filename in_tree catalog "_path.TreeFiles.tree&i..source";
  *** copy the generated scoring code, dropping its first three lines ***;
  data _null_;
    infile out_tree;
    input;
    file in_tree;
    if _n_ > 3 then put _infile_;
  run;
  *** score the original data by each tree output file ***;
  data _score&i (keep = p_&y.1 p_&y.0 &y _id_);
    set _tmp1;
    %include in_tree;
  run;
  *** calculate KS stat ***;
  proc printto new print = lst_out;
  run;
  ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
  proc npar1way wilcoxon edf data = _score&i;
    class &y.;
    var p_&y.1;
  run;
  proc printto;
  run;
  %if &i = 1 %then %do;
    *** the first tree starts the stacked scores and the KS table ***;
    data _tmp2;
      set _score&i;
    run;
    data _ks;
      set _kstmp (keep = nvalue2);
      tree_id = &i;
      seed = &&random&i;
      ks = round(nvalue2 * 100, 0.0001);
    run;
  %end;
  %else %do;
    /* append instead of re-reading every earlier score on each pass */
    proc append base = _tmp2 data = _score&i;
    run;
    data _ks;
      set _ks _kstmp(in = a keep = nvalue2);
      if a then do;
        tree_id = &i;
        seed = &&random&i;
        ks = round(nvalue2 * 100, 0.0001);
      end;
    run;
  %end;
%end;
*** aggregate predictions from all trees in the bag ***;
proc summary data = _tmp2 nway;
  class _id_;
  output out = _tmp3(drop = _type_ rename = (_freq_ = freq))
    mean(p_&y.1) = mean(p_&y.0) = mean(&y) = ;
run;
*** calculate bagging KS stat ***;
proc printto new print = lst_out;
run;
ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
proc npar1way wilcoxon edf data = _tmp3;
  class &y;
  var p_&y.1;
run;
proc printto;
run;
*** the bagged classifier is stored with tree id 0 and the master seed ***;
data _ks;
  set _ks _kstmp (in = a keep = nvalue2);
  if a then do;
    tree_id = 0;
    seed = &seed;
    ks = round(nvalue2 * 100, 0.0001);
  end;
run;
proc sort data = _ks;
  by tree_id;
run;
proc sql noprint;
  select max(ks) into :max_ks from _ks where tree_id > 0;
  select min(ks) into :min_ks from _ks where tree_id > 0;
  select ks into :bag_ks from _ks where tree_id = 0;
quit;
*** summarize the performance of bagging classifier and each tree in the bag ***;
title "MAX KS = &max_ks, MIN KS = &min_ks, BAGGING KS = &bag_ks";
proc print data = _ks noobs;
  var tree_id seed ks;
run;
title;
*** remove the temporary catalog and keep TreeFiles ***;
proc datasets library = _path nolist;
  delete tmp / memtype = catalog;
run;
quit;
%mend bagging;
*** numeric attributes passed to the bagging macro ***;
%let x1 = tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt
tot_rev_line rev_util bureau_score ltv tot_income;
*** categorical attributes passed to the bagging macro ***;
%let x2 = purpose;
*** library holding the input table, with the path separators restored ***;
*** NOTE(review): confirm this folder matches the local installation ***;
libname data 'D:\SAS_CODE\bagging';
*** run bagging with ten trees on the accepts table ***;
%bagging(data = data.accepts, y = bad, numx = &x1, catx = &x2, ntrees = 10);