可以通过Proc Freq来获得名义变量的类别及每个类别的频数,或者使用SQL的 select distinct+group by方式。有时,我们需要确保样本中的类别数能够保持和总体的的类别数是一致的,因此需要比较样本的类别数和总体的类别数是否一致。
以下程序就用于实现比较两个数据集的两个名义变量的类别是否一致。
(1)获取样本(Sample)和总体(Base)数据集中的类别名称,并将类别数量保存到宏变量NB和NS中,并创建比较结果数据集
Proc SQL noprint;
create table CatB as select distinct &Var from &Base;
select count(*) into:NB from CatB;
create table CatS as select distinct &Var from &Sample;
select count(*) into:NS from CatS;
create table &V_Result (Category Char(32), ExistsInSample num, Comment char(80));
run;
quit;
(2)将总体数据集(Base)的类别名称保存到宏变量中,以用于比较。其中_n_为观测记录
data _Null_;
set CatB;
call symput('C_'||left(_n_),&Var);
run;
(3)在样本数据集中查找是否存在每个总体类别名称(类别名称保存在宏变量&C_i中),如果不存在,在V_Result表中插入类别名称及不存在信息,如存在,插入类别名和存在信息。
proc SQL ;
%do i=1 %to &NB;
select count(*) into: Nxfrom CatS where &Var = "%trim(&&C_&i)";
%if &Nx =0 %then %do;
Insert into &V_Result values("%trim(&&C_&i)",0,'Category does not exist in sample.');
%end;
%if &Nx>0 %then %do;
Insert into &V_Result values("%trim(&&C_&i)",1,'Category exists in sample.');
%end;
%end;
quit;
(4)全部程序如下:
/*** Data Preparation for Data Mining Using SAS
by Mamdouh Refaat
Morgan Kaufmann, 2006
****/
%macro CatCompare(Base, Sample, Var, V_Result, I_St);
/*
Comparing the categories of the variable Var in
the Base dataset to test if all its categories are
present in the Sample. The results of the categories
are stored in V_Result and the final status (1/0)
is stored in I_St.
*/
/* Step 1: obtain the categories, and count them,
for both Base and Sample for the variable Var */
Proc SQL noprint;
create table CatB as select distinct &Var from &Base;
select count(*) into:NB from CatB ;
create table CatS as select distinct &Var from &Sample;
select count(*) into:NS from CatS;
create table &V_Result (Category Char(32),
ExistsInSample num,
Comment char(80));
run;
quit;
/* Step 2: Convert all the categories of the Base
into macro variables to use them in the search later */
data _Null_;
set CatB;
call symput('C_'||left(_n_),&Var);
run;
/* Step 3: Loop over the Base categories and find whether
all of them can be found in the sample */
proc SQL ;
%do i=1 %to &NB;
select count(*) into: Nx
from CatS where &Var = "%trim(&&C_&i)";
%if &Nx =0 %then %do;
Insert into &V_Result
values("%trim(&&C_&i)" ,
0,
'Category does not exist in sample.');
%end;
%if &Nx>0 %then %do;
Insert into &V_Result
values("%trim(&&C_&i)" ,
1,
'Category exists in sample.');
%end;
%end;
select min(ExistsInSample) into: Status from &V_Result;
run;
quit;
%let &I_St = &Status;
/* clean workspace */
proc datasets nodetails library =work;
*delete CatB CatC;
run;
quit;
%mend;