- sas个人学习笔记
1.允许中文变量名、启用压缩
SAS 默认不支持数据集中的中文列名,需要用 validvarname=any 开启。
SAS 数据集启用压缩后通常可以节省很多空间,但压缩默认是关闭的,需要用 compress=yes 自行开启!
/* Session options: compress newly created data sets and accept any
   characters (e.g. Chinese) in variable names. */
options compress=yes validvarname=any;
2.读取同一逻辑库下名称前缀相同的多个数据集
/* Stack every data set in DATAPATH whose name starts with TRANS_ALL_.
   INDSNAME= exposes the name of the data set that contributed the current row;
   STORE is the third underscore-delimited token of that name.
   PRODUCTNAME is rebuilt as a $32 variable (PNAME is renamed on output). */
data datapath.trans_all(drop=productname floor rename=(pname=productname));
    set datapath.trans_all_: indsname=src_table;
    format store $20. pname $32.;
    store = scan(src_table, 3, "_");
    /* KTRUNCATE is the multibyte-aware truncation function; here it copies
       productname over its full current length. */
    pname = ktruncate(productname, 1, length(productname));
run;
3.iml模块
SAS 的矩阵模块(IML)可以和 R、Python 的矩阵思想对照着一起学习。
参考的文章:https://wenku.baidu.com/view/25bd65896137ee06eef91804.html
文中对用法的讲解很详细。
/* Row-wise sample standard deviation, hand-written in PROC IML */
/*
proc iml;
use A; read all var _NUM_ into A; close A;
use B; read all var _NUM_ into B; close B;
use C; read all var _NUM_ into C; close C;
*/
/* Reads WORK.ANSWER into a matrix, computes the sample standard deviation of
   each row (divisor = number of columns - 1) and writes the column vector to
   WORK.RESULT. The column count is taken from the data instead of being
   hard-coded, so the code is correct for any number of columns. */
proc iml;
    reset deflib=work;
    use answer;
    read all into frame;
    close answer;

    nCols   = ncol(frame);                      /* values per row */
    rowMean = frame[,+] / nCols;                /* one mean per row */
    sqDev   = (frame - rowMean)##2;             /* squared deviations from the row mean */
    result  = sqrt(sqDev[,+] / (nCols - 1));    /* sample SD; needs at least 2 columns */

    create result from result;
    append from result;
    close result;
quit;
/* Estimate how much memory a matrix occupies. Snippet found online;
   doing the computation in IML is still quite memory-hungry. */
proc iml;
    /* Module: for each (Rows, Cols) pair, print the gigabytes of RAM needed
       at 8 bytes per element and whether that fits into 2 GB. */
    start HowManyGigaBytes(Rows, Cols);
        BytesPerGB = 2##30;                     /* 1024##3 bytes in a gigabyte */
        GB = 8 # Rows # Cols / BytesPerGB;
        Fit2GB = choose(GB <= 2, "Yes", " No");
        print Rows[F=COMMA9.] Cols[F=COMMA9.] GB[F=6.2] Fit2GB[L="Fits into 2GB"];
    finish;

    /* test cases, one per row:  rows  cols */
    sizes = { 250000 1000,
             6000000  350,
             2000000  350,
              628000  349,
              500000  350};
    run HowManyGigaBytes(sizes[,1], sizes[,2]);
/* Same size check, run in an IML session started with explicit SYMSIZE= and
   WORKSIZE= settings; SHOW SPACE reports the session's memory usage. */
proc iml symsize=40000000 worksize=40000000;
    show space;

    /* Module: print the gigabytes of RAM a Rows x Cols matrix needs at
       8 bytes per element and whether that fits into 2 GB. */
    start HowManyGigaBytes(Rows, Cols);
        BytesPerGB = 2##30;                     /* 1024##3 bytes in a gigabyte */
        GB = 8 # Rows # Cols / BytesPerGB;
        Fit2GB = choose(GB <= 2, "Yes", " No");
        print Rows[F=COMMA9.] Cols[F=COMMA9.] GB[F=6.2] Fit2GB[L="Fits into 2GB"];
    finish;

    /* single test case:  rows  cols */
    sizes = {5000000 676};
    run HowManyGigaBytes(sizes[,1], sizes[,2]);
4.分位数计算(proc univariate)
/* Compute the 10th, 20th, ..., 90th percentiles of variable AUS and store
   them in WORK.STATS.
   PCTLPRE= takes one prefix per VAR variable (not one per percentile); the
   output names are prefix + percentile point, so a single prefix "p" yields
   p10, p20, ..., p90. */
proc univariate data=aus noprint;
    var aus;
    output out=stats pctlpts=10 to 90 by 10 pctlpre=p;
run;
5.缺失值填充
这里就怀念fillna和na.replace了
/* Replace every missing numeric value in WORK.RESULT with 0 (in place).
   MISSING() is used instead of "= ." so that special missing values
   (.A-.Z and ._) are filled as well. */
data result;
    set result;
    /* The array is declared before the loop index exists, so it covers only
       the numeric variables that come from the input data set. */
    array nums[*] _numeric_;
    do _j = 1 to dim(nums);
        if missing(nums[_j]) then nums[_j] = 0;
    end;
    drop _j;
run;
6.抽样
/* Simple random sample of 5% of lanzhou.Inactive170830, written back over the
   same data set. The existing SELECTED column is renamed to SELECTED0 on input. */
proc surveyselect
        data=lanzhou.Inactive170830(rename=(Selected=Selected0))
        out=lanzhou.Inactive170830
        method=srs
        samprate=.05;       /* sampling rate */
    *strata CLUSTER;        /* stratification variable (statement currently disabled) */
run;
7.把大的数据集拆分----自己写的宏例子
/*
 * %split - cut a large data set into consecutive chunks of at most &k rows.
 *
 * Parameters:
 *   data=    input data set (plain name, without data set options)
 *   k=       maximum number of rows per chunk
 *   outlib=  libref that receives the chunks CUSTOMER1..CUSTOMERn
 *            (default TEST, the libref used by the original OUTPUT statement)
 *
 * Fixes over the first version:
 *   - the DATA statement named ttt.customer&i while OUTPUT wrote to
 *     test.customer&i, which does not compile; one libref is used now;
 *   - the chunk count is rounded up, so the final partial chunk is kept;
 *   - each chunk is read directly with FIRSTOBS=/OBS= instead of rewriting
 *     the remaining rows on every iteration, and WORK.TEST is not overwritten.
 */
%macro split(data=, k=, outlib=test);
    %local i n nobs first last;

    /* Take the row count from the data set header; works for empty inputs. */
    data _null_;
        if 0 then set &data. nobs=count;
        call symputx('nobs', count, 'L');
        stop;
    run;

    /* Number of chunks, rounded up so leftover rows form a last chunk. */
    %let n = %sysevalf(&nobs / &k, ceil);
    %put &nobs &n;

    %do i = 1 %to &n;
        %let first = %eval((&i - 1) * &k + 1);
        %let last  = %eval(&i * &k);
        data &outlib..customer&i;
            set &data.(firstobs=&first obs=&last);
        run;
    %end;
%mend split;
%split(data=test.totalforecast7,k=250000);
8.hash合并数据集----网上大神的例子
/* Look up bg.test_i for every row of bg.test using a hash object keyed on I.
   Rows of bg.test are always kept; when the key is not found, the variables
   coming from bg.test_i stay missing. */
data bg.test_hash;
    /* Never executed: only makes the compiler define the bg.test_i variables. */
    if 0 then set bg.test_i;
    if _n_ = 1 then do;
        declare hash lookup(dataset: "bg.test_i");
        lookup.definekey("i");
        lookup.definedata(all: "yes");
        lookup.definedone();
    end;
    /* Clear values carried over from the previous row before reading the next one. */
    call missing(of _all_);
    set bg.test;
    /* The key value is taken from the current value of I; the return code is discarded. */
    rc = lookup.find();
    drop rc;
run;
9.得到一年前的同一天
/* Shift purchase_date back by one year, keeping the same calendar day
   ("sameday" alignment). The increment was 2, which moved the date two
   years forward instead. */
purchase_date = intnx('year', purchase_date, -1, "sameday");
10.将年初至今(YTD)累计销售额转化为月度销售额
/* Convert cumulative (year-to-date) sales into per-period sales.
   The BY statement requires test1 to be ordered by name and yr. */
data test;
    set test1;
    by name yr;
    /* DIF(x) is x - LAG(x); it is called on every row so its queue stays in step. */
    sls_mon = dif(sales);
    /* The first row of each year has no predecessor: its YTD value is the period value. */
    if first.yr then sls_mon = sales;
run;
11.变量聚类
/* Variable clustering on out.path_cat_lan4.
   Step 1: list the variables of the data set. */
proc contents data=out.path_cat_lan4 out=cont noprint;
run;

/* Step 2: build a blank-separated list of name literals ('name'n) in &name1.
   CATS strips only leading/trailing blanks, so names that contain embedded
   blanks (possible under VALIDVARNAME=ANY) stay intact; the previous
   KCOMPRESS call removed every blank from the name.
   TYPE = 1 keeps numeric variables only, which is all PROC VARCLUS accepts. */
proc sql noprint;
    select cats("'", name, "'n")
        into :name1 separated by ' '
        from cont
        where name not in ('customer_id','diaoyanclassnew')
          and type = 1;
quit;
%put &name1.;

/* Step 3: cluster the selected variables. */
proc varclus data=out.path_cat_lan4(drop=customer_id) outstat=clus centroid short PLOTS(MAXPOINTS=400)
        outtree=clust_tree;
    var &name1.;
run;