sas中,很多情况下 data setp能够搞定的,proc sql也可以搞定。论坛里面经常碰到一题多解,例如汇总、重复数据、数据合并等问题。
很多网友也曾作过一些汇总,且巧浏览网页时发现NESUG 2011的一篇文章总结的相当齐全,特此摘录文中例子。感兴趣的网友可以自己google,原文题目《PROC SQL for DATA Step Die-hards》,作者:Christianna S. Williams。
EXAMPLE 1: SUBSETTING VARIABLES (COLUMNS)
*DATA step code;
DATA selvar1 ;
SET ex.admissions (KEEP = pt_id admdate disdate);
RUN;
*SQL code;
PROC SQL;
CREATE TABLE selvar2 AS
SELECT pt_id, admdate, disdate
FROM ex.admissions ;
QUIT;
EXAMPLE 2: SELECTING OBSERVATIONS (ROWS)
*DATA step code;
DATA vahosp1 ;
SET ex.admissions (WHERE = (hosp EQ 3));
RUN;
*SQL code;
PROC SQL FEEDBACK;
CREATE TABLE vahosp2 AS
SELECT *
FROM ex.admissions
WHERE hosp EQ 3;
QUIT;
EXAMPLE 3: CREATING A NEW VARIABLE
*DATA step code;
DATA grouping ;
SET ex.admissions ;
LENGTH dxgrp $5 ;
IF primdx EQ: '410' THEN dxgrp = 'MI' ;
ELSE IF primdx EQ: '428' THEN dxgrp = 'CHF';
ELSE dxgrp = 'other' ;
RUN;
*SQL code;
PROC SQL;
CREATE TABLE grouping2 AS
SELECT *,
CASE
WHEN primdx LIKE '410%' THEN 'MI'
WHEN primdx LIKE '428%' THEN 'CHF'
ELSE 'other'
END AS dxgrp
FROM ex.admissions;
QUIT;
EXAMPLE 4: SELECTING ROWS BASED ON A CREATED VARIABLE
*DATA Step code;
DATA twowks1 ;
SET ex.admissions (KEEP = pt_id hosp admdate disdate) ;
ATTRIB los LENGTH=4 LABEL='Length of Stay';
los = (disdate - admdate) + 1;
IF los GE 14 ;
RUN;
*SQL code;
PROC SQL;
CREATE TABLE twowks2 AS
SELECT pt_id, hosp, admdate, disdate,
(disdate-admdate) + 1 AS los LENGTH=4 LABEL=’Length of Stay’
FROM ex.admissions
WHERE CALCULATED los GE 14;
QUIT;
EXAMPLE 5: SELECTING ROWS IN ONE TABLE BASED ON INFORMATION FROM ANOTHER TABLE
*DATA Step Code;
PROC SORT DATA = ex.admissions OUT=admits;
BY hosp ;
RUN;
DATA vahosp1d (DROP = hospname) ;
MERGE admits (IN=adm)
ex.hospitals (IN=va KEEP = hosp_id hospname
RENAME = (hosp_id=hosp)
WHERE = (hospname EQ: 'Veteran'));
BY hosp ;
IF adm AND va;
RUN;
PROC SORT;
BY pt_id admdate;
RUN;
*PROC SQL Code;
PROC SQL ;
CREATE TABLE vahosp2d AS
SELECT *
FROM ex.admissions
WHERE hosp IN
(SELECT hosp_id
FROM ex.hospitals
WHERE hospname LIKE "Veteran%")
ORDER BY pt_id, admdate ;
QUIT;
EXAMPLE 6: USING SUMMARY FUNCTIONS
*DATA Step Code;
DATA admsum1 ;
SET ex.admissions ;
BY pt_id;
** (1) Initialization;
IF FIRST.pt_id THEN DO;
nstays = 0;
minlos = .;
maxlos = .;
END;
** (2) Accumulation;
nstays = nstays + 1;
los = (disdate - admdate) + 1;
minlos = MIN(OF minlos los) ;
maxlos = MAX(OF maxlos los) ;
** (3) Output;
IF LAST.pt_id THEN OUTPUT ;
RETAIN nstays minlos maxlos ;
KEEP pt_id nstays minlos maxlos ;
RUN;
PROC SQL code:
PROC SQL;
CREATE TABLE admsum2 AS
SELECT pt_id, COUNT(*) AS nstays,
MIN(disdate - admdate + 1) AS minlos,
MAX(disdate - admdate + 1) AS maxlos
FROM ex.admissions
GROUP BY pt_id ;
QUIT;
EXAMPLE 7: SELECTION BASED ON SUMMARY FUNCTIONS
*DATA Step Code;
PROC SUMMARY DATA= ex.admissions ;
VAR bp_sys ;
OUTPUT OUT=bpstats MEAN(bp_sys)= mean_sys STD(bp_sys) = sd_sys ;
RUN;
DATA hi_sys1 ;
SET bpstats (keep=mean_sys sd_sys)
ex.admissions ;
IF _N_ EQ 1 THEN DO;
high = mean_sys + 2*(sd_sys) ;
low = mean_sys - 2*(sd_sys) ;
DELETE;
END;
RETAIN high low;
IF (bp_sys GE high) OR (bp_sys LE low) ;
DROP mean_sys sd_sys high low ;
RUN;
*PROC SQL Code;
PROC SQL ;
CREATE TABLE hi_sys2 AS
SELECT * FROM ex.admissions
WHERE (bp_sys GE
(SELECT MEAN(bp_sys)+ 2*STD(bp_sys)
FROM ex.admissions))
OR (bp_sys LE
(SELECT MEAN(bp_sys) - 2*STD(bp_sys)
FROM ex.admissions));
QUIT;
EXAMPLE 8: SELECTION BASED ON SUMMARY FUNCTION WITH “RE-MERGE”
*DATA Step Code;
PROC SUMMARY DATA= ex.admissions NWAY;
CLASS dest ;
VAR bp_sys ;
OUTPUT OUT=bpstats2 MEAN(bp_sys)=mean_sys STD(bp_sys)=sd_sys ;
RUN;
PROC SORT DATA = ex.admissions OUT=admissions;
BY dest ;
RUN;
DATA hi_sys3 ;
MERGE admissions (KEEP = pt_id bp_sys bp_dia dest)
bpstats2 (KEEP = dest mean_sys sd_sys);
BY dest ;
IF bp_sys GE mean_sys + 2*(sd_sys) OR
bp_sys LE mean_sys - 2*(sd_sys) ;
FORMAT mean_sys sd_sys 6.2;
RUN;
*PROC SQL Code;
PROC SQL;
CREATE TABLE hi_sys4 AS
SELECT pt_id, bp_sys, bp_dia, dest,
MEAN(bp_sys) AS mean_sys FORMAT=6.2,
STD(bp_sys) AS sd_sys FORMAT=6.2
FROM ex.admissions
GROUP BY dest
HAVING bp_sys GE (mean_sys + 2*sd_sys)
OR bp_sys LE (mean_sys – 2*sd_sys) ;
QUIT;
EXAMPLE 9: IDENTIFYING DUPLICATES
*DATA Step Code;
DATA selmd1 ;
SET ex.doctors (KEEP = md_id lastname hospadm
RENAME = (hospadm=hospital));
BY md_id ;
IF NOT (FIRST.md_id AND LAST.md_id) ;
RUN;
PROC SORT DATA=selmd1;
BY lastname hospital ;
RUN;
*PROC SQL Code;
PROC SQL ;
CREATE TABLE selmd2 AS
SELECT md_id, lastname, hospadm AS hospital
FROM ex.doctors
GROUP BY md_id
HAVING COUNT(*) GE 2
ORDER BY lastname, hospital ;
QUIT;
EXAMPLE 10: CREATION OF TWO DATA SETS FROM ONE
*DATA Step Code;
DATA admit06 admit07 ;
SET ex.admissions ;
IF YEAR(admdate) = 2006 THEN OUTPUT admit06;
ELSE IF YEAR(admdate) = 2007 THEN OUTPUT admit07;
RUN;
*PROC SQL Code;
PROC SQL ;
CREATE TABLE admit06 AS
SELECT *
FROM ex.admissions
WHERE YEAR(admdate) = 2006;
CREATE TABLE admit07 AS
SELECT * FROM ex.admissions
WHERE YEAR(admdate) = 2007;
QUIT;
EXAMPLE 11: CONCATENATION
*DATA Step Code;
DATA alladm1 ;
SET admit06 admit07 ;
BY pt_id ;
RUN;
*PROC SQL Code;
PROC SQL ;
CREATE TABLE alladm2 AS
SELECT *
FROM admit06
UNION ALL CORRESPONDING
SELECT *
FROM admit07
ORDER BY pt_id;
QUIT;
EXAMPLE 12: SELECTING RECORDS UNIQUE TO ONE TABLE
*DATA Step Code;
DATA only2006 ;
MERGE admit06 (IN=in06 keep = pt_id)
admit07 (IN=in07 keep = pt_id);
BY pt_id ;
IF in06 AND NOT in07 ;
IF FIRST.pt_id ;
RUN;
*PROC SQL Code;
PROC SQL ;
CREATE TABLE only2006 AS
SELECT pt_id
FROM admit06
EXCEPT
SELECT pt_id
FROM admit07;
QUIT;
*ALTERNATIVE PROC SQL Code;
PROC SQL ;
CREATE TABLE only2006_A AS
SELECT *
FROM admit06
WHERE pt_id NOT IN
(SELECT pt_ID FROM admit07) ;
QUIT;
EXAMPLE 13: INNER JOIN OF TWO TABLES
*DATA Step Code;
DATA admits1 ;
MERGE ex.admissions (IN=adm KEEP = pt_id admdate disdate hosp md)
ex.patients (IN=pts KEEP = id lastname sex primmd
RENAME = (id=pt_id));
BY pt_id ;
IF adm AND pts;
RUN;
*PROC SQL code;
PROC SQL ;
CREATE TABLE admits2 AS
SELECT pt_id, admdate, disdate, hosp, md, lastname, sex, primmd
FROM ex.admissions AS a,
ex.patients AS b
WHERE a.pt_id = b.id
ORDER BY a.pt_id, admdate ;
QUIT;
*Alternative PROC SQL code;
PROC SQL ;
CREATE TABLE admits2 AS
SELECT pt_id, admdate, disdate, hosp, md, lastname, sex, primmd
FROM ex.admissions INNER JOIN
ex.patients
ON pt_id = id
ORDER BY pt_id, admdate ;
QUIT;
EXAMPLE 14: JOIN OF THREE TABLES WITH ROW SELECTION
*DATA Step Code;
DATA died1 (RENAME = (disdate=dthdate)) ;
MERGE ex.admissions (IN=dth KEEP = pt_id disdate hosp dest
WHERE = (dest=9))
ex.patients (IN=pts KEEP = id birthdate RENAME = (id=pt_id));
BY pt_id ;
IF dth AND pts ;
agedth = FLOOR((disdate-birthdate)/365.25) ;
DROP dest birthdate ;
RUN;
PROC SORT DATA=died1;
BY hosp;
RUN;
DATA died1b ;
MERGE died1 (IN=dth RENAME=(hosp=hosp_id))
ex.hospitals (IN=hsp KEEP=hosp_id nbeds);
BY hosp_id ;
IF dth AND hsp ;
DROP hosp_id;
RUN;
PROC SORT;
BY pt_id ;
RUN;
*PROC SQL code;
PROC SQL ;
CREATE TABLE died2 AS
SELECT pt_id, nbeds, disdate AS dthdate,
INT((disdate-birthdate)/365.25) AS agedth
FROM ex.admissions,
ex.hospitals,
ex.patients
WHERE (pt_id = id) AND (hosp = hosp_id) AND dest EQ 9
ORDER BY pt_id ;
QUIT;
EXAMPLE 15: LEFT OUTER JOIN
*DATA Step Code;
PROC SORT DATA = ex.admissions (KEEP = hosp)
OUT=admits (RENAME=(hosp=hosp_id)) NODUPKEY;
BY hosp ;
RUN;
DATA hosps1 ;
MERGE ex.hospitals (IN=hosp)
admits (IN=adm);
BY hosp_id ;
IF hosp ;
hasadmit = adm ;
RUN;
*PROC SQL code;
PROC SQL ;
CREATE TABLE hosps2 AS
SELECT DISTINCT a.*, hosp IS NOT NULL AS hasadmit
FROM ex.hospitals a
LEFT JOIN
ex.admissions b
ON a.hosp_id = b.hosp ;
QUIT;
EXAMPLE 16: INNER JOIN WITH A SUBQUERY
*DATA Step Code;
DATA primdoc (DROP = primmd);
MERGE ex.admissions (IN=adm KEEP = pt_id admdate disdate hosp md)
ex.patients (IN=pts KEEP = id lastname primmd RENAME=(id=pt_id));
BY pt_id ;
IF adm AND pts AND (md EQ primmd) ;
RUN;
PROC SORT DATA=primdoc; BY md; RUN;
DATA doctors ;
SET ex.doctors (KEEP = md_id lastname);
BY md_id ;
IF FIRST.md_id ;
RUN;
DATA primdoc1a ;
MERGE primdoc (IN=p RENAME=(lastname=ptname md=md_id))
doctors (RENAME = (lastname=mdname));
BY md_id ;
IF p ;
RUN;
PROC SORT DATA=primdoc1a ;
BY pt_id admdate;
RUN;
*PROC SQL Code;
PROC SQL ;
CREATE TABLE primdoc2 AS
SELECT pt_id, admdate, disdate, hosp, md_id,
b.lastname AS ptname,
c.lastname AS mdname
FROM ex.admissions a,
ex.patients b,
(SELECT DISTINCT md_id, lastname
FROM ex.doctors) c
WHERE (a.pt_id EQ b.id) AND
(a.md EQ b.primmd) AND
(a.md EQ c.md_id)
ORDER BY a.pt_id, admdate ;
QUIT;
EXAMPLE 17: A CORRELATED SUBQUERY
*DATA Step Code;
PROC SORT DATA = ex.admissions (KEEP=md hosp) OUT = admits;
BY md;
RUN;
PROC SORT DATA = ex.doctors OUT=doctors NODUPKEY ;
BY md_id ;
RUN;
DATA vadocs1 (DROP = hosp);
MERGE doctors (IN=docs KEEP=md_id lastname)
admits (IN=adm WHERE=(hosp = 3) RENAME = (md=md_id)) ;
BY md_id;
IF docs AND adm AND FIRST.md_id ;
RUN;
PROC SORT;
BY lastname;
RUN;
*PROC SQL Code;
PROC SQL;
CREATE TABLE vadocs2 AS
SELECT DISTINCT md_id, lastname
FROM ex.doctors AS d
WHERE 3 IN (SELECT hosp
FROM ex.admissions AS a
WHERE d.md_id = a.md)
ORDER BY lastname;
QUIT;