Data analysis and graphics with R - SECOND EDITION
ROBERT I. KABACOFF
ISBN: 9781617291388
Chapter 2. Creating a dataset
2.1 Table2.1
## R contains a wide variety of "structures" for holding data,
including "scalars", "vectors", "arrays", "data frames", and "lists".
## The "data types" or "modes" that R can handle include "numeric","character","logical (TRUE/FALSE)","complex (imaginary numbers)", and "raw (bytes)".
## R refers to "case identifiers" as "rownames"
and "categorical variables" (nominal, ordinal) as "factors".
## "Factors"="nominal" or "ordinal" variables. They’re stored and treated specially in R
2.2.1 Vector
## "Vectors" are one-dimensional arrays that can hold "numeric data", "character data", or "logical data".
> a <- c(1, 2, 5, 3, 6, -2, 4)
> b <- c("one", "two", "three")
> c <- c(TRUE, TRUE, FALSE, FALSE)
## Note that the data in a vector must be "only one type or mode" (numeric,
character, or logical).
You can’t mix modes in the same vector.
## 取值
> b[c(1, 3)]
[1] "one"   "three"
2.2.0 Scalars
## "Scalars" are "one-element vectors".
> h <- 3
> h2 <- TRUE
2.2.2 Matrix
## 举例01
> y <- matrix(1:20, nrow = 5, ncol = 4)
> y
     [,1] [,2] [,3] [,4]
[1,]    1    6   11   16
[2,]    2    7   12   17
[3,]    3    8   13   18
[4,]    4    9   14   19
[5,]    5   10   15   20
## 举例02
> cells <- c(1, 26, 24, 68)
> rnames <- c("R1", "R2")
> cnames <- c("C1", "C2")
> mymatrix <- matrix(cells,
+                    nrow = 2, ncol = 2, byrow = TRUE,
+                    dimnames = list(rnames, cnames))
> mymatrix
   C1 C2
R1  1 26
R2 24 68
## "Matrices" are "two-dimensional" and, like vectors, can contain "only one"
data type.
When there are "more than two dimensions", you use "arrays" (section 2.2.3).
## 引用matrix
> x <- matrix(1:10, nrow = 2)
> x
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    3    5    7    9
[2,]    2    4    6    8   10
> x[2, ]
[1]  2  4  6  8 10
> x[, 2]
[1] 3 4
> x[1, 4]
[1] 7
> x[1, c(4, 5)]
[1] 7 9
2.2.3 Array
> dimx <- c("A1", "A2")
> dimy <- c("B1", "B2", "B3")
> dimz <- c("C1", "C2", "C3", "C4")
> A <- array(1:24, c(2, 3, 4),
+            dimnames = list(dimx, dimy, dimz))
> A
, , C1

   B1 B2 B3
A1  1  3  5
A2  2  4  6

, , C2

   B1 B2 B3
A1  7  9 11
A2  8 10 12

, , C3

   B1 B2 B3
A1 13 15 17
A2 14 16 18

, , C4

   B1 B2 B3
A1 19 21 23
A2 20 22 24

> A[1, 1, 3]
[1] 13
## Like matrices, they must be a single mode.
2.2.4 Data Frame
> patientID <- c(1, 2, 3, 4)
> age <- c(25, 34, 28, 52)
> diabetes <- c("Type1", "Type2", "Type1", "Type1")
> status <- c("Poor", "Improved", "Excellent", "Poor")
> patientdata <- data.frame(patientID, age, diabetes, status)
> patientdata
  patientID age diabetes    status
1         1  25    Type1      Poor
2         2  34    Type2  Improved
3         3  28    Type1 Excellent
4         4  52    Type1      Poor
## Specifying elements of a data frame
> patientdata[1:2]
  patientID age
1         1  25
2         2  34
3         3  28
4         4  52
> patientdata[c("diabetes", "status")]
  diabetes    status
1    Type1      Poor
2    Type2  Improved
3    Type1 Excellent
4    Type1      Poor
> patientdata$age
[1] 25 34 28 52
> table(patientdata$diabetes, patientdata$status)

        Excellent Improved Poor
  Type1         1        0    2
  Type2         0        1    0
## 方法01
> summary(mtcars$mpg)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  10.40   15.43   19.20   20.09   22.80   33.90
> plot(mtcars$mpg, mtcars$disp)
> plot(mtcars$mpg, mtcars$wt)
2.2.4.2 方法02:The “attach()” and “detach()” functions are best used when you’re analyzing “a single data frame” and you’re unlikely to have multiple objects with the same name.
> attach(mtcars)
> summary(mpg)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  10.40   15.43   19.20   20.09   22.80   33.90
> plot(mpg, disp)
> plot(mpg, wt)
> detach(mtcars) ## The detach() statement is optional but is good programming practice and should be included routinely.
## 方法02的缺点:
> mpg <- c(25, 36, 47)
> attach(mtcars)
The following object is masked _by_ .GlobalEnv:

    mpg

> mpg
[1] 25 36 47
## 错误分析:mpg与mtcars中的重复
2.2.4.3 with()
## 方法03
> with(mtcars, {
+   print(summary(mpg))
+   plot(mpg, disp)
+   plot(mpg, wt)
+ })
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  10.40   15.43   19.20   20.09   22.80   33.90
## 方法03缺点
> with(mtcars, {
+   st <- summary(mpg)
+   plot(mpg, disp)
+   plot(mpg, wt)
+ })
> st
Error: object 'st' not found
## 方法03,避免缺点的方法
> with(mtcars, {
+   st <- summary(mpg)   ## 注意符号的使用
+   stt <<- summary(mpg) ## 注意符号的使用
+   plot(mpg, disp)
+   plot(mpg, wt)
+ })
> stt
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  10.40   15.43   19.20   20.09   22.80   33.90
2.2.4.4 CASE IDENTIFIERS
## specifies "patientID" as the variable to use in labeling cases on various
printouts and graphs produced by R
> patientdata <- data.frame(patientID, age, diabetes, status,
row.names = patientID )
2.2.5 Factors
## 分类变量
> diabetes <- c("Type1", "Type2", "Type1", "Type1")
> class(diabetes) ## 注意区别01
[1] "character"
> diabetes <- factor(diabetes)
> diabetes
[1] Type1 Type2 Type1 Type1
Levels: Type1 Type2
> class(diabetes) ## 注意区别02
[1] "factor"
> diabeteses <- c("Type0")
## 有序分类变量
> status <- c("Poor", "Improved", "Excellent", "Poor")
> class(status)
[1] "character"
> status <- factor(status, ordered = TRUE)
> class(status)
[1] "ordered" "factor"
> status
[1] Poor      Improved  Excellent Poor
Levels: Excellent < Improved < Poor ## R语言默认按照字母排序,但不符合统计要求
## 按照统计要求,定义顺序
> status <- factor(status, ordered = TRUE,
+                  levels = c("Poor", "Improved", "Excellent"))
> status
[1] Poor      Improved  Excellent Poor
Levels: Poor < Improved < Excellent ## 根据要求定义顺序
## Assigns the levels as 1 = Poor, 2 = Improved, 3 = Excellent.
Be sure the specified levels match your actual data values.
Any data values not in the list will be set to "missing" (NA).
## label
> sex <- c("1", "2", "3")
> sex <- factor(sex, levels = c(1, 2), labels = c("Male", "Female"))
> sex
[1] Male   Female <NA>
Levels: Male Female
> patientID <- c(1, 2, 3, 4)
> age <- c(25, 34, 28, 52)
> diabetes <- c("Type1", "Type2", "Type1", "Type1")
> status <- c("Poor", "Improved", "Excellent", "Poor")
> patientdata <- data.frame(patientID, age, diabetes, status)
> str(patientdata) ## 对比区别01
'data.frame':   4 obs. of  4 variables:
 $ patientID: num  1 2 3 4
 $ age      : num  25 34 28 52
 $ diabetes : Factor w/ 2 levels "Type1","Type2": 1 2 1 1
 $ status   : Factor w/ 3 levels "Excellent","Improved",..: 3 2 1 3
## Note: the Factor columns above reflect R < 4.0, where data.frame() defaulted to
## stringsAsFactors = TRUE; in R >= 4.0 diabetes and status are stored as chr
## unless they are converted with factor() first.
> diabetes.fac <- factor(diabetes)
> status.ord <- factor(status, ordered = TRUE)
> patientdata.fac <- data.frame(patientID, age, diabetes.fac, status.ord)
> str(patientdata.fac) ## 对比区别02
'data.frame':   4 obs. of  4 variables:
 $ patientID   : num  1 2 3 4
 $ age         : num  25 34 28 52
 $ diabetes.fac: Factor w/ 2 levels "Type1","Type2": 1 2 1 1
 $ status.ord  : Ord.factor w/ 3 levels "Excellent"<"Improved"<..: 3 2 1 3
> summary(patientdata.fac)
   patientID         age        diabetes.fac     status.ord
 Min.   :1.00   Min.   :25.00   Type1:3      Excellent:1
 1st Qu.:1.75   1st Qu.:27.25   Type2:1      Improved :1
 Median :2.50   Median :31.00                Poor     :2
 Mean   :2.50   Mean   :34.75
 3rd Qu.:3.25   3rd Qu.:38.50
 Max.   :4.00   Max.   :52.00
2.2.6 Lists
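## Lists are "the most complex" of the R data types.
> g <- "My"
> h <- c(25, 26, 18, 39)
> j <- matrix(1:10, nrow = 5)
> k <- c("one", "two")
> mylist <- list(title <- g, ages = h, j, k)
> mylist
[[1]]
[1] "My"

$ages
[1] 25 26 18 39

[[3]]
     [,1] [,2]
[1,]    1    6
[2,]    2    7
[3,]    3    8
[4,]    4    9
[5,]    5   10

[[4]]
[1] "one" "two"

> mylist[[2]]
[1] 25 26 18 39
> mylist[["ages"]]
[1] 25 26 18 39
> mylist$ages
[1] 25 26 18 39
## Note: the first element prints as [[1]] rather than $title because `title <- g`
## is an assignment, not a named argument; write `title = g` to name the element.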
## The "best" way to read an Excel file is to export it to "a comma-delimited"
file from Excel and import it into R using the method described earlier.
2.6 Summary
One of the most challenging tasks in data analysis is data preparation. We’ve made a good start in this chapter by outlining the various structures that R provides for holding data and the many methods available for importing data from both keyboard and external sources.