############################################################
自定义分层抽样函数:
############################################################
stratified <- function(df, group, size, select = NULL,
replace = FALSE, bothSets = FALSE) {
if (is.null(select)) {
df <- df
} else {
if (is.null(names(select))) stop("'select' must be a named list")
if (!all(names(select) %in% names(df)))
stop("Please verify your 'select' argument")
temp <- sapply(names(select),
function(x) df[[x]] %in% select[[x]])
df <- df[rowSums(temp) == length(select), ]
}
df.interaction <- interaction(df[group], drop = TRUE)
df.table <- table(df.interaction)
df.split <- split(df, df.interaction)
if (length(size) > 1) {
if (length(size) != length(df.split))
stop("Number of groups is ", length(df.split),
" but number of sizes supplied is ", length(size))
if (is.null(names(size))) {
n <- setNames(size, names(df.split))
message(sQuote("size"), " vector entered as:\n\nsize = structure(c(",
paste(n, collapse = ", "), "),\n.Names = c(",
paste(shQuote(names(n)), collapse = ", "), ")) \n\n")
} else {
ifelse(all(names(size) %in% names(df.split)),
n <- size[names(df.split)],
stop("Named vector supplied with names ",
paste(names(size), collapse = ", "),
"\n but the names for the group levels are ",
paste(names(df.split), collapse = ", ")))
}
} else if (size < 1) {
n <- round(df.table * size, digits = 0)
} else if (size >= 1) {
if (all(df.table >= size) || isTRUE(replace)) {
n <- setNames(rep(size, length.out = length(df.split)),
names(df.split))
} else {
message(
"Some groups\n---",
paste(names(df.table[df.table < size]), collapse = ", "),
"---\ncontain fewer observations",
" than desired number of samples.\n",
"All observations have been returned from those groups.")
n <- c(sapply(df.table[df.table >= size], function(x) x = size),
df.table[df.table < size])
}
}
temp <- lapply(
names(df.split),
function(x) df.split[[x]][sample(df.table[x],
n[x], replace = replace), ])
set1 <- do.call("rbind", temp)
if (isTRUE(bothSets)) {
set2 <- df[!rownames(df) %in% rownames(set1), ]
list(SET1 = set1, SET2 = set2)
} else {
set1
}
}
================================================
参数说明:
#######################################################
The arguments to stratified
are:
df
: The inputdata.frame
group
: A character vector of the column or columns that make up the "strata".size
: The desired sample size.- If
size
is a value less than 1, a proportionate sample is taken from each stratum. - If
size
is a single integer of 1 or more, that number of samples is taken from each stratum. - If
size
is a vector of integers, the specified number of samples is taken for each stratum. It is recommended that you use a named vector. For example, if you have two strata, "A" and "B", and you wanted 5 samples from "A" and 10 from "B", you would entersize = c(A = 5, B = 10)
.
- If
select
: This allows you to subset the groups in the sampling process. This is alist
. For instance, if yourgroup
variable was "Group", and it contained three strata, "A", "B", and "C", but you only wanted to sample from "A" and "C", you can useselect = list(Group = c("A", "C"))
.replace
: For sampling with replacement.
########################################################
例子说明:
########################################################
set.seed(1)
##创建数据框 dat1 <- data.frame(ID = 1:100, A = sample(c("AA", "BB", "CC", "DD", "EE"), 100,replace = TRUE), B = rnorm(100), C = abs(round(rnorm(100), digits = 1)), D = sample(c("CA", "NY", "TX"), 100, replace = TRUE), E = sample(c("M","F"), 100, replace = TRUE))
##
library(devtools)
source_gist("https://gist.github.com/mrdwab/6424112")
## [1] "https://raw.github.com/gist/6424112"
## SHA-1 hash of file is 0006d8548785ec8a5651c3dd599648cc88d153a4
# Generate a couple of sample data.frames to play with
set.seed(1)
dat1 <- data.frame(ID = 1:100, A = sample(c("AA", "BB", "CC", "DD", "EE"), 100,
replace = TRUE), B = rnorm(100), C = abs(round(rnorm(100), digits = 1)),
D = sample(c("CA", "NY", "TX"), 100, replace = TRUE), E = sample(c("M",
"F"), 100, replace = TRUE))
# What do the data look like in general?
summary(dat1)
## ID A B C D E
## Min. : 1.0 AA:13 Min. :-1.9144 Min. :0.000 CA:23 F:54
## 1st Qu.: 25.8 BB:25 1st Qu.:-0.6141 1st Qu.:0.300 NY:42 M:46
## Median : 50.5 CC:19 Median :-0.1176 Median :0.650 TX:35
## Mean : 50.5 DD:26 Mean :-0.0176 Mean :0.825
## 3rd Qu.: 75.2 EE:17 3rd Qu.: 0.5382 3rd Qu.:1.200
## Max. :100.0 Max. : 2.4016 Max. :2.900
# Let's take a 10% sample from all -A- groups in dat1
stratified(dat1, "A", 0.1)
## ID A B C D E
## 92 92 AA 1.1766 1.0 CA F
## 86 86 BB -1.5364 0.1 NY F
## 74 74 BB -0.1796 0.3 CA F
## 60 60 CC 1.6822 0.5 TX F
## 58 58 CC 0.9102 0.5 CA F
## 9 9 DD 0.5697 1.4 TX F
## 42 42 DD 1.2079 0.4 TX M
## 46 46 DD 0.5585 1.0 CA M
## 99 99 EE -1.2863 0.2 NY F
## 72 72 EE 1.3430 0.0 NY F
# Let's take a 10% sample from only 'AA' and 'BB' groups from -A- in dat1
stratified(dat1, "A", 0.1, select = list(A = c("AA", "BB")))
## ID A B C D E
## 69 69 AA 0.4942 0.6 CA M
## 54 54 BB 0.1580 0.3 TX M
## 5 5 BB 1.4330 1.5 NY F
# Let's take 5 samples from all -D- groups in dat1, specified by column
# number
stratified(dat1, group = 5, size = 5)
## ID A B C D E
## 73 73 BB -0.2146 0.6 CA F
## 45 45 CC 1.5868 1.2 CA F
## 87 87 DD -0.3010 0.6 CA M
## 13 13 DD 0.6897 1.1 CA F
## 24 24 AA -0.9341 0.1 CA M
## 40 40 CC 0.2671 0.9 NY F
## 83 83 BB 0.5315 0.6 NY M
## 1 1 BB 0.3981 0.5 NY F
## 56 56 AA 1.7673 2.5 NY M
## 88 88 AA -0.5283 1.2 NY M
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 79 79 DD -0.6817 0.2 TX M
## 27 27 AA -0.4433 0.8 TX M
## 51 51 CC -0.6204 0.4 TX F
# Let's take a sample from all -A- groups in dat1, where we specify the
# number wanted from each group
stratified(dat1, "A", size = c(3, 5, 4, 5, 2))
## 'size' vector entered as:
##
## size = structure(c(3, 5, 4, 5, 2), .Names = c("AA", "BB", "CC", "DD",
## "EE"))
## ID A B C D E
## 38 38 AA -0.3042 0.8 NY M
## 10 10 AA -0.1351 1.9 NY M
## 56 56 AA 1.7673 2.5 NY M
## 62 62 BB -0.4616 0.4 CA F
## 74 74 BB -0.1796 0.3 CA F
## 2 2 BB -0.6120 0.0 TX F
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 40 40 CC 0.2671 0.9 NY F
## 67 67 CC -0.3200 0.3 CA F
## 16 16 CC 0.1888 2.2 CA M
## 3 3 CC 0.3411 0.3 NY M
## 39 39 DD 0.3700 0.4 CA F
## 49 49 DD -1.2246 0.4 NY F
## 50 50 DD -0.4734 0.4 TX F
## 23 23 DD 0.6107 0.5 NY F
## 17 17 DD -1.8050 0.3 NY M
## 35 35 EE 0.5939 0.5 CA M
## 41 41 EE -0.5425 0.2 NY F
# Use a two-column strata: -E- and -D- -E- varies more slowly, so it is
# better to put that first
stratified(dat1, c("E", "D"), size = 0.15)
## ID A B C D E
## 84 84 BB -1.518394 0.6 CA F
## 34 34 AA -1.523567 1.5 CA F
## 53 53 CC -0.910922 1.6 CA M
## 36 36 DD 0.332950 0.2 CA M
## 97 97 CC 2.087167 0.5 NY F
## 49 49 DD -1.224613 0.4 NY F
## 5 5 BB 1.433024 1.5 NY F
## 37 37 DD 1.063100 1.5 NY M
## 17 17 DD -1.804959 0.3 NY M
## 33 33 CC 1.178087 0.2 NY M
## 14 14 BB 0.028002 0.9 TX F
## 90 90 AA -0.056897 0.0 TX F
## 28 28 BB 0.001105 2.1 TX F
## 80 80 EE -0.324270 0.3 TX M
## 22 22 BB -0.709946 0.1 TX M
# Use a two-column strata (-E- and -D-) but only interested in cases where
# -E- == 'M'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M"))
## ID A B C D E
## 69 69 AA 0.49419 0.6 CA M
## 32 32 CC -0.13518 1.0 CA M
## 12 12 AA -0.03924 0.2 NY M
## 100 100 DD -1.64061 1.0 NY M
## 56 56 AA 1.76729 2.5 NY M
## 79 79 DD -0.68166 0.2 TX M
## 80 80 EE -0.32427 0.3 TX M
## As above, but where -E- == 'M' and -D- == 'CA' or 'TX'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M", D = c("CA", "TX")))
## ID A B C D E
## 53 53 CC -0.9109 1.6 CA M
## 36 36 DD 0.3330 0.2 CA M
## 80 80 EE -0.3243 0.3 TX M
## 8 8 DD -1.0441 0.6 TX M
# Use a three-column strata: -E-, -D-, and -A-
s.out <- stratified(dat1, c("E", "D", "A"), size = 2)
## Some groups ---M.CA.BB, M.CA.EE--- contain fewer observations than desired
## number of samples. All observations have been returned from those groups.
list(head(s.out), tail(s.out))
## [[1]]
## ID A B C D E
## 92 92 AA 1.1766 1.0 CA F
## 34 34 AA -1.5236 1.5 CA F
## 24 24 AA -0.9341 0.1 CA M
## 69 69 AA 0.4942 0.6 CA M
## 56 56 AA 1.7673 2.5 NY M
## 10 10 AA -0.1351 1.9 NY M
##
## [[2]]
## ID A B C D E
## 61 61 EE -0.63574 0.2 NY M
## 18 18 EE 1.46555 1.4 NY M
## 21 21 EE 0.47551 2.3 TX F
## 70 70 EE -0.17733 0.0 TX F
## 80 80 EE -0.32427 0.3 TX M
## 77 77 EE -0.07356 0.3 TX M
# How many samples were taken from each strata?
table(interaction(s.out[c("E", "D", "A")]))
##
## F.CA.AA M.CA.AA F.NY.AA M.NY.AA F.TX.AA M.TX.AA F.CA.BB M.CA.BB F.NY.BB
## 2 2 0 2 2 2 2 1 2
## M.NY.BB F.TX.BB M.TX.BB F.CA.CC M.CA.CC F.NY.CC M.NY.CC F.TX.CC M.TX.CC
## 2 2 2 2 2 2 2 2 0
## F.CA.DD M.CA.DD F.NY.DD M.NY.DD F.TX.DD M.TX.DD F.CA.EE M.CA.EE F.NY.EE
## 2 2 2 2 2 2 0 1 2
## M.NY.EE F.TX.EE M.TX.EE
## 2 2 2
# Can we verify the message about group sizes?
names(which(table(interaction(dat1[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"
names(which(table(interaction(s.out[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"