>DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3),
+ C = rep(1:2, 6), key = "A,B")
> DT
A B C
1: 1 1 1
2: 1 1 2
3: 1 1 1
4: 1 2 2
5: 2 2 1
6: 2 2 2
7: 2 3 1
8: 2 3 2
9: 3 3 1
10: 3 4 2
11: 3 4 1
12: 3 4 2
> duplicated(DT) #默认fromLast=FALSE:重复的行中首次出现的记为FALSE,之后再出现的记为TRUE;fromLast=TRUE时则保留最后一次出现的
[1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
> unique(DT) #把重复的两行去掉,即上面的TRUE对应的行
A B C
1: 1 1 1
2: 1 1 2
3: 1 2 2
4: 2 2 1
5: 2 2 2
6: 2 3 1
7: 2 3 2
8: 3 3 1
9: 3 4 2
10: 3 4 1
> duplicated(DT, by="B") #只看第“B”列
[1] FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
> unique(DT, by="B")#按照B列的结果返回数据,包含三列的数据
A B C
1: 1 1 1
2: 1 2 2
3: 2 3 1
4: 3 4 2
> duplicated(DT, by=c("A", "C"))#按照A\C两列的数据返回结果,原理同上
[1] FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
> unique(DT, by=c("A", "C"))
A B C
1: 1 1 1
2: 1 1 2
3: 2 2 1
4: 2 2 2
5: 3 3 1
6: 3 4 2
#data.table中duplicated与unique的用法
> DT = data.table(a=c(2L,1L,2L), b=c(1L,2L,1L)) # no key
> DT
a b
1: 2 1
2: 1 2
3: 2 1
> unique(DT) # rows 1 and 2 (row 3 is a duplicate of row 1)
a b
1: 2 1
2: 1 2
> DT = data.table(a=c(3.142, 4.2, 4.2, 3.142, 1.223, 1.223), b=rep(1,6))
> DT
a b
1: 3.142 1
2: 4.200 1
3: 4.200 1
4: 3.142 1
5: 1.223 1
6: 1.223 1
> unique(DT) # rows 1,2 and 5
a b
1: 3.142 1
2: 4.200 1
3: 1.223 1
> DT = data.table(a=tan(pi*(1/4 + 1:10)), b=rep(1,10)) # example from ?all.equal
> DT
a b
1: 1 1
2: 1 1
3: 1 1
4: 1 1
5: 1 1
6: 1 1
7: 1 1
8: 1 1
9: 1 1
10: 1 1
> length(unique(DT$a)) # 10 strictly unique floating point values
[1] 10
> all.equal(DT$a,rep(1,10)) # TRUE, all within tolerance of 1.0 #
[1] TRUE
> DT[,which.min(a)] # row 10, the strictly smallest floating point value
[1] 10
> identical(unique(DT),DT[1]) # FALSE here: unique() compares doubles exactly, so all 10 rows are kept (TRUE only under older tolerance-based defaults)
[1] FALSE
> identical(unique(DT),DT[10]) # FALSE
[1] FALSE
# Row-level duplicate detection on a keyed data.table.
library(data.table)

# Build the demo table here so this section runs on its own; without it the
# calls below would reference a `DT` that no executable line has created.
DT <- data.table(
  A = rep(1:3, each = 4),
  B = rep(1:4, each = 3),
  C = rep(1:2, 6),
  key = "A,B"
)

# With no `by`, the comparison is made on whole rows.
duplicated(DT)  # logical vector, TRUE where a row repeats an earlier row
unique(DT)      # the rows for which duplicated() is FALSE

# Compare on column B only; unique() still returns every column.
duplicated(DT, by = "B")
unique(DT, by = "B")

# Compare on the (A, C) pair.
duplicated(DT, by = c("A", "C"))
unique(DT, by = c("A", "C"))
# unique() on data.tables without a key -- worth understanding thoroughly.

# Integer columns, no key: row 3 repeats row 1, so rows 1 and 2 are returned.
DT <- data.table(a = c(2L, 1L, 2L), b = c(1L, 2L, 1L))
unique(DT)

# Double columns holding exactly repeated values: rows 1, 2 and 5 are returned.
DT <- data.table(
  a = c(3.142, 4.2, 4.2, 3.142, 1.223, 1.223),
  b = rep(1, times = 6)
)
unique(DT)

# Example taken from ?all.equal: ten values that all print as 1 but differ in
# their last bits.
DT <- data.table(a = tan(pi * (1 / 4 + 1:10)), b = rep(1, times = 10))

# Column `a` holds 10 strictly distinct doubles; column `b` is the constant 1
# (also a double, not an integer), so its entries are exactly equal.
length(unique(DT$a))

# TRUE: every element of `a` is within all.equal()'s tolerance of 1.0.
all.equal(DT$a, rep(1, times = 10))

# Row 10 holds the strictly smallest floating point value.
DT[, which.min(a)]

# Both comparisons give FALSE when unique() compares doubles exactly, because
# all 10 rows are then kept. The first was TRUE only under older data.table
# defaults that rounded doubles before comparing -- NOTE(review): confirm
# against the installed version (see ?setNumericRounding).
identical(unique(DT), DT[1])
identical(unique(DT), DT[10])
# fromLast = TRUE: scan from the bottom, so the last row of each run of
# repeats is the one treated as the original.
DT <- data.table(
  A = rep(1:3, each = 4),
  B = rep(1:4, each = 3),
  C = rep(1:2, 6),
  key = "A,B"
)
duplicated(DT, by = "B", fromLast = TRUE)
unique(DT, by = "B", fromLast = TRUE)

# anyDuplicated: index of the first repeated row, 0L when nothing repeats.
# Rows 1 and 2 share A = 1, B = 1, so the expected value here is 2L.
anyDuplicated(DT, by = c("A", "B"))
any(duplicated(DT, by = c("A", "B")))  # TRUE

# uniqueN: number of unique rows on the key columns -- 6 distinct (A, B) pairs.
uniqueN(DT, by = key(DT))

# uniqueN: number of unique rows across all columns -- 10.
uniqueN(DT)

# uniqueN evaluated per group of A; .SD holds the remaining columns B and C.
DT[, .(uN = uniqueN(.SD)), by = A]

# uniqueN's na.rm argument. Shuffle two copies of each value instead of
# sampling with replacement, so all five values are guaranteed to be present
# and the counts below hold on every run (x still has length 10).
x <- sample(rep(c(NA, NaN, runif(3)), times = 2L))
uniqueN(x, na.rm = FALSE)  # 5 (default): NA, NaN and the three random draws
uniqueN(x, na.rm = TRUE)   # 3: NA and NaN are dropped before counting