data.table
- 查看代码[1]
data.table的基本框架
图片引自:https://rstudio.com/
创建data.table
setDT()
setDT()
适用于对'list', 'data.table', 'data.frame'这三种类型,它比as.data.table要快,是以传地址的方式直接修改对象。> fruit = data.frame(x=rep(c("apple","banana","orange"),each=2), y=c(1,3,6), z=1:6) x y z1 apple 1 12 apple 3 23 banana 6 34 banana 1 45 orange 3 56 orange 6 6> class(fruit)[1] "data.frame"> setDT(fruit)> class(fruit)[1] "data.table" "data.frame"
as.data.table()
as.data.table()
的适用范围更广data.table::copy()
复制数据起一个新的名字,因为data.table的部分函数在使用的过程中会直接对原来的数据进行改写,为了防止原来的数据被改变,使用拷贝的文件。
> dt x y z1: apple 1 12: apple 3 23: banana 6 34: banana 1 45: orange 3 56: orange 6 6
setnames(x,old,new)
设置x的列名> setnames(dt, c("x","y","z"), c("name", "number", "money"))> dt name number money1: apple 1 12: apple 3 23: banana 6 34: banana 1 45: orange 3 56: orange 6 6
setcolorder(x,neworder)
重新安排列的顺序
> setcolorder(dt, c("name", "money", "number"))> dt name money number1: apple 1 12: apple 2 33: banana 3 64: banana 4 15: orange 5 36: orange 6 6
对行 i 进行操作
按条件选择行
<
、>
、<=
、>=
、%in%
、!
、&
、|
、%like%
、%between%
、==
、is.na()
、!is.na()
...> dt[name == "banana",] name money number1: banana 3 62: banana 4 1
> dt[name %like% "e",] name money number1: apple 1 12: apple 2 33: orange 5 34: orange 6 6
> dt[number %between% c(3,7),] name money number1: apple 2 32: banana 3 63: orange 5 34: orange 6 6
对列 j 进行操作
按条件选取列
> dt[,2] money1: 12: 23: 34: 45: 56: 6
> dt[, -2] name number1: apple 12: apple 33: banana 64: banana 15: orange 36: orange 6
> dt[, c("name", "number")] name number1: apple 12: apple 33: banana 64: banana 15: orange 36: orange 6
使用
c("", "")
和.(col1, col2)
效果一样> dt[, .(name, number)] name number1: apple 12: apple 33: banana 64: banana 15: orange 36: orange 6
> dt[max(number),] d e c name money number1: 2 1 expensive orange 6 6
- 选择或删除某列
对列进行计算
sum()
、mean()
、median()
、min()
、max()
...> dt[, .(x = sum(number))] x1: 20
添加列
:=
直接在原数据上增加新列或替换旧列> dt[name == "apple", c := 1+2]> dt[name == "apple", c := 1+4]> dt name money number c1: apple 1 1 32: apple 2 3 33: banana 3 6 34: banana 4 1 35: orange 5 3 36: orange 6 6 3
替换时可以增加条件语句
> dt[, c := ifelse(money < 4, "cheap", "expensive")]> dt d e c name money number1: 2 0 cheap apple 1 12: 2 -2 cheap apple 2 33: 2 0 expensive banana 4 14: 2 2 cheap banana 3 65: 2 0 expensive orange 5 36: 2 1 expensive orange 6 6
同时增加两个及以上的列,有两种写法:
> dt[, `:=` (c = 1, d = 2)]> dt name money number c d1: apple 1 1 1 22: apple 2 3 1 23: banana 3 6 1 24: banana 4 1 1 25: orange 5 3 1 26: orange 6 6 1 2
删除列
> dt[, c := NULL]> dt name money number d1: apple 1 1 22: apple 2 3 23: banana 3 6 24: banana 4 1 25: orange 5 3 26: orange 6 6 2
更改某列的数据类型
as.integer()
、as.numeric()
、as.character()
、as.Date()
...> dt[, d := rnorm(6)]> dt name money number d1: apple 1 1 0.70790052: apple 2 3 -2.67208103: banana 3 6 2.39552924: banana 4 1 0.11275835: orange 5 3 -0.18998546: orange 6 6 1.5170863> dt[, e := as.integer(d)]> dt name money number d e1: apple 1 1 0.7079005 02: apple 2 3 -2.6720810 -23: banana 3 6 2.3955292 24: banana 4 1 0.1127583 05: orange 5 3 -0.1899854 06: orange 6 6 1.5170863 1
- 基本计算
- 直接写法
list写法
> dt[, c("c", "d") := list(1, 2)]
用by进行分组
基本操作
> dt[, name, by = .(number)] number name1: 1 apple2: 1 banana3: 3 apple4: 3 orange5: 6 banana6: 6 orange
每组进行计算
> dt[, .(c = sum(money)), by = number] number c1: 1 52: 3 73: 6 9
> dt[, c:= sum(money), by = number]> dt name money number d e c1: apple 1 1 0.7079005 0 52: apple 2 3 -2.6720810 -2 73: banana 3 6 2.3955292 2 94: banana 4 1 0.1127583 0 55: orange 5 3 -0.1899854 0 76: orange 6 6 1.5170863 1 9
取分组后每组的首行
> dt[, .SD[1], by = number] number name money d e c1: 1 apple 1 0.7079005 0 52: 3 apple 2 -2.6720810 -2 73: 6 banana 3 2.3955292 2 9
取分组后每组的尾行
> dt[, .SD[.N], by = number] number name money d e c1: 1 banana 4 0.1127583 0 52: 3 orange 5 -0.1899854 0 73: 6 orange 6 1.5170863 1 9
- 分组
data.table的常用函数
setcolorder(x,neworder)
重新安排列的顺序> setcolorder(dt, c("d","e","c","name", "money", "number"))> dt d e c name money number1: 0.1127583 0 5 banana 4 12: 0.7079005 0 5 apple 1 13: -0.1899854 0 7 orange 5 34: -2.6720810 -2 7 apple 2 35: 1.5170863 1 9 orange 6 66: 2.3955292 2 9 banana 3 6
setorder(x, order1, -order2)
重新安排行的顺序先对order1进行升序,再在order1的基础上对order2进行降序
> setorder(dt, number, -money)> dt name money number d e c1: banana 4 1 0.1127583 0 52: apple 1 1 0.7079005 0 53: orange 5 3 -0.1899854 0 74: apple 2 3 -2.6720810 -2 75: orange 6 6 1.5170863 1 96: banana 3 6 2.3955292 2 9
unique()
去除重复- 根据by这列提取非重复的行
uniqueN(dt, by = c(""))
计数非重复的行
> unique(dt, by = c("name")) d e c name money number1: 0.1127583 0 5 banana 4 12: 0.7079005 0 5 apple 1 13: -0.1899854 0 7 orange 5 3> uniqueN(dt, by = c("name"))[1] 3
key(dt, colname)
设置索引setkey(dt, NULL)
去除索引⚠️:当提取的索引是数字时格式不同
> setkey(dt, name)> dt d e c name money number1: -2.6720810 -2 7 apple 2 32: 0.7079005 0 5 apple 1 13: 0.1127583 0 5 banana 4 14: 2.3955292 2 9 banana 3 65: -0.1899854 0 7 orange 5 36: 1.5170863 1 9 orange 6 6> dt["banana", ] d e c name money number1: 0.1127583 0 5 banana 4 12: 2.3955292 2 9 banana 3 6
> setkey(dt, c)> dt["7", ]Error in bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, : Incompatible join types: x.c (integer) and i.V1 (character)Called from: bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose = verbose)Browse[1]> Q> dt[.(7)] d e c name money number1: -2.6720810 -2 7 apple 2 32: -0.1899854 0 7 orange 5 3
> setkey(dt, number, name)> dt d e c name money number1: 0.7079005 0 5 apple 1 12: 0.1127583 0 5 banana 4 13: -2.6720810 -2 7 apple 2 34: -0.1899854 0 7 orange 5 35: 2.3955292 2 9 banana 3 66: 1.5170863 1 9 orange 6 6
> dt[.(3,"apple")] d e c name money number1: -2.672081 -2 7 apple 2 3> dt[.("orange", 3:6)] d e c name money number1: -0.1899854 0 7 orange 5 32: NA NA NA orange NA 43: NA NA NA orange NA 54: 1.5170863 1 9 orange 6 6> dt[.("orange", 3:6), roll = TRUE] d e c name money number1: -0.1899854 0 7 orange 5 32: -0.1899854 0 7 orange 5 43: -0.1899854 0 7 orange 5 54: 1.5170863 1 9 orange 6 6> dt[.("orange", 3:6), roll = -Inf] d e c name money number1: -0.1899854 0 7 orange 5 32: 1.5170863 1 9 orange 6 43: 1.5170863 1 9 orange 6 54: 1.5170863 1 9 orange 6 6> dt[.("orange", 3:6), nomatch = 0] d e c name money number1: -0.1899854 0 7 orange 5 32: 1.5170863 1 9 orange 6 6> dt[!.("orange", 3)] d e c name money number1: 0.7079005 0 5 apple 1 12: -2.6720810 -2 7 apple 2 33: 0.1127583 0 5 banana 4 14: 2.3955292 2 9 banana 3 65: 1.5170863 1 9 orange 6 6
检查索引
haskey(dt)
: 返回逻辑值,检查是否存在索引key(dt)
:检查索引内容针对索引进行筛选
⚠️:
roll = TRUE
没有的信息用上一条代替⚠️:
roll = -Inf
没有的信息用下一条代替- 可以设置多个索引
> haskey(dt)[1] TRUE> key(dt)[1] "number" "name"
可以使用索引简化计算
举例1:计算name为apple所在行的number值总和
> setkey(dt, name)> dt["apple", sum(number)][1] 4> dt d e c name money number1: 0.7079005 0 5 apple 1 12: -2.6720810 -2 7 apple 2 33: 0.1127583 0 5 banana 4 14: 2.3955292 2 9 banana 3 65: -0.1899854 0 7 orange 5 36: 1.5170863 1 9 orange 6 6
举例2:按照name分组计算number之和(没有索引也可以做)
使用索引
> setkey(dt, name)> dt[c("apple","banana","orange"), sum(number), by = .EACHI] name V11: apple 42: banana 73: orange 9> dt[c("apple","banana","orange"), sum(number)][1] 20
不使用索引
> dt[, sum(number), by =name] name V11: apple 42: banana 73: orange 9
组合data.table
按相同的列内容进行data.table组合
> dt_a + b = c("c","a","b"))> dt_a a b1: 1 c2: 2 a3: 3 b> dt_b + y = c("b","c","b"))> dt_b x y1: 3 b2: 2 c3: 1 b> dt_a[dt_b, on = .(b = y)] a b x1: 3 b 32: 1 c 23: 3 b 1
条件选择组合
> dt_a[dt_b, on = .(b = y)] a b c x z1: 3 b 6 3 42: 1 c 7 2 53: 2 a 5 1 8> dt_a[dt_b, on = .(b = y, c > z)] a b c x1: 3 b 4 32: 1 c 5 23: NA a 8 1
bind组合两个data.table
rbind()
> dt_a a b1: 1 c2: 2 a3: 3 b> dt_b a b1: 3 x2: 2 y3: 1 z> rbind(dt_a, dt_b) a b1: 1 c2: 2 a3: 3 b4: 3 x5: 2 y6: 1 z
cbind()
读取或写出文件
fread(".csv", select = c("a","b"))
读取.csv或.tsv格式的文件,可以选择特定列读取fwrite(dt, ".csv")
输出R环境中名为dt的数据框为.csv文件
foverlaps()
foverlaps()
格式
foverlaps(x, y, by.x = if (!is.null(key(x))) key(x) else key(y), by.y = key(y), maxgap = 0L, minoverlap = 1L, type = c("any", "within", "start", "end", "equal"), mult = c("all", "first", "last"), nomatch = getOption("datatable.nomatch", NA), which = FALSE, verbose = getOption("datatable.verbose"))
看两个数据框区域是否存在overlap,使用y作为索引去x中寻找有overlap的情况
> x = data.table(chr=c("Chr1", "Chr1", "Chr2", "Chr2", "Chr2"),+ start=c(5,10, 1, 25, 50), end=c(11,20,4,52,60))> x chr start end1: Chr1 5 112: Chr1 10 203: Chr2 1 44: Chr2 25 525: Chr2 50 60> y = data.table(chr=c("Chr1", "Chr1", "Chr2"), start=c(1, 15,1),+ end=c(4, 18, 55), geneid=letters[1:3])> y chr start end geneid1: Chr1 1 4 a2: Chr1 15 18 b3: Chr2 1 55 c> setkey(y, chr, start, end)> foverlaps(x, y, type="any") chr start end geneid i.start i.end1: Chr1 NA NA 5 112: Chr1 15 18 b 10 203: Chr2 1 55 c 1 44: Chr2 1 55 c 25 525: Chr2 1 55 c 50 60
type
type = "within"
只匹配y的区域完全包含在x的区域内的情况(相等也属于within)
type = "any"
匹配y和x有重叠的区域
type = "start"
匹配start一样的情况
type = "end"
匹配end一样的情况
...
- 其他
nomatch = NULL
返回匹配得上的部分
setkey()
设置匹配索引
参数which = TRUE
是只返回两个数据框匹配情况的行号
参数mult = "first"
是返回x中第一次匹配上的行
foverlaps(x, y, type="any", mult="first")
⚠️:如果x和y索引的列名称不同时,在foverlaps()
内加上一行参数
by.x =c("", "", "")
对应y中列的名称
数据的拆分和合并
melt()
dcast()
> reshape_dt price = c("3","8","4","6"), price2 = c("4","9","5","7"), level = c("h","l","h","l"))> reshape_dt kinds price price2 level1: peach 3 4 h2: peach 8 9 l3: grape 4 5 h4: grape 6 7 l> reshape_dt_new measure.vars = c("price", "price2"), variable.name = "2price", value.name = "money")> reshape_dt_new kinds level 2price money1: peach h price 32: peach l price 83: grape h price 44: grape l price 65: peach h price2 46: peach l price2 97: grape h price2 58: grape l price2 7> dcast(reshape_dt_new, kinds + level ~ `2price`, value.var = "money") kinds level price price21: grape h 4 52: grape l 6 73: peach h 3 44: peach l 8 9
参考资料
[1]查看代码: showcase.R