dplyr高效数据处理包

Merry Christmas


目录

1,基本操作

2,高级操作

3,管道操作

4,apply家族函数&向量化操作


1,基本操作

-filter

-select

-arrange

-mutate

-distinct


数据加载

library(dplyr)
# Load the two input tables from CSV files in the working directory.
# Strings are kept as character vectors (no automatic factor conversion).
reserve <- read.csv(
  "reserve.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = ",",
  encoding = "UTF-8"
)
# NOTE: this data frame shares its name with base::apply(). Calls written as
# apply(...) still reach the function, because R skips non-function objects
# when resolving a call.
apply <- read.csv(
  "apply.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = ",",
  encoding = "UTF-8"
)
# Keep only rows with a positive apply_persons value; subset() also drops
# rows where the condition evaluates to NA.
apply <- subset(apply, apply_persons > 0)

filter

# Rows where apply_persons exceeds 10.
apply %>%
  filter(apply_persons > 10)
# Rows where apply_persons exceeds 10 and the airline is "MU"; conditions
# separated by commas are combined with a logical AND.
apply %>%
  filter(apply_persons > 10, airline == "MU")

select

# Keep two columns by name.
a <- apply %>% select(fline, apply_persons)
# Keep the contiguous run of columns from fline through airline.
b <- apply %>% select(fline:airline)
# Drop that same run of columns and keep everything else.
d <- apply %>% select(-(fline:airline))
# Preview the first rows of each result.
head(a)
head(b)
head(d)

arrange

# Sort ascending by fline, breaking ties by apply_persons.
e <- apply %>% arrange(fline, apply_persons)
# Sort descending by apply_persons, breaking ties by fline (ascending).
f <- apply %>% arrange(desc(apply_persons), fline)
head(e)
head(f)

mutate

# Add a min-max scaled copy of apply_persons as column `std`:
#   std = (x - min(x)) / (max(x) - min(x))
# The denominator is the range (max minus min), which maps the column
# onto [0, 1].
# NOTE: if every value is identical the range is 0 and std becomes NaN.
mutateapply <- mutate(
  apply,
  std = (apply_persons - min(apply_persons)) /
    (max(apply_persons) - min(apply_persons))
)
head(mutateapply)

distinct

n_distinct

# Build 1, 2, 2, 3, 3, 3: each value i is repeated i times.
y <- rep(1:3, times = 1:3)
# Show the vector, then the number of distinct values it contains.
y
n_distinct(y)

distinct

# Toy data frame: 100 rows, each column drawn with replacement from 1..5.
# The `replace` argument is spelled out in full rather than relying on
# partial argument matching.
df <- data.frame(
  x = sample(5, 100, replace = TRUE),
  y = sample(5, 100, replace = TRUE)
)
# Total row count versus the count of unique (x, y) combinations.
nrow(df)
nrow(distinct(df))
# Unique values of a single column.
distinct(df, x)
distinct(df, y)

2,高级操作

-join

-group_by&summarise

-ranking


join

-inner_join

-left_join

-right_join

-full_join

-semi_join

-anti_join


full_join

# Full outer join of the two tables on their five shared key columns.
reservedata <- reserve %>%
  full_join(
    apply,
    by = c("dt", "fline", "airline", "interval", "go_takeoff")
  )
head(reservedata, n = 10)

# Clean up the joined table:
#   - missing counts become 0,
#   - dt is parsed as a Date from "year/month/day" text,
#   - the remaining key columns become factors.
reservedata <- reservedata %>%
  mutate(
    apply_persons = replace(apply_persons, is.na(apply_persons), 0),
    reserveseat_cnt = replace(reserveseat_cnt, is.na(reserveseat_cnt), 0),
    dt = as.Date(dt, format = "%Y/%m/%d"),
    airline = factor(airline),
    fline = factor(fline),
    go_takeoff = factor(go_takeoff),
    interval = factor(interval)
  )

group_by&summarise

# Summarise by route (fline): row count, total reserved seats, total
# applicants, and CR = sumapply / sumreserve.
by_fline <- reservedata %>%
  group_by(fline)
sum_fline <- by_fline %>%
  summarise(
    count = n(),
    sumreserve = sum(reserveseat_cnt),
    sumapply = sum(apply_persons),
    CR = sumapply / sumreserve
  )
# High-quality routes: all three thresholds must hold.
main_fline <- sum_fline %>%
  filter(CR > 0.1, sumapply > 90, sumreserve > 400)
main_fline

ranking

-row_number(x)

-min_rank(x)

-dense_rank(x)


# Ten normal draws (mean 10, sd 5), rounded to whole numbers so that ties
# can occur.
x <- round(rnorm(n = 10, mean = 10, sd = 5), digits = 0)
# Compare the three ranking functions on the same vector.
row_number(x)
min_rank(x)
dense_rank(x)

3,管道操作

-%>%

# Summarise by route (fline) in a single pipeline, then derive
# CR = sumapply / sumreserve.
by_fline <- reservedata %>%
  group_by(fline) %>%
  summarise(
    sumreserve = sum(reserveseat_cnt),
    sumapply = sum(apply_persons)
  ) %>%
  mutate(CR = sumapply / sumreserve)
# High-quality routes: all three thresholds must hold.
main_fline <- by_fline %>%
  filter(CR > 0.1, sumapply > 90, sumreserve > 400)

4,apply家族函数&向量化操作

-apply

-sapply

-lapply

-tapply


apply

apply(X, MARGIN, FUN, ...)

# Small score table: one row per student, one column per subject.
data <- data.frame(
  name = c("Jack", "Tom", "Mary", "Lily"),
  math = c(70, 80, 90, 100),
  english = c(95, 100, 85, 90)
)
# MARGIN = 1 applies the function across each row: mean score per student.
apply(data[, c("math", "english")], MARGIN = 1, FUN = mean)
# MARGIN = 2 applies the function down each column: total score per subject.
apply(data[, c("math", "english")], MARGIN = 2, FUN = sum)

sapply

sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)

# Per-column overview of the built-in iris data set.
summary(iris)

# Coefficient of variation (sd / mean) for each of the four numeric
# columns; sapply() simplifies the result to a named numeric vector.
sapply(
  iris[, 1:4],
  function(column) sd(column) / mean(column)
)

lapply

lapply(X, FUN, ...)

# Standard deviation of each numeric column, returned as a named list.
lapply(iris[, 1:4], sd)
# The same list converted to a one-row data frame.
as.data.frame(lapply(iris[, 1:4], sd))

# Return the mean and standard deviation of a numeric vector.
#
# x: a numeric vector.
# Returns an unnamed numeric vector of length 2: c(mean, sd).
myfunc <- function(x) {
  c(mean(x), sd(x))
}

# One c(mean, sd) pair per column; as a data frame, row 1 holds the means
# and row 2 the standard deviations.
result <- lapply(iris[, 1:4], myfunc)
as.data.frame(result)

tapply

# Mean sepal length per species; the grouping factor is supplied as a
# one-element list.
tapply(
  X = iris$Sepal.Length,
  INDEX = list(iris$Species),
  FUN = mean
)
# The same computation evaluated inside iris, so the columns can be
# referred to by bare name.
with(
  iris,
  tapply(Sepal.Length, Species, mean)
)

END


Reference

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值