目录
1,基本操作
2,高级操作
3,管道操作
4,apply家族函数&向量化操作
1,基本操作
-filter
-select
-arrange
-mutate
-distinct
数据加载(以下示例需先加载 dplyr 包,并将 apply、reserve 两个数据集读入为数据框)
filter
# Keep the rows of `apply` whose apply_persons exceeds 10.
filter(apply, apply_persons > 10)
# Same threshold, additionally restricted to airline "MU"; predicates
# separated by commas are combined with a logical AND.
filter(apply, apply_persons > 10, airline == "MU")
select
# Pick two columns by name.
a <- select(apply, fline, apply_persons)
head(a)
# Pick the contiguous range of columns from fline through airline.
b <- select(apply, fline:airline)
head(b)
# Drop that same range and keep everything else.
d <- select(apply, -c(fline:airline))
head(d)
arrange
# Sort ascending by fline, breaking ties by apply_persons.
e <- arrange(apply, fline, apply_persons)
head(e)
# Sort by apply_persons in descending order, breaking ties by fline.
f <- arrange(apply, desc(apply_persons), fline)
head(f)
mutate
# Add a min-max normalised copy of apply_persons:
#   std = (x - min(x)) / (max(x) - min(x))
# The denominator must be the range (max - min); dividing by max / min does
# not rescale the column onto [0, 1].
# NOTE(review): if every apply_persons value is identical the range is 0 and
# std becomes NaN; NA values propagate because na.rm is not set.
mutateapply <- mutate(
  apply,
  std = (apply_persons - min(apply_persons)) /
    (max(apply_persons) - min(apply_persons))
)
head(mutateapply)
distinct
n_distinct
# Repeat 1 once, 2 twice and 3 three times: 1 2 2 3 3 3.
y <- rep(1:3, times = 1:3)
y
# Count the unique values in y.
n_distinct(y)
distinct
# Build a 100-row data frame whose two columns are each drawn with
# replacement from 1:5, so at most 25 distinct (x, y) rows can exist.
# `replace` is spelled out in full rather than relying on partial argument
# matching of `rep`.
df <- data.frame(
  x = sample(5, 100, replace = TRUE),
  y = sample(5, 100, replace = TRUE)
)
# Total row count versus the number of unique rows.
nrow(df)
nrow(distinct(df))
# Unique values of a single column.
distinct(df, x)
distinct(df, y)
2,高级操作
-join
-group_by&summarise
-ranking
join
-inner_join
-left_join
-right_join
-full_join
-semi_join
-anti_join
full_join
# Full outer join of `reserve` and `apply` on the five shared key columns.
reservedata <- full_join(
  reserve,
  apply,
  by = c("dt", "fline", "airline", "interval", "go_takeoff")
)
head(reservedata, n = 10)

# Replace missing counts (e.g. rows matched in only one table) with 0.
reservedata$apply_persons <- replace(
  reservedata$apply_persons,
  is.na(reservedata$apply_persons),
  0
)
reservedata$reserveseat_cnt <- replace(
  reservedata$reserveseat_cnt,
  is.na(reservedata$reserveseat_cnt),
  0
)

# Parse dt into Date values using the slash-separated year/month/day format.
reservedata$dt <- as.Date(reservedata$dt, format = "%Y/%m/%d")

# Convert the remaining key columns to factors in one pass.
reservedata[c("airline", "fline", "go_takeoff", "interval")] <- lapply(
  reservedata[c("airline", "fline", "go_takeoff", "interval")],
  factor
)
group_by&summarise
# Summarise by route (fline): row count, total reserved seats, total
# applicants, and CR = total applicants / total reserved seats.
by_fline <- group_by(reservedata, fline)
sum_fline <- summarise(
  by_fline,
  count = n(),
  sumreserve = sum(reserveseat_cnt),
  sumapply = sum(apply_persons),
  CR = sumapply / sumreserve
)
# High-quality routes: all three thresholds must hold.
main_fline <- filter(sum_fline, CR > 0.1, sumapply > 90, sumreserve > 400)
main_fline
ranking
-row_number(x)
-min_rank(x)
-dense_rank(x)
# Draw 10 values from a normal distribution (mean 10, sd 5) and round them to
# whole numbers; rounding makes ties possible.
x <- round(rnorm(10, mean = 10, sd = 5), digits = 0)
# Apply the three ranking functions to the same vector for comparison.
row_number(x)
min_rank(x)
dense_rank(x)
3,管道操作
-%>%
# Summarise by route (fline) as a single pipeline: totals first, then the
# ratio CR = sumapply / sumreserve.
by_fline <- reservedata %>%
  group_by(fline) %>%
  summarise(
    sumreserve = sum(reserveseat_cnt),
    sumapply = sum(apply_persons)
  ) %>%
  mutate(CR = sumapply / sumreserve)
# High-quality routes: all three thresholds must hold.
main_fline <- by_fline %>%
  filter(CR > 0.1, sumapply > 90, sumreserve > 400)
4,apply家族函数&向量化操作
-apply
-sapply
-lapply
-tapply
apply
apply(X, MARGIN, FUN, …)
# Small score table: a name column followed by two numeric score columns.
data <- data.frame(
  name = c("Jack", "Tom", "Mary", "Lily"),
  math = c(70, 80, 90, 100),
  english = c(95, 100, 85, 90)
)
# MARGIN = 1 applies FUN to each row: mean score per row (name column dropped).
apply(data[, -1], MARGIN = 1, FUN = mean)
# MARGIN = 2 applies FUN to each column: total per score column.
apply(data[, -1], MARGIN = 2, FUN = sum)
sapply
sapply(X, FUN, …, simplify = TRUE, USE.NAMES = TRUE)
# Per-column summary of the built-in iris data set.
summary(iris)
# Ratio sd / mean for each of the first four columns, returned as a named
# vector.
sapply(iris[, 1:4], function(col) sd(col) / mean(col))
lapply
lapply(X, FUN, …)
# lapply returns a list holding one standard deviation per column.
lapply(iris[1:4], sd)
# The same list converted to a one-row data frame.
as.data.frame(lapply(iris[1:4], sd))
# Return the mean and standard deviation of x as an unnamed length-2 vector,
# in that order.
myfunc <- function(x) {
  c(mean(x), sd(x))
}
# Apply myfunc to each of the first four iris columns; every list element is
# that column's c(mean, sd).
result <- lapply(iris[1:4], myfunc)
# Two-row data frame: one column per iris column.
as.data.frame(result)
tapply
# Mean Sepal.Length within each Species, with every tapply argument named.
tapply(
  X = iris$Sepal.Length,
  INDEX = list(iris$Species),
  FUN = mean
)
# The same computation, using with() so the columns can be referenced without
# the iris$ prefix.
with(iris, tapply(Sepal.Length, Species, mean))