把以前在swirl课程学的基础数据清洗操作重新整理一遍,主要包括:
- dplyr包中的select、filter、arrange、mutate、group_by、summarize函数,以及%>% 管道操算符(pip operation),“then”的意思。
- tidyr包中的gather、seperate、spread函数。
- lubridate包中date型变量记录更新等操作。
1、dplyr包(select列选择、filter行选择、arrange排序、mutate新增、group_by分组统计、summarize汇总)
(1)首先读入数据并查看数据基本信息:
1 library(dplyr) 2 3 ### Intro 4 path2csv <- file.path('2014-07-08.csv') 5 df <- read.csv(path2csv, as.is = TRUE) 6 dim(df) 7 head(df) 8 cran <- tbl_df(df) 9 cran
(2)利用几个常用函数进行数据提取,筛选,排序等操作:
1 ##基本函数 2 select(cran,ip_id,package,country) #cran为数据集名称,ip_id,package,country为列名 3 select(cran,r_arch:country) #r_arch:country表示从r_arch列取到country列 4 select(cran,country:r_arch) #反过来排序 5 select(cran,-time) #不取time列 6 select(cran,-(X:size)) #不取X到size的列 7 8 filter(cran,package=="swirl") #cran为数据集,筛选package为”swirl“的所有行 9 filter(cran,r_version=="3.1.1",country=="US") #两个筛选条件 10 filter(cran,r_version<="3.0.2",country=="IN") 11 filter(cran,country=="US"|country=="IN") #添加正则表达式 12 filter(cran,!is.na(r_version)) #筛选所有r_version非空的行 13 14 arrange(cran2,ip_id) #按ip_id排升序(默认排序方式) 15 arrange(cran2,desc(ip_id)) #按ip_id排降序 16 arrange(cran2,package,ip_id) #先按package列排序,再对ip_id排序 17 18 mutate(cran3,size_mb=size/2^20) #利用已有列新建一列size_mb 19 mutate(cran3,size_mb=size/2^20,size_gb=size_mb/2^10) #新建俩列 20 21 summarise(cran3,avg_bytes=mean(size)) #汇总,查看整体统计数据
(3)分组统计并排序:
1 by_package<-group_by(cran,package) #按package列进行分组统计 2 summarise(by_package,mean(size)) #计算分组后每组记录的平均size 3 pack_sum <- summarize(by_package, #对分组数据by_package进行汇总统计 4 count =n(), #n()统计每组频数 5 unique = n_distinct(ip_id), #统计每组有多少不同的ip_id 6 countries = n_distinct(country), 7 avg_bytes = mean(size)) 8 9 quantile(pack_sum$unique, probs = 0.99) #计算unique的99%分位数 10 top_unique<-filter(pack_sum,unique>465) #筛选频数大于465的记录 11 top_unique_sorted<-arrange(top_unique,desc(unique)) 12 top_countries <- filter(pack_sum, countries > 60) 13 result1 <- arrange(top_countries, desc(countries), avg_bytes) 14 print(result1)
(4)利用嵌套精简(3)的过程:
1 result2 <- 2 arrange( 3 filter( 4 summarize( #分组数据用summarize进行汇总 5 group_by(cran, 6 package 7 ), 8 count = n(), 9 unique = n_distinct(ip_id), 10 countries = n_distinct(country), 11 avg_bytes = mean(size) 12 ), 13 countries > 60 14 ), 15 desc(countries), 16 avg_bytes 17 ) 18 19 print(result2)
(5)利用%>%符号操作(3)中的过程:
1 # you read it, you can pronounce the %>% operator as 2 # the word 'then'. 3 result3 <- 4 cran %>% 5 group_by(package) %>% 6 summarize(count = n(), 7 unique = n_distinct(ip_id), 8 countries = n_distinct(country), 9 avg_bytes = mean(size) 10 ) %>% 11 filter(countries > 60) %>% 12 arrange(desc(countries), avg_bytes) 13 14 # Print result to console 15 print(result3)
(6)几个函数及管道符号的一起使用:
1 cran %>% 2 select(ip_id, country, package, size) %>% 3 mutate(size_mb = size / 2^20) %>% 4 filter(size_mb <= 0.5) %>% 5 arrange(desc(size_mb)) %>% 6 print
2、tidyr包
(1)gather() 和 seperate():
1 library(tidyr) 2 library(readr) 3 library(dplyr) 4 5 students2 %>% 6 gather( sex_class,count ,-grade ) %>% 7 separate( sex_class, c("sex", "class")) %>% 8 print
下面的gather( students2,sex_class,count ,-grade )
#gather Gather columns into key-value pairs;students2为数据集,grade为第一列,不参与gather;剩下的列名及数据作为键值对放入sex_class(key)和count(value)下。
接下来seperate将sex_class列分成俩列。separate :Separate one column into multiple columns.
(2)spread(): # spread:Spread a key-value pair across multiple columns.
1 students3 %>% 2 gather(class, grade, class1:class5, na.rm = TRUE) %>% 3 #name未参与gather,因此照原格式输出 4 spread(test, grade) %>% 5 # spread:Spread a key-value pair across multiple columns. 6 mutate(class=parse_number(class)) %>% 7 # parse_numeric Extract numeric component of variable. 8 print
(3)行合并以及列合并:
1 student_info <- students4 %>% 2 select(id, name, sex) %>% 3 unique() %>% #去除重复记录 4 print 5 6 gradebook <- students4 %>% 7 select(id,class,midterm,final) %>% 8 print 9 10 merge.data.frame(gradebook,student_info,by="id") 11 #列合并不同行数的数据框 12 13 passed<-mutate(passed,status="passed") 14 failed<-mutate(failed,status="failed") 15 bind_rows(passed,failed) 16 #行合并俩个数据框
(4)几个函数一起使用:
1 sat %>% 2 select(-contains("total")) %>% 3 gather(part_sex, count, -score_range) %>% 4 separate(part_sex, c("part", "sex")) %>% 5 group_by(part,sex) %>% 6 mutate( total = sum(count), #统计每组频数 7 prop = count / total) %>% #统计每组里面各分数段比例 8 print
3、lubradate包
(1)获取当前日期时间
1 library(lubridate)
wday(today()) #get the day of the week,such that 1 = Sunday, 2 = Monday
(2)时间数据的录入
1 ymd("1989-05-17") 2 ymd("1989 May 17") 3 mdy("March 12,1975") 4 dmy(25081985) 5 ymd("1920/1/2") 6 7 hms("03:22:14") 8 ymd_hms(c("2010-04-14-04-35-59", "2010-01-01 12:00:00")) 9 ymd_hms(now())
(3)更新时间数据,计算时间间隔