# 利用R基础包
# Base-R walkthrough of basic data management on a small survey data set.
# Mid-level employees rate their managers; q1-q5 are five rating dimensions.
leadership <- data.frame(
  manager = c(1, 2, 3, 4, 5),
  date = c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09"),
  country = c("US", "US", "UK", "UK", "UK"),
  gender = c("M", "F", "F", "M", "F"),
  age = c(32, 45, 25, 39, 156),
  q1 = c(4, 5, 5, 3, 2),
  q2 = c(5, 3, 3, 3, 2),
  q3 = c(5, 2, 5, 4, 1),
  q4 = c(5, 5, 5, NA, 2),
  q5 = c(5, 5, 2, NA, 1)
)

# Outliers: an age of 156 is clearly invalid, so anything above 65 becomes NA.
leadership$age[leadership$age > 65] <- NA

# Missing values:
# 1. either drop every observation (row) that contains a missing value
#    newdata <- na.omit(leadership)
# 2. or replace the missing values, here with the rounded column mean
#    (na.rm = TRUE excludes the NAs from the mean).
leadership$q4[is.na(leadership$q4)] <- round(mean(leadership$q4, na.rm = TRUE))
leadership$q5[is.na(leadership$q5)] <- round(mean(leadership$q5, na.rm = TRUE))
print(leadership)

# Adding variables: total and mean score per manager.
# Method 1: plain column assignment.
leadership$total_score <- with(leadership, q1 + q2 + q3 + q4 + q5)
# Method 2: transform().
leadership <- transform(leadership, mean_score = (q1 + q2 + q3 + q4 + q5) / 5)
print(leadership)

# Recode the numeric age into three categories:
#   [0, 30) -> "Young", [30, 60) -> "Middle Aged", [60, Inf) -> "Elder".
# The binning is done in ONE step on the still-numeric column. Overwriting the
# column piecewise with strings would coerce it to character after the first
# assignment and make the remaining comparisons lexicographic (e.g. "5" is not
# < "30"). The result is kept as character so that later `==` filters and the
# printed output stay the same; NA ages remain NA.
leadership$age <- as.character(cut(
  leadership$age,
  breaks = c(-Inf, 30, 60, Inf),
  labels = c("Young", "Middle Aged", "Elder"),
  right = FALSE
))
print(leadership)

# Convert the date strings to class Date; "%m/%d/%y" is the input format.
leadership$date <- as.Date(leadership$date, "%m/%d/%y")

# Rename the rating columns q1..q5 to item1..item5 (matched by name rather
# than by position, so extra or reordered columns cannot break the rename).
names(leadership) <- sub("^q([1-5])$", "item\\1", names(leadership))

# Sort by mean score, descending: order() returns row indices and the minus
# sign reverses the direction.
leadershipBymscore <- leadership[order(-leadership$mean_score), ]

# Inspect the manager and mean_score columns of the sorted data.
# print(leadershipBymscore[c("manager", "mean_score")])
print(leadershipBymscore[, c("manager", "mean_score")])

# Drop rows whose age is missing, then select the middle-aged managers.
leadershipBymscore <- leadershipBymscore[!is.na(leadershipBymscore$age), ]
# Method 1: logical row indexing.
print(leadershipBymscore[leadershipBymscore$age == "Middle Aged", ])
# Method 2: subset().
print(subset(leadershipBymscore, age == "Middle Aged",
             select = manager:mean_score))

# Draw 2 rows without replacement; sample() returns the row indices.
# seq_len() is used instead of 1:nrow() so that an empty table yields an
# empty index set rather than c(1, 0).
print(leadershipBymscore[
  sample(seq_len(nrow(leadershipBymscore)), 2, replace = FALSE),
])
# 利用dplyr包
# dplyr walkthrough of basic data management on the same survey data.
library(dplyr)

# Mid-level employees rate their managers; q1-q5 are five rating dimensions.
# The raw data is used as-is here (no outlier or NA handling), so manager 4's
# derived scores are NA.
leadership <- data.frame(
  manager = c(1, 2, 3, 4, 5),
  date = c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09"),
  country = c("US", "US", "UK", "UK", "UK"),
  gender = c("M", "F", "F", "M", "F"),
  age = c(32, 45, 25, 39, 156),
  q1 = c(4, 5, 5, 3, 2),
  q2 = c(5, 3, 3, 3, 2),
  q3 = c(5, 2, 5, 4, 1),
  q4 = c(5, 5, 5, NA, 2),
  q5 = c(5, 5, 2, NA, 1)
)

# Add variables. The total must cover all five ratings (q1..q5) so that it
# agrees with mean_score, which divides the same five-item sum by 5.
leadership <- mutate(
  leadership,
  total_score = q1 + q2 + q3 + q4 + q5,
  mean_score = (q1 + q2 + q3 + q4 + q5) / 5
)

# Recode the gender labels M / F.
leadership$gender <- recode(leadership$gender, "M" = "male", "F" = "female")

# Rename columns (new_name = old_name).
leadership <- rename(leadership, ID = manager, sex = gender)

# Sort by sex, then by total score within each sex (ascending);
# use desc(total_score) for descending order.
leadership <- arrange(leadership, sex, total_score)

# Select columns; a leading minus (e.g. -sex) drops a column, and helpers
# such as starts_with() are also available.
newD <- select(leadership, ID, sex, mean_score)

# Select rows by condition.
newD1 <- filter(leadership, sex == "male")