## Regression analysis (回归分析)
############# 设置工作路径****
## 请使用setwd函数,设置自己的工作路径,并将上述所提到的文件放到该工作路径下***
## setwd(.....)
## package: ggplot2 用于绘制各类图表
library(ggplot2)
## Restore the objects saved in temp.rda (looked up in the working directory).
## NOTE(review): presumably this is what provides jobinfo and jobinfo.new,
## which the code below uses without defining them -- confirm.
load("temp.rda") ## intermediate results produced by "descriptive statsics.R"
## Colour palette used by the plots below; the first two are built with rgb()
## from 0-255 channel values, the other two are named colours.
col1 <- rgb(red = 32, green = 40, blue = 51, maxColorValue = 255)   # a shade of black
col2 <- rgb(red = 172, green = 22, blue = 34, maxColorValue = 255)  # a dark red
col3 <- "indianred "  # a light red (literal kept exactly as in the original)
col4 <- "dimgrey"     # a grey
############ Regression analysis
## Add 0/1 dummy variables to jobinfo.new. For every category label listed
## below, a column named after the label is created with ifelse(): 1 where the
## corresponding jobinfo field equals the label, 0 otherwise (for example the
## column 北京 is 1 for rows whose 地区 is "北京").
## Columns are appended in the order the labels are listed.

## Region
for (dummy.level in c("北京", "河北", "上海", "深圳", "山西", "陕西")) {
  jobinfo.new[[dummy.level]] <- ifelse(jobinfo$地区 == dummy.level, 1, 0)
}

## Company type
for (dummy.level in c("合资", "外资", "上市公司", "民营公司",
                      "国企", "非营利机构", "创业公司", "事业单位")) {
  jobinfo.new[[dummy.level]] <- ifelse(jobinfo$公司类别 == dummy.level, 1, 0)
}

## Company size
for (dummy.level in c("少于50人", "50-150人", "150-500人", "500-1000人",
                      "1000-5000人", "5000-10000人", "10000人以上")) {
  jobinfo.new[[dummy.level]] <- ifelse(jobinfo$公司规模 == dummy.level, 1, 0)
}

## Education
for (dummy.level in c("中专", "高中", "大专", "本科", "硕士", "博士")) {
  jobinfo.new[[dummy.level]] <- ifelse(jobinfo$学历 == dummy.level, 1, 0)
}

## The loop variable is only scaffolding; do not leave it in the workspace.
rm(dummy.level)

## Required experience is copied over unchanged (no dummy coding).
jobinfo.new[["经验要求"]] <- jobinfo$经验
## Fit the regression of 对数平均薪资 on every other column of jobinfo.new.
## Baselines: 河北 for region, 国企 for company type and 少于50人 for company
## size are removed from the formula; all six education dummies stay in, so
## the education baseline is "none of the listed levels".
lm.fit <- lm(对数平均薪资~.-河北-国企-少于50人, data = jobinfo.new)
## Show the regression results
summary(lm.fit)
## Significance stars, one per coefficient in coef(lm.fit) order.
## They are derived from the fitted model's p-values instead of being typed in
## by hand, so they cannot go stale or be recycled when the data or the model
## changes. Thresholds follow the usual convention used by summary():
## p <= 0.001 "***", p <= 0.01 "**", p <= 0.05 "*", otherwise "".
p.values <- coef(summary(lm.fit))[, "Pr(>|t|)"]
## summary() drops aliased (NA) coefficients; re-align by name so that the
## result always has one entry per element of coef(lm.fit).
p.values <- p.values[names(coef(lm.fit))]
sign <- unname(
  c("***", "**", "*", "")[
    findInterval(p.values, c(0.001, 0.01, 0.05), left.open = TRUE) + 1L
  ]
)
sign[is.na(sign)] <- ""  # no p-value available -> no stars
## Label text for the plots: rounded coefficient followed by its stars.
significance <- paste0(round(coef(lm.fit), 3), sign)
## Each group of regression coefficients is collected into its own data frame
## with three columns: the coefficient (系数), its label text taken from
## `significance` (显著性水平), and the coefficient name as a factor.
## Education data frame: coefficients 32-37 of lm.fit; factor levels keep the
## order in which the coefficients appear in the model.
edu.coef <- data.frame(
  系数 = unname(coef(lm.fit)[32:37]),
  显著性水平 = significance[32:37],
  学历 = factor(names(coef(lm.fit))[32:37],
                levels = names(coef(lm.fit))[32:37]),
  check.names = FALSE
)
## Bar chart of the education coefficients. Fill colours are passed as a fixed
## vector (five col4, then one col3). Labels are drawn 0.02 below the value
## for bars 1-3 and 0.02 above it for bars 4-6.
ggplot(data = edu.coef, mapping = aes(x = 学历, y = 系数)) +
  geom_bar(stat = "identity", width = 0.6,
           fill = rep(c(col4, col3), times = c(5, 1))) +
  annotate("text",
           x = 1:6,
           y = edu.coef$系数 + rep(c(-0.02, 0.02), each = 3),
           label = edu.coef$显著性水平) +
  labs(x = "学历要求")
## Software-skill data frame: coefficients 2-13 of lm.fit. The coefficient
## names become the factor 软件要求, whose levels are ordered by increasing
## coefficient value.
software.coef <- data.frame(
  系数 = unname(coef(lm.fit)[2:13]),
  显著性水平 = significance[2:13],
  软件要求 = factor(
    names(coef(lm.fit))[2:13],
    levels = names(coef(lm.fit))[2:13][order(coef(lm.fit)[2:13])]
  ),
  check.names = FALSE
)
## Sort the rows by coefficient so that row order matches the factor levels
software.coef <- software.coef[order(software.coef$系数), ]
## Bar chart of the software coefficients. The first and last bar are filled
## with col3, the ten in between with col4. Labels are drawn 0.007 below the
## value for rows 1-5 and 0.007 above it for rows 6-12.
ggplot(data = software.coef, mapping = aes(x = 软件要求, y = 系数)) +
  geom_bar(stat = "identity", width = 0.6,
           fill = rep(c(col3, col4, col3), times = c(1, 10, 1))) +
  annotate("text",
           x = 1:12,
           y = software.coef$系数 + rep(c(-0.007, 0.007), times = c(5, 7)),
           label = software.coef$显著性水平)
## Region data frame: coefficients 14-18 of lm.fit. The coefficient names
## become the factor 地区 with levels ordered by increasing coefficient; the
## rows themselves are left in model order.
district.coef <- data.frame(
  系数 = unname(coef(lm.fit)[14:18]),
  显著性水平 = significance[14:18],
  地区 = factor(
    names(coef(lm.fit))[14:18],
    levels = names(coef(lm.fit))[14:18][order(coef(lm.fit)[14:18])]
  ),
  check.names = FALSE
)
## Bar chart of the region coefficients. Fill colours are passed as a fixed
## vector (two col4, then three col3); the label text is drawn above each bar.
ggplot(data = district.coef, mapping = aes(x = 地区, y = 系数)) +
  geom_bar(stat = "identity", width = 0.6,
           fill = rep(c(col4, col3), times = c(2, 3))) +
  geom_text(label = district.coef$显著性水平, size = 5, vjust = -0.4)
## Company-size data frame: coefficients 26-31 of lm.fit. The size labels are
## written out explicitly (rather than taken from the coefficient names) and
## turned into a factor whose levels follow the listed order.
scale.coef <- data.frame(
  系数 = unname(coef(lm.fit)[26:31]),
  显著性水平 = significance[26:31],
  公司规模 = local({
    size.labels <- c("50-150人", "150-500人", "500-1000人",
                     "1000-5000人", "5000-10000人", "10000人以上")
    factor(size.labels, levels = size.labels)
  }),
  check.names = FALSE
)
## Bar chart of the company-size coefficients. Only the second fill colour is
## col3, the rest are col4; the label text is drawn above each bar.
ggplot(data = scale.coef, mapping = aes(x = 公司规模, y = 系数)) +
  geom_bar(stat = "identity", width = 0.6,
           fill = rep(c(col4, col3, col4), times = c(1, 1, 4))) +
  geom_text(label = scale.coef$显著性水平, size = 5, vjust = -0.4)
## Company-type data frame: coefficients 19-25 of lm.fit. The coefficient
## names become the factor 公司类别, with levels ordered by increasing
## coefficient value.
type.coef <- data.frame(
  系数 = unname(coef(lm.fit)[19:25]),
  显著性水平 = significance[19:25],
  公司类别 = factor(
    names(coef(lm.fit))[19:25],
    levels = names(coef(lm.fit))[19:25][order(coef(lm.fit)[19:25])]
  ),
  check.names = FALSE
)
## Sort the rows by coefficient so that row order matches the factor levels
type.coef <- type.coef[order(type.coef$系数), ]
## Bar chart of the company-type coefficients. The last bar is filled with
## col3, the six before it with col4. Labels are drawn 0.007 below the value
## for rows 1-2 and 0.007 above it for rows 3-7.
ggplot(data = type.coef, mapping = aes(x = 公司类别, y = 系数)) +
  geom_bar(stat = "identity", width = 0.6,
           fill = rep(c(col4, col3), times = c(6, 1))) +
  annotate("text",
           x = 1:7,
           y = type.coef$系数 + rep(c(-0.007, 0.007), times = c(2, 5)),
           label = type.coef$显著性水平)
## Prediction 1 ("workplace rookie"), as described by the original author:
## uses R and Python, bachelor's degree, no work experience, company located
## in Shanghai, 87 employees, listed company.
## The 40 values are matched to the predictor columns of jobinfo.new, i.e. all
## columns except the first.
## NOTE(review): the grouping comments below assume the first column is the
## response, followed by 12 software columns and then the dummies in the order
## they were created -- confirm against jobinfo.new.
new.data1 <- setNames(
  as.data.frame(t(c(
    1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,  # software skills
    0, 0, 1, 0, 0, 0,                    # region: 上海
    0, 0, 1, 0, 0, 0, 0, 0,              # company type: 上市公司
    0, 1, 0, 0, 0, 0, 0,                 # company size: 50-150人
    0, 0, 0, 1, 0, 0,                    # education: 本科
    0                                    # required experience
  ))),
  names(jobinfo.new)[-1]
)
## Interval estimate on the log scale (confidence interval from predict())
interval1 <- predict(lm.fit, newdata = new.data1, interval = "confidence")
## Back-transform the predicted log salary to the salary scale and show it
income1 <- exp(interval1)
income1
## Prediction 2 (a highly qualified candidate), as described by the original
## author: uses R, Java, SAS and Python, master's degree, 7 years of work
## experience, company located in Beijing, small-to-medium company
## (150-500 employees), start-up.
## The 40 values are matched to the predictor columns of jobinfo.new, i.e. all
## columns except the first.
## NOTE(review): the grouping comments below assume the first column is the
## response, followed by 12 software columns and then the dummies in the order
## they were created -- confirm against jobinfo.new.
new.data2 <- setNames(
  as.data.frame(t(c(
    1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,  # software skills
    1, 0, 0, 0, 0, 0,                    # region: 北京
    0, 0, 0, 0, 0, 0, 1, 0,              # company type: 创业公司
    0, 0, 1, 0, 0, 0, 0,                 # company size: 150-500人
    0, 0, 0, 0, 1, 0,                    # education: 硕士
    7                                    # required experience (years)
  ))),
  names(jobinfo.new)[-1]
)
## Interval estimate on the log scale (confidence interval from predict())
interval2 <- predict(lm.fit, newdata = new.data2, interval = "confidence")
## Back-transform the predicted log salary to the salary scale and show it
income2 <- exp(interval2)
income2