#参考了http://blog.csdn.net/qq_34773726/article/details/72546163?locationNum=12&fps=1
library(stringr)
library(xml2)
library(rvest)
# Accumulators filled by the scraping loop: a data frame of listing fields
# and a flat vector of keyword tags.
alldf <- data.frame()
keywords_final <- vector()
# Search-results URL, worked out by inspecting the page structure; it ends
# in "page=" so that a page number can be appended.
site <- "https://www.zhipin.com/c101020100/h_101020100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&page="
# Scrape the result pages; per the original author only 20 pages are readable.

# Take the first match of `pattern` in each HTML string, then trim the
# delimiter characters around it. `start` and `end` are passed to str_sub()
# (negative `end` counts from the end of the string).
extract_field <- function(html, pattern, start, end) {
  str_sub(str_extract(html, pattern), start = start, end = end)
}

for (page in 1:20) {
  # Build the URL in its own variable. Assigning the result back to `site`
  # would make the page numbers accumulate across iterations
  # ("...page=1", "...page=12", "...page=123", ...).
  page_url <- paste0(site, page)
  webpage <- read_html(page_url)

  # Nodes are serialised to HTML strings explicitly before regex matching,
  # rather than relying on implicit coercion inside stringr.

  # Job title: text between ">" and "<span" in the title link.
  job_html <- as.character(
    html_nodes(webpage, ".info-primary .name") %>% html_nodes("a")
  )
  job <- extract_field(job_html, ">.+<span", 2, -6)

  # Company name: text of the company link.
  company_html <- as.character(
    html_nodes(webpage, ".info-company .company-text .name") %>% html_node("a")
  )
  company <- extract_field(company_html, ">.+</a", 2, -4)

  # Salary: content of the ".red" span inside the title block.
  salary_html <- as.character(
    html_nodes(webpage, ".info-primary .name .red")
  )
  salary <- extract_field(salary_html, ">.+</span", 2, -7)

  # The first <p> under .info-primary holds three <em>-separated fields:
  # location, experience, education.
  job_info_html <- as.character(
    html_nodes(webpage, ".job-primary .info-primary") %>% html_node("p")
  )
  location <- extract_field(job_info_html, "p>.+?<em", 3, -4)
  experience <- extract_field(job_info_html, "/em>.+?<em", 5, -4)
  # The lookahead (?=((?!/em>).)*$) anchors on the last "/em>" in the string,
  # so this captures the final field before "</p".
  education <- extract_field(
    job_info_html, "/em>(?=((?!/em>).)*$).+?</p", 5, -4
  )

  # The first <p> under .company-text uses the same three-field layout:
  # company type, economy state (field name from the original script), size.
  company_info_html <- as.character(
    html_nodes(webpage, ".info-company .company-text") %>% html_node("p")
  )
  company_type <- extract_field(company_info_html, "p>.+?<em", 3, -4)
  economy_state <- extract_field(company_info_html, "/em>.+?<em", 5, -4)
  size <- extract_field(
    company_info_html, "/em>(?=((?!/em>).)*$).+?</p", 5, -4
  )

  # Publication date: text following the "发布于" prefix.
  date_html <- as.character(html_nodes(webpage, ".job-time .time"))
  date <- extract_field(date_html, "发布于.+?<", 4, -2)

  # Keyword tags: every <span> under .job-tags.
  keywords_html <- as.character(
    html_nodes(webpage, ".job-tags") %>% html_nodes("span")
  )
  keywords <- extract_field(keywords_html, ">.+<", 2, -2)

  # Combine the fields column-wise; column names come from the variable names.
  # NOTE(review): cbind() recycles shorter vectors (with a warning) if the
  # selectors return different numbers of matches on a page.
  newdf <- cbind(
    job, company, salary, location, experience, education,
    company_type, economy_state, size, date
  )
  # Append inside the loop so pages already scraped are kept in `alldf`
  # if a later request fails.
  alldf <- rbind(alldf, newdf)
  # Keywords are not tied to a listing row, so they are collected as one
  # flat vector for separate analysis.
  keywords_final <- c(keywords_final, keywords)
}
# Write the collected listings and keywords to CSV files.
write.csv(alldf, file = "alldf.csv")
write.csv(keywords_final, file = "keywords.csv")
# Frequency table of the scraped salary values.
table(alldf[["salary"]])
# Possible improvement (original author's note): html_text() can return all
# the text inside a node directly, although the author considered the
# selector + regex approach used in this script to be more portable.