library(xml2)
library(rvest)
# Scrape zhipin.com search results for the query "数据分析" (the URL-encoded
# `query` parameter below) across result pages 1-30 and collect one row per
# job listing in the data frame `total`.
#
# `site1` and `site2` are the URL pieces placed before and after the page
# number; the page number is inserted twice (`page=<n>&ka=page-<n>`).
site1 <- "https://www.zhipin.com/c101280600/h_101280600/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&page="
site2 <- "&ka=page-"

# Empty template: this is what `total` looks like if nothing could be scraped.
# NOTE: rbind() drops zero-row data frames, so once at least one page has been
# scraped `total` carries the per-page column names (jobName, salary, X1,
# X2, ...) rather than the names declared here.
total <- data.frame(
  name = character(),
  salary = character(),
  city = character(),
  experience = character(),
  degree = character()
)

page_count <- 30L
# One slot per page; pages that fail or are empty stay NULL and are dropped
# before the final rbind(), instead of growing `total` inside the loop.
pages <- vector("list", page_count)

for (page in seq_len(page_count)) {
  site <- paste0(site1, page, site2, page)

  # A single failed request should not abort the whole run: warn and move on.
  html <- tryCatch(
    read_html(site),
    error = function(e) {
      warning("Failed to read ", site, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(html)) {
    next
  }

  # An earlier attempt took html_text() of the <h3> and split it on spaces.
  # That does not work because some job titles themselves contain spaces, so
  # the title is isolated by stripping markup from the serialized node instead.
  job <- html_nodes(html, "div.info-primary>h3.name")
  salary <- html_text(
    html_nodes(html, "div.info-primary>h3.name>span.red"),
    trim = TRUE
  )
  jobMsg <- html_nodes(html, "div.info-primary>p")

  if (length(job) == 0) {
    warning("No job listings found on page ", page, call. = FALSE)
    next
  }
  if (length(salary) != length(job) || length(jobMsg) != length(job)) {
    # Mismatched counts would be recycled by data.frame() into wrong rows.
    warning(
      "Skipping page ", page, ": found ", length(job), " titles, ",
      length(salary), " salaries and ", length(jobMsg), " detail lines",
      call. = FALSE
    )
    next
  }

  # Drop the opening <h3> tag, then replace everything from the <span ...>
  # through the closing </h3> with a space; trim the leftover whitespace.
  job <- gsub("<h3 class=\"name\">", "", job)
  jobName <- trimws(gsub("<(span.*?)(class.*?)>(.*?)</h3>", " ", job))
  df1 <- data.frame(jobName, salary)

  # Turn each <em class=...></em> separator into a space, drop the <p> and
  # </p> tags, then split every listing's detail line on spaces.
  jobMsg <- gsub("<(em.*?)(class.*?)></em>", " ", jobMsg)
  jobMsg <- gsub("<(.?p)>", "", jobMsg)
  temp <- strsplit(jobMsg, " ")

  # Pad shorter rows with NA so every listing fills exactly one matrix row,
  # and size the matrix from the number of listings actually on the page
  # rather than assuming there are always 15.
  n_fields <- max(lengths(temp))
  temp <- lapply(temp, `length<-`, n_fields)
  df2 <- data.frame(matrix(unlist(temp), nrow = length(temp), byrow = TRUE))

  df <- data.frame(df1, df2)
  pages[[page]] <- df
}

# Combine all successfully scraped pages in one rbind() call. rbind() matches
# columns by name, so every page must have produced the same set of columns.
scraped <- Filter(Negate(is.null), pages)
total <- do.call(rbind, c(list(total), scraped))
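# R 爬虫入门 (Introduction to web scraping with R)
# 最新推荐文章于 2022-11-17 20:51:55 发布 (latest recommended article published 2022-11-17 20:51:55)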