R爬虫小示例

最新推荐文章于 2024-03-11 09:33:01 发布

shushujiuhui123

最新推荐文章于 2024-03-11 09:33:01 发布

阅读量320

点赞数 1

分类专栏：R语言　文章标签：r语言、爬虫、开发语言

本文链接：https://blog.csdn.net/shushujiuhui123/article/details/121151508

版权

R语言专栏收录该内容

16 篇文章 5 订阅

订阅专栏

该博客展示了如何使用R语言进行网页爬虫，分别从安居客网站获取房产信息，包括楼盘名称、地址、户型等，并整合成数据框。同时，也演示了从浙商大官网抓取新闻标题和摘要。通过这些步骤，读者可以学习到R语言在网络爬虫方面的应用，以及如何处理和组织爬取到的数据。

摘要由CSDN通过智能技术生成

爬取安居客首页楼盘信息，并将数据整合成数据框

library(RCurl)
# Download the Anjuke listing page; `web` holds one element per line of HTML.
ur1 <- "https://hz.fang.anjuke.com/?from=navigation/"
web <- readLines(con = ur1, encoding = "UTF-8")

# Development names ----
# Keep the lines that mention "items-name", then cut out the text that starts
# right after the first `">` and stops 7 characters before the end of the line.
loupan <- web[grepl("items-name", web)]
loupan1 <- substr(
  loupan,
  start = regexpr("\">", loupan) + 2,
  stop = nchar(loupan) - 7
)
head(loupan1)
# Address ----
# Lines mentioning "list-map" carry the location text. After trimming the
# markup (text after the first `">`, minus the last 7 characters), each line is
# split on "&nbsp;" into its fields.
area <- web[grep("list-map", web)]
area1 <- substr(area, regexpr("\">", area) + 2, nchar(area) - 7)
area2 <- strsplit(area1, split = "&nbsp;")
# Flattened pieces, kept for inspection/backward compatibility.
area3 <- unlist(area2)

# Pick the 2nd, 3rd and 5th field of EACH line instead of striding through the
# flattened vector in steps of 5: one line with a different number of fields
# would otherwise shift every following row, and zero matches would make
# seq() fail. Missing fields become NA, so the three vectors always have one
# entry per matched line.
pick_field <- function(fields, k) fields[k]
area3_1 <- vapply(area2, pick_field, character(1), k = 2L)
area3_2 <- vapply(area2, pick_field, character(1), k = 3L)
area3_3 <- vapply(area2, pick_field, character(1), k = 5L)

#户型==================================================================
library(rvest)
# Layout (room types) ----
# Lines that contain the "diving-line" separator element.
huxing <- web[grep("diving-line", web)]
huxing
head(huxing, 4)
# Split every line at the separator element.
huxing1 <- strsplit(huxing, split = " <em class=\"diving-line\"></em>")
huxing1

# One result per matched line; sized from the data rather than a fixed 60.
huxing2 <- vector("list", length(huxing1))
huxing3 <- character(length(huxing1))
for (i in seq_along(huxing1)) {
  # Only the piece in front of the separator is used. (The original assigned
  # the whole strsplit() result into a single slot, which silently kept just
  # this first piece and raised a warning.)
  huxing2[[i]] <- strsplit(huxing1[[i]][1], split = "</span>")[[1]]
  # Strip the opening <span> tags from THIS listing's pieces. The original
  # always read element 1 here, so every row repeated the first listing.
  d1 <- strsplit(huxing2[[i]], split = "<span>")
  d2 <- unlist(d1)
  # Glue the fragments back together and drop all spaces.
  huxing3[i] <- gsub(" ", "", paste(d2, collapse = ""))
}
head(huxing3)
# Floor area ----
# Lines mentioning "building-area"; the value runs from just after the first
# `">` up to 7 characters before the end of the line.
proportion <- web[grepl("building-area", web)]
proportion1 <- substr(
  proportion,
  start = regexpr("\">", proportion) + 2,
  stop = nchar(proportion) - 7
)
# Sale status and property type ----
# Lines mentioning "status-icon"; the label is the text after the first `">`,
# minus the last 4 characters of the line.
status <- web[grep("status-icon", web)]
status
status1 <- substr(status, regexpr("\">", status) + 2, nchar(status) - 4)
status1
# The matches are consumed as alternating pairs: odd positions feed the
# "sale status" column, even positions the "property type" column. Positions
# are derived from the actual number of matches instead of a fixed upper
# bound of 120, which padded with NA (fewer matches) or dropped rows (more).
is_first_of_pair <- seq_along(status1) %% 2L == 1L
status2 <- status1[is_first_of_pair]
status2
status3 <- status1[!is_first_of_pair]
status3
# Feature tags ----
# Lines carrying a coloured tag element; the tag text starts after the first
# `">` (\" is an escaped double quote) and ends 7 characters before the end
# of the line.
feature <- web[grep('class="tag" style="color', web)]
feature0 <- substr(feature, regexpr("\">", feature) + 2, nchar(feature) - 7)
# Tags are consumed in consecutive groups of five (assumes five tags per
# listing -- TODO confirm against the page). `tag_slot` is each tag's position
# (1..5) inside its group, computed from the real number of matches instead
# of a fixed upper bound of 300.
tag_slot <- (seq_along(feature0) - 1L) %% 5L + 1L
feature1 <- feature0[tag_slot == 1L]
feature2 <- feature0[tag_slot == 2L]
feature3 <- feature0[tag_slot == 3L]
feature4 <- feature0[tag_slot == 4L]
feature5 <- feature0[tag_slot == 5L]
# Price ----
# Three kinds of price lines are located by their line numbers in `web`:
#   a1: lines mentioning "price-txt"
#   a2: lines containing the "周边均价<span>" (nearby average price) markup
#   a3: lines containing class="price"
a1 <- grep("price-txt", web)
a2 <- grep("周边均价<span>", web)
a3 <- grep("class=\"price\"", web)
price1 <- web[a1]
price2 <- web[a2]
price3 <- web[a3]

# Text of the "price-txt" lines: after the first `">`, minus the last 4 chars.
price1_0 <- substr(price1, regexpr("\">", price1) + 2, nchar(price1) - 4)

# Nearby-average lines: label before <span>, number inside <span>...</span>,
# and the remainder after </span> (minus the last 4 chars).
price2_0 <- substr(price2, regexpr("周边均价", price2), regexpr("<span>", price2) - 1)
price2_1 <- substr(price2, regexpr("<span>", price2) + 6, regexpr("</span>", price2) - 1)
price2_2 <- substr(price2, regexpr("</span>", price2) + 7, nchar(price2) - 4)
# "label:number+rest" with every space removed.
price_1 <- gsub(" ", "", paste0(price2_0, ":", price2_1, price2_2))
# Combined text; element k pairs the k-th a1 line with the k-th a2 line.
price_1_1 <- paste(price1_0, price_1, sep = "--")

# class="price" lines: label, number inside <span>, remainder up to </p>.
price3_0 <- substr(price3, regexpr("\">", price3) + 2, regexpr("<span>", price3) - 1)
price3_1 <- substr(price3, regexpr("<span>", price3) + 6, regexpr("</span>", price3) - 1)
price3_2 <- substr(price3, regexpr("</span>", price3) + 7, regexpr("</p>", price3) - 1)
price_3 <- paste0(price3_0, ":", price3_1, price3_2)

# Merge both kinds of price back into page order: drop each string at its
# source line number, then keep only the filled slots. The scratch vector is
# sized from the page itself (the original hard-coded 3523 lines and used the
# string "FALSE" as the empty marker, so a longer page injected NA entries
# into the result).
price <- rep(NA_character_, length(web))
price[a1] <- price_1_1
price[a3] <- price_3
pricef <- price[!is.na(price)]
# Combine everything into one data frame ----
# One row per listing; every column is a vector produced by a section above,
# so all of them must have the same length.
data <- data.frame(
  "楼盘" = loupan1,
  "区" = area3_1,
  "社区" = area3_2,
  "地址" = area3_3,
  "户型" = huxing3,
  "建筑面积" = proportion1,
  "销售状态" = status2,
  "房型性质" = status3,
  "房屋特点1" = feature1,
  "房屋特点2" = feature2,
  "房屋特点3" = feature3,
  "房屋特点4" = feature4,
  "房屋特点5" = feature5,
  "价格" = pricef
)
# Open the result in the interactive data viewer.
View(data)

爬取浙商大新闻

# University news ----
# Download the homepage at `url2`; `web2` holds one element per line of HTML.
url2 <- "http://www.hzic.edu.cn/"
web2 <- readLines(con = url2, encoding = "UTF-8")

# Headlines: lines mentioning "fz16"; the text starts 3 characters after the
# first "nk>" match and stops 9 characters before the end of the line.
title <- web2[grepl("fz16", web2)]
title
title1 <- substr(
  title,
  start = regexpr("nk>", title) + 3,
  stop = nchar(title) - 9
)
title1

# Summaries: lines containing <dd>; the text starts right after the tag and
# stops 5 characters before the end of the line.
details <- web2[grepl("<dd>", web2)]
details1 <- substr(
  details,
  start = regexpr("<dd>", details) + 4,
  stop = nchar(details) - 5
)
details1

# Pair each headline with its summary (both vectors must be the same length).
data2 <- data.frame("标题" = title1, "摘要" = details1)

由于网站实时更新，以上代码不一定能爬取到信息，仅供参考。

shushujiuhui123

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
R爬虫小示例

爬取安居客首页楼盘信息，并将数据整合成数据框library(RCurl)ur1 <- "https://hz.fang.anjuke.com/?from=navigation/"web <- readLines(ur1,encoding ="UTF-8")#筛选出楼盘名称loupan <- web[grep("items-name",web)]loupan1 <- substr(loupan,regexpr("\">",loupan)+2,nchar(loupan
复制链接

扫一扫

专栏目录