R爬虫小示例

爬取安居客首页楼盘信息,并将数据整合成数据框

library(RCurl)
# Download the Anjuke (Hangzhou) new-homes page as a character vector,
# one element per line of HTML.
ur1 <- "https://hz.fang.anjuke.com/?from=navigation/"
web <- readLines(con = ur1, encoding = "UTF-8")

# Project names: keep the lines that mention "items-name", then cut each one
# from just after the first `">` up to 7 characters before the end of the line
# (presumably the trailing characters are closing markup -- confirm against
# the live page).
loupan <- grep(pattern = "items-name", x = web, value = TRUE)
loupan1 <- substr(
  loupan,
  start = regexpr(pattern = "\">", text = loupan) + 2,
  stop = nchar(loupan) - 7
)
head(loupan1)
# Address: take the lines that mention "list-map" and trim them the same way
# as the other fields (text after the first `">`, minus the last 7 characters).
area <- grep(pattern = "list-map", x = web, value = TRUE)
area1 <- substr(
  area,
  start = regexpr(pattern = "\">", text = area) + 2,
  stop = nchar(area) - 7
)

# Break every address on the "&nbsp;" entity and flatten the pieces of all
# listings into one long vector.
area2 <- strsplit(area1, split = "&nbsp;")
area3 <- unlist(area2)

# The flattened vector is read in groups of five; pieces 2, 3 and 5 of each
# group are kept (used later as district, community and street address).
# NOTE(review): assumes every address splits into exactly 5 pieces -- confirm.
area3_1 <- area3[seq(from = 2, to = length(area3), by = 5)]
area3_2 <- area3[seq(from = 3, to = length(area3), by = 5)]
area3_3 <- area3[seq(from = 5, to = length(area3), by = 5)]

#户型==================================================================
library(rvest)
# Layouts: every line of the page that contains the "diving-line" divider.
huxing <- web[grep("diving-line", web)]
huxing
head(huxing, 4)

# Split each line on the divider element; each list entry holds the pieces
# of one listing's line.
huxing1 <- strsplit(huxing, split = " <em class=\"diving-line\"></em>")
huxing1

# For each listing, take the text in front of the first divider, remove the
# <span> / </span> tags and every space, and glue what is left into a single
# string.
#
# The previous loop always re-read the first listing (`huxing2[[1]]`), so all
# rows ended up with the same layout, and it was hard-wired to 60 listings.
# Iterating over `huxing1` itself handles any number of listings, including
# none.
huxing3 <- vapply(
  huxing1,
  function(pieces) {
    without_close <- unlist(strsplit(pieces[1], split = "</span>"))
    without_tags <- unlist(strsplit(without_close, split = "<span>"))
    gsub(" ", "", paste(without_tags, collapse = ""))
  },
  character(1)
)
head(huxing3)
# Floor area: lines that mention "building-area", reduced to the text after
# the first `">` with the last 7 characters of the line dropped.
proportion <- grep(pattern = "building-area", x = web, value = TRUE)
proportion1 <- substr(
  proportion,
  start = regexpr(pattern = "\">", text = proportion) + 2,
  stop = nchar(proportion) - 7
)
# Sale status and property type.
# Lines mentioning "status-icon" are trimmed to the text after the first `">`
# with the last 4 characters of the line dropped.
status <- web[grep("status-icon", web)]
status
status1 <- substr(status, regexpr("\">", status) + 2, nchar(status) - 4)
status1

# The matches alternate: odd positions are used as the sale status, even
# positions as the property type. Positions are derived from the actual number
# of matches instead of a fixed 120, so a page with a different number of
# listings is neither padded with NA nor truncated.
# NOTE(review): assumes exactly two "status-icon" lines per listing -- confirm.
status_pos <- seq_along(status1)
status2 <- status1[status_pos %% 2 == 1]
status2
status3 <- status1[status_pos %% 2 == 0]
status3
# Feature tags.
# Lines containing `class="tag" style="color` are trimmed to the text after
# the first `">` (`\"` is an escaped double quote) with the last 7 characters
# of the line dropped.
feature <- web[grep('class="tag" style="color', web)]
feature0 <- substr(feature, regexpr("\">", feature) + 2, nchar(feature) - 7)

# Tags are read in groups of five, one group per listing. The slot of each tag
# inside its group (1..5) is computed from the actual number of tags instead
# of a fixed 300, so a page with a different number of listings is neither
# padded with NA nor truncated.
# NOTE(review): assumes every listing carries exactly 5 tags -- confirm.
feature_slot <- (seq_along(feature0) - 1) %% 5 + 1
feature1 <- feature0[feature_slot == 1]
feature2 <- feature0[feature_slot == 2]
feature3 <- feature0[feature_slot == 3]
feature4 <- feature0[feature_slot == 4]
feature5 <- feature0[feature_slot == 5]
# Price.
# Two kinds of price markup are handled:
#   * lines mentioning "price-txt" (a1), each combined with a line containing
#     "周边均价<span>" (a2);
#   * lines containing class="price" (a3).
a1 <- grep("price-txt", web)
a2 <- grep("周边均价<span>", web)
a3 <- grep("class=\"price\"", web)
price1 <- web[a1]
price2 <- web[a2]
price3 <- web[a3]

# "price-txt" line: text after the first `">`, minus the last 4 characters.
price1_0 <- substr(price1, regexpr("\">", price1) + 2, nchar(price1) - 4)

# "周边均价" line: label (up to <span>), value (inside <span>...</span>) and
# the text that follows </span>, minus the last 4 characters.
price2_0 <- substr(price2, regexpr("周边均价", price2), regexpr("<span>", price2) - 1)
price2_1 <- substr(price2, regexpr("<span>", price2) + 6, regexpr("</span>", price2) - 1)
price2_2 <- substr(price2, regexpr("</span>", price2) + 7, nchar(price2) - 4)
# Join as "label:value<suffix>" and strip every space.
price_1 <- gsub(" ", "", paste0(price2_0, ":", price2_1, price2_2))
# NOTE(review): pairs the i-th "price-txt" line with the i-th "周边均价" line;
# this assumes a1 and a2 have the same length -- confirm.
price_1_1 <- paste(price1_0, price_1, sep = "--")

# class="price" line: label (after the first `">`), value (inside
# <span>...</span>) and the text between </span> and </p>.
price3_0 <- substr(price3, regexpr("\">", price3) + 2, regexpr("<span>", price3) - 1)
price3_1 <- substr(price3, regexpr("<span>", price3) + 6, regexpr("</span>", price3) - 1)
price3_2 <- substr(price3, regexpr("</span>", price3) + 7, regexpr("</p>", price3) - 1)
price_3 <- paste0(price3_0, ":", price3_1, price3_2)

# Put both kinds of price back at their line positions so they come out in
# page order, then drop the untouched slots. The scratch vector is sized from
# the page itself rather than a fixed 3523 lines; with the fixed size, a longer
# page produced NA entries that survived the old `!= "FALSE"` filter.
# Every real price string is non-empty (it always contains "--" or ":"), so
# empty strings mark the unused slots.
price <- character(length(web))
price[a1] <- price_1_1
price[a3] <- price_3
pricef <- price[nzchar(price)]
# Combine all scraped fields into one data frame, one row per listing.
fields <- list(
  "楼盘" = loupan1,
  "区" = area3_1,
  "社区" = area3_2,
  "地址" = area3_3,
  "户型" = huxing3,
  "建筑面积" = proportion1,
  "销售状态" = status2,
  "房型性质" = status3,
  "房屋特点1" = feature1,
  "房屋特点2" = feature2,
  "房屋特点3" = feature3,
  "房屋特点4" = feature4,
  "房屋特点5" = feature5,
  "价格" = pricef
)

# data.frame() silently recycles columns whose lengths divide evenly, which
# would misalign listings without any error. Fail loudly instead and report
# the length of every field so the broken selector is easy to spot.
field_lengths <- lengths(fields)
if (length(unique(field_lengths)) != 1) {
  stop(
    "Scraped fields have different lengths: ",
    paste(sprintf("%s=%d", names(fields), field_lengths), collapse = ", "),
    call. = FALSE
  )
}

data <- do.call(data.frame, fields)

# The spreadsheet viewer is only opened in an interactive session, so the
# script can also be run with Rscript.
if (interactive()) {
  View(data)
}

爬取浙商大新闻

# College news ===============================================================
# Download the home page of www.hzic.edu.cn, one element per line of HTML.
url2 <- "http://www.hzic.edu.cn/"
web2 <- readLines(con = url2, encoding = "UTF-8")

# Headlines: lines mentioning "fz16"; keep the text after the first "nk>"
# and drop the last 9 characters of the line.
title <- grep(pattern = "fz16", x = web2, value = TRUE)
title
title1 <- substr(
  title,
  start = regexpr(pattern = "nk>", text = title) + 3,
  stop = nchar(title) - 9
)
title1

# Summaries: lines containing "<dd>"; keep the text after the tag and drop
# the last 5 characters of the line.
details <- grep(pattern = "<dd>", x = web2, value = TRUE)
details1 <- substr(
  details,
  start = regexpr(pattern = "<dd>", text = details) + 4,
  stop = nchar(details) - 5
)
details1

# One row per news item: headline plus summary.
data2 <- data.frame(
  "标题" = title1,
  "摘要" = details1
)

由于网站的内容和页面结构会实时更新,以上代码中写死的行数、条数和截取位置可能已经失效,不一定能爬取到信息,仅供参考。

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值