这个办法可能比较原始,但是部分解决了我的问题。先抓取第 1 页生成初始数据集,再依次抓取第 2 页到第 20 页,并把取到的数据合并到这个数据集中。
现在的问题是抓取到的汉字会出现乱码,所以暂时只好不收集汉字信息。
# Scrape listing pages 1-20 under http://bj.ganji.com/fang5/haidian/ and
# collect the area, price and "time" fields of every listing into `iDataAll`.

# Install rvest only when it is missing, instead of reinstalling on every run.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
library(rvest)

# Scrape a single result page.
#
# page: page number; it is inserted into the URL as ".../o<page>/".
#
# Returns a data.frame with one row per extracted value and the columns
#   iArea  - character, digits of the `data-area` attribute
#   iPrice - character, text of the first <span> inside `div.price`
#   iTime  - numeric, digits of the `div.time` text
# data.frame() raises an error when the three selectors match a different
# number of nodes on the page.
scrape_page <- function(page) {
  gurl <- paste0("http://bj.ganji.com/fang5/haidian/o", page, "/")
  print(gurl)

  listings <- gurl %>%
    html_session() %>%
    read_html(encoding = "utf-8") %>%
    html_nodes("div.f-main-list>div>div")

  # Area: keep only the digits of the data-area attribute.
  # NOTE(review): this also removes a decimal point ("89.5" -> "895");
  # confirm that data-area is always integer-valued.
  area <- listings %>%
    html_nodes("dl>dd[data-huxing]") %>%
    html_attr("data-area") %>%
    gsub(pattern = "[^0-9]", replacement = "")

  # Price: raw text, left as character.
  price <- listings %>%
    html_nodes("dl>dd>div.price>span:first-child") %>%
    html_text()

  # Digits of the `div.time` text, converted to a number.
  # NOTE(review): the original comment labelled this field "unit price";
  # confirm what `div.time` actually holds on the page.
  time <- listings %>%
    html_nodes("dl>dd>div.time") %>%
    html_text() %>%
    gsub(pattern = "[^0-9]", replacement = "") %>%
    as.numeric()

  data.frame(
    iArea = area,
    iPrice = price,
    iTime = time,
    stringsAsFactors = FALSE
  )
}

# Fetch every page into a list first and bind once at the end; this avoids
# growing the data frame with rbind() on each iteration. A page that fails
# (network error, changed markup) is reported as a warning and skipped so the
# pages that did succeed are not lost.
page_ids <- 1:20
page_data <- lapply(page_ids, function(page) {
  tryCatch(
    scrape_page(page),
    error = function(e) {
      warning(
        "Skipping page ", page, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
})

# rbind() ignores the NULL entries left by skipped pages; if every page
# failed, `iDataAll` ends up NULL.
iDataAll <- do.call(rbind, page_data)

iDataAll
summary(iDataAll)