在今天的代码中,我们在之前链家静态爬虫的基础上,使用 RSelenium 在链家官网检索“上海交通大学”,并爬取相关搜索结果。
关于 Selenium 和 Docker 的介绍,可以参阅电子讲义(https://dongboshi.github.io/Data-Engineering/)。
# Dynamic scraper: drive a Selenium browser to search Lianjia's Shanghai
# second-hand listings and collect every page of the search results.
#
# NOTE(review): the published copy of this script had lost its line breaks and
# every `<-` assignment, together with part of many right-hand sides. Every
# expression tagged "reconstructed" below was rebuilt from the surviving
# fragments and must be confirmed against the live page before use.

# Install RSelenium on first use, then attach the packages used below.
if (!requireNamespace("RSelenium", quietly = TRUE)) {
  install.packages("RSelenium")
}
library(RSelenium)
library(XML)
library(RCurl)
library(stringr)

# Helper carried over from the earlier static scraper.
# Splits `x` on "|" and returns the j-th piece with surrounding whitespace
# trimmed (NA when `x` has fewer than `j` pieces).
gettp <- function(x, j) {
  str_trim(unlist(str_split(x, pattern = "\\|"))[j])
}

# Install Docker:
# https://docs.docker.com/docker-for-windows/install/
# docker run -d -p 4445:4444 selenium/standalone-firefox:2.53.1
# docker run -d -p 5901:5900 -p 127.0.0.1:4445:4444 --link http-server selenium/standalone-firefox-debug:2.53.1

# XPath expressions shared by the first-page walkthrough and the page loop.
# NOTE(review): the attribute test of the community XPath was lost
# ('//a[@]' in the damaged text); `data-el="region"` is reconstructed.
xp_community <- '//a[@data-el="region"]'
xp_totalprice <- '//*[contains(concat( " ", @class, " " ), concat( " ", "totalPrice", " " ))]//span'
xp_houseinfo <- '//*[contains(concat( " ", @class, " " ), concat( " ", "houseInfo", " " ))]'
xp_unitprice <- '//*[contains(concat( " ", @class, " " ), concat( " ", "unitPrice", " " ))]//span'

# Positions of the fields inside the "|"-separated houseInfo text.
# Index 1 for the layout survives in the damaged text.
# NOTE(review): the indices for size and year were lost; 2 and 6 are
# reconstructed guesses — confirm against a real houseInfo string.
type_field <- 1L
size_field <- 2L
year_field <- 6L

# Configure the (virtual) browser served by the Selenium container.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4445L,
  browserName = "firefox"
)

# Open the browser session.
remDr$open()

# Open the listing page.
url <- "https://sh.lianjia.com/ershoufang/"
remDr$navigate(url)

# Locate the search box.
xpath <- '//input[(@id = "searchInput")]'
btn <- remDr$findElement(using = 'xpath', value = xpath)

# Type the query and submit it.
# NOTE(review): only `"交通大学", key =` survived; wrapping it in list() and
# sending the "enter" key is reconstructed.
text <- list("交通大学", key = "enter")
btn$sendKeysToElement(text)

# Inspect the search result visually.
remDr$maxWindowSize()
remDr$screenshot(display = TRUE)

# Fetch the rendered page source and parse it.
# NOTE(review): the call feeding `[[1]]` was lost; getPageSource() and
# htmlParse() are reconstructed. `asText = TRUE` marks the input as HTML
# content rather than a file name.
file <- remDr$getPageSource()[[1]]
parsed_doc <- htmlParse(file, asText = TRUE, encoding = "UTF-8")

# Extract the listing fields of the first result page.
community <- xpathSApply(parsed_doc, xp_community, xmlValue)
totalprice <- xpathSApply(parsed_doc, xp_totalprice, xmlValue)
type_all <- xpathSApply(parsed_doc, xp_houseinfo, xmlValue)
type <- vapply(type_all, gettp, character(1), j = type_field,
               USE.NAMES = FALSE)
size <- vapply(type_all, gettp, character(1), j = size_field,
               USE.NAMES = FALSE)
year <- vapply(type_all, gettp, character(1), j = year_field,
               USE.NAMES = FALSE)
avgprice <- xpathSApply(parsed_doc, xp_unitprice, xmlValue)
# NOTE(review): only `type,size,year,avgprice)` survived; the data.frame()
# call and its first two columns are reconstructed.
result <- data.frame(community, totalprice, type, size, year, avgprice)

# Click through to page 2.
# NOTE(review): the XPath value was lost; the `data-page` selector is
# reconstructed.
pg <- remDr$findElement(using = "xpath", value = '//a[@data-page="2"]')
#pg$highlightElement()
pg$clickElement()
remDr$getCurrentUrl()

# Before repeating the extraction we need the total number of result pages:
# take the first run of digits in the `page-data` attribute.
page <- as.numeric(str_extract(
  xpathSApply(parsed_doc, '/html/body/div[4]/div[1]/div[8]/div[2]/div',
              xmlGetAttr, "page-data"),
  pattern = "[0-9]+"
))
# Fail loudly instead of letting `1:page` iterate over garbage when the
# absolute XPath above no longer matches the page layout.
if (length(page) != 1L || is.na(page)) {
  stop("Could not read the total page count from the 'page-data' attribute.",
       call. = FALSE)
}

# Loop over every result page.
# NOTE(review): the initial value of `house` was lost; NULL is reconstructed.
house <- NULL
for (i in seq_len(page)) {
  # NOTE(review): the paste0() arguments were lost; this selector is
  # reconstructed to match the page-2 click above.
  pg <- remDr$findElement(
    using = "xpath",
    value = paste0('//a[@data-page="', i, '"]')
  )
  pg$clickElement()

  # Fetch the rendered page source and parse it.
  # NOTE(review): the source is read immediately after the click, as in the
  # original statement order; if the page has not finished loading this may
  # capture the previous page — consider waiting before reading.
  file <- remDr$getPageSource()[[1]]
  parsed_doc <- htmlParse(file, asText = TRUE, encoding = "UTF-8")

  # Extract the listing fields of this page.
  community <- xpathSApply(parsed_doc, xp_community, xmlValue)
  totalprice <- xpathSApply(parsed_doc, xp_totalprice, xmlValue)
  type_all <- xpathSApply(parsed_doc, xp_houseinfo, xmlValue)
  type <- vapply(type_all, gettp, character(1), j = type_field,
                 USE.NAMES = FALSE)
  size <- vapply(type_all, gettp, character(1), j = size_field,
                 USE.NAMES = FALSE)
  year <- vapply(type_all, gettp, character(1), j = year_field,
                 USE.NAMES = FALSE)
  avgprice <- xpathSApply(parsed_doc, xp_unitprice, xmlValue)
  result <- data.frame(community, totalprice, type, size, year, avgprice)

  # Pause between pages to avoid hammering the site.
  Sys.sleep(2)
  # Append inside the loop so partial results survive if a later page fails.
  # NOTE(review): the rbind() call is reconstructed from the bare `house`.
  house <- rbind(house, result)
  cat("第", i, "页爬成功", "\n")
}