# 利用RCurl包做的一个小爬虫,爬取了京东上电热水器的评论
#利用RCurl抓取京东页面上电热水器的评论
library(RCurl)
library(XML)
library(plyr)
# Build the list of JD.com page URLs to scrape; there are 56 pages in total.
# The page number is spliced into the middle of the URL path.
page <- 1:56
urlist <- paste0(
  "http://club.jd.com/allconsultations/1121567-", page, "-1.html"
)

# Forged request headers, sent with every request so that it resembles a
# request coming from a desktop browser.
myheader <- c(
  "User-Agent" = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)

# Download every page in `urlist` with a single getURL() call, passing the
# forged headers along.
# NOTE(review): the headers advertise GB2312 first while `.encoding` is set to
# "utf-8" -- confirm which encoding the site actually serves before relying on
# the decoded text.
webpage <- getURL(urlist, httpheader = myheader, .encoding = "utf-8")
#解析xml文