R网络爬虫之酒店团购

最新推荐文章于 2024-03-01 19:02:09 发布

honeyasong

最新推荐文章于 2024-03-01 19:02:09 发布

阅读量1.3k

点赞数

分类专栏： R 文章标签： R 网络爬虫

本文链接：https://blog.csdn.net/asongsongsong/article/details/45562687

版权

R 专栏收录该内容

73 篇文章 3 订阅

订阅专栏

本文使用 R 的 RCurl 和 XML 包，抓取某团购网站西安地区前 n 页的酒店团购信息。

library(RCurl)
library(XML)
# HTTP request headers that make the crawler present itself as a browser;
# some servers restrict requests that do not look like they come from one.
myheader <- c(
  "User-Agent" = "Mozilla/4.0(compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0) ",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)

# pageIndex values to request; one listing URL is built per value.
page <- 0:10
# paste0() is vectorised, so this yields exactly one URL per entry of `page`.
# Assigning through `urllist[page]` instead would drop index 0, leaving ten
# slots for eleven values: R warns and the last URL is silently lost.
urllist <- paste0(
  "http://t.dianping.com/hotel/xian-category_4&pageIndex=",
  page
)
# Counter used only by the (commented-out) one-file-per-page output below.
jishuqi <- 0
# Accumulators for the combined result. They start as NULL rather than 0 so
# that rbind() onto them does not create a spurious all-zero first row.
total_1 <- NULL
total_2 <- NULL
# ---- Field extraction helpers ----

# Extract part of the `which`-th match of `pattern` from each element of
# `text`.
#
# text        character vector to search.
# pattern     regular expression passed unchanged to gregexpr().
# which       index of the match to use (1 = first match, 2 = second, ...).
# skip_start  number of leading characters of the match to drop.
# skip_end    number of trailing characters of the match to drop.
# missing     value returned for elements that have no `which`-th match.
#
# Returns a character vector of the same length as `text`.
extract_match <- function(text, pattern, which = 1L,
                          skip_start = 0L, skip_end = 0L,
                          missing = NA_character_) {
  hits <- gregexpr(pattern, text)
  start <- vapply(
    hits,
    function(h) if (isTRUE(h[which] > 0L)) as.integer(h[which]) else NA_integer_,
    integer(1)
  )
  len <- vapply(
    hits,
    function(h) as.integer(attr(h, "match.length")[which]),
    integer(1)
  )
  out <- substring(text, start + skip_start, start + len - 1L - skip_end)
  out[is.na(start)] <- missing
  out
}

# Extract the detail text that follows the "]" + newline closing the tip.
# A window of up to 101 characters after that anchor is taken and cut at the
# first newline + tab. When no such terminator falls inside the window the
# whole window is kept (instead of being discarded), and when the anchor is
# missing an empty string is returned (instead of unrelated leading text).
extract_remark <- function(text) {
  anchor <- as.integer(regexpr("]\\n", text))
  window <- substring(text, anchor + 2L, anchor + 102L)
  cut <- as.integer(regexpr("\\n\\t", window))
  has_cut <- !is.na(cut) & cut > 0L
  window[has_cut] <- substring(window[has_cut], 1L, cut[has_cut] - 1L)
  window[is.na(anchor) | anchor < 0L] <- ""
  window
}

# Turn the raw text of the listing items of one page into a data frame with
# one row per item and the columns
#   a = hotel name, b = tip shown in [brackets], c = detail text,
#   x = discounted price, y = original price, d = number sold,
#   e = rating, f = number of reviews.
# Numeric fields that cannot be found are NA; text fields are "".
parse_hotel_items <- function(items) {
  # NOTE(review): the class "[N ]" removes capital "N" as well as spaces;
  # kept exactly as in the original script -- confirm whether only spaces
  # were meant to be stripped.
  text <- gsub("([N ])", "", items)

  # Name: tab, optional digit, CJK characters, ending in one of
  # \u5e97 \u9986 \u95f4 \u623f \u820d. The "|" separators previously inside
  # the character class were literal pipes, not alternation, and are removed.
  hotel_name <- extract_match(
    text,
    "\\t[0-9]?[\u4e00-\u9fa5]+[0-9]*[\u4e00-\u9fa5]*[\u5e97\u9986\u95f4\u623f\u820d]",
    skip_start = 1L, missing = ""
  )

  # Tip: text between "[" and "]" (greedy, as in the original pattern).
  tip <- extract_match(
    text, "\\[.+\\]",
    skip_start = 1L, skip_end = 1L, missing = ""
  )

  remark <- extract_remark(text)

  # Sold count: digits following \u5df2\u552e.
  sold <- as.numeric(
    extract_match(text, "[\u5df2][\u552e][0-9]+", skip_start = 2L)
  )

  # Prices: digits after the \u00A5 sign; the first match is taken as the
  # discounted price and the second as the original price.
  price_pattern <- "[\u00A5][0-9]+"
  discounted <- as.numeric(
    extract_match(text, price_pattern, which = 1L, skip_start = 1L)
  )
  original <- as.numeric(
    extract_match(text, price_pattern, which = 2L, skip_start = 1L)
  )

  # Rating: the three characters ending at the digit right before \u5206
  # (e.g. "4.5"); values that are not numeric become NA with a warning.
  grade_pos <- as.integer(regexpr("[0-9][\u5206]", text))
  grade <- substring(text, grade_pos - 2L, grade_pos)
  grade[is.na(grade_pos) | grade_pos < 0L] <- NA_character_
  grade <- as.numeric(grade)

  # Review count: digits before \u6761\u8bc4.
  reviews <- as.numeric(
    extract_match(text, "[0-9]+[\u6761][\u8bc4]", skip_end = 2L)
  )

  data.frame(
    a = hotel_name, b = tip, c = remark,
    x = discounted, y = original,
    d = sold, e = grade, f = reviews,
    stringsAsFactors = FALSE
  )
}

# ---- Crawl every listing page in `urllist` ----

# One slot per page; pages that fail to download or contain no items stay
# NULL and are dropped before the results are combined.
page_results <- vector("list", length(urllist))
for (idx in seq_along(urllist)) {
  page_url <- urllist[idx]

  # A failed request is reported and skipped so that the data already
  # collected from other pages is not lost.
  html <- tryCatch(
    getURL(page_url, httpheader = myheader, encoding = "UTF-8"),
    error = function(err) {
      warning(
        "Failed to download ", page_url, ": ", conditionMessage(err),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(html)) {
    next
  }

  doc <- htmlParse(html, asText = TRUE)
  # Each <li class="tg-floor-item"> node is one listing item; xmlValue()
  # returns its text content as a single string.
  items <- vapply(
    getNodeSet(doc, '//li[@class="tg-floor-item"]'),
    xmlValue,
    character(1)
  )
  if (length(items) == 0L) {
    next
  }

  info <- parse_hotel_items(items)
  page_results[[idx]] <- info

  # Optional: write each page to its own file.
  # wenjianming=paste(jishuqi,".txt",sep="")
  # write.table(info,wenjianming,quote=FALSE)
  # jishuqi=jishuqi+1

  # Optional: append every page to one file.
  # write.table(info,"hotle.txt",quote=FALSE,append=TRUE)
}

# Combine all pages once, after the loop, instead of growing a data frame on
# every iteration. total_2 holds all rows; total_1 is kept as an alias because
# the original script defined both.
page_results <- Filter(Negate(is.null), page_results)
total_2 <- do.call(rbind, page_results)
total_1 <- total_2

# fix() opens an interactive data editor, so only call it when a user is
# present and there is something to show.
if (interactive() && !is.null(total_2)) {
  fix(total_2)
}
#write.table(total_2,"hotle.txt",quote=FALSE,append=TRUE)

honeyasong

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
R网络爬虫之酒店团购

此次抓取的为某团购网站西安地区的第一页酒店信息。有一个问题就是在R中，像酒店名称为中文，但写入.txt之后就会乱码（unicode），不知怎么解决这个问题。 library(RCurl)library(XML)myheader=c( "User-Agent"="Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.
复制链接

扫一扫