While scraping the comments of a certain site, I found that the comments (and some other data) come back from AJAX requests, and those requests are not easy to reproduce directly, because of the hard-to-crack eleven parameter (search Baidu for "eleven参数" if you're curious; I did crack it once before).
- Selenium was, as far as my current skills go, the only workable approach here; the points below explain why
- Because the comments are fetched by AJAX and then added to the page, i.e. rendered on the client, a plain requests GET only returns the pre-render HTML, with no comment data in it
- After a Selenium get you can read page_source, which is the source after the JS has finished rendering locally, so you can grab it and parse it with XPath right away (see the sketch after this list)
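To make the difference concrete, here is a small comparison, assuming requests is installed (the URL is the hotel page used later in loadDriver):

import requests
from selenium import webdriver

url = "https://hotels.ctrip.com/hotel/479628.html"

# requests only gets the HTML as the server sent it, before any JS runs,
# so the comment nodes are missing from this string
html_before = requests.get(url).text

# Selenium drives a real browser, so page_source is the DOM
# after the client-side rendering has finished
driver = webdriver.Firefox()
driver.get(url)
html_after = driver.page_source
driver.quit()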
Drawbacks:
- It's not fast (the site has verification, and crawling too fast gets you blocked anyway)
Selenium pitfalls, summarized:
- Run the browser in headless mode
- Stop the browser from loading images and CSS (for speed)
- Set timeouts for page loads and script execution
Crawler code, the part that starts the browser:
# start the browser
def loadDriver(self):
    option = webdriver.FirefoxOptions()
    # headless mode
    option.add_argument("-headless")
    # do not load images
    option.set_preference('permissions.default.image', 2)
    # do not load CSS stylesheets
    option.set_preference('permissions.default.stylesheet', 2)
    self.firefox = webdriver.Firefox(options=option)
    # page-load timeout: past this an exception is thrown and the code
    # after it runs; without it get() can hang on a page that never finishes loading
    self.firefox.set_page_load_timeout(5)
    self.firefox.set_script_timeout(5)
    try:
        # the timeout throws, so wrap the get in a try
        self.firefox.get("https://hotels.ctrip.com/hotel/479628.html")
    except Exception:
        pass
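One more pitfall worth noting: because the 5-second timeout deliberately cuts loading short, the comment nodes may not be in the DOM yet when page_source is read. A minimal sketch of an explicit wait that could be dropped in after the get (J_commentDetail is the class the XPath below relies on; the 10-second limit is an arbitrary choice of mine):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

try:
    # block until at least one comment body is in the DOM,
    # or give up after 10 seconds
    WebDriverWait(self.firefox, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "J_commentDetail")))
except Exception:
    pass  # fall through and parse whatever did render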
The permissions.default.image preference here can be found in Firefox's about:config, but permissions.default.stylesheet I could not find there.
The citys.txt and matching hotel_id.txt files used by the code were scraped earlier; both can be fetched directly, with no anti-scraping in the way (citys came from scraping the mobile version of the site). The line formats the loaders expect are sketched below.
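For reference, this is what loadCitys and loadhotel below assume (the concrete values here are made up for illustration; the field names match what the code reads):

citys.txt — one "city_name,city_id" pair per line:

北京,1
广州,32

hotel/hotel_<city_id>.txt — each line is a JSON array of hotel objects:

[{"id": "479628", "name": "some hotel", "star": "5"}]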
Crawler code:
from selenium import webdriver
import os
import json
import time
from lxml import etree

# http://hotels.ctrip.com/hotel/dianping/374783_p3t1.html
# comment crawler
class xc_comment:
    def __init__(self):
        pass

    def savestring(self, path, data):
        with open(path, "w", encoding="UTF-8") as fp:
            fp.write(data)

    # load the hotel data; each line is a JSON array
    def loadhotel(self):
        hotels = {}
        with open(self.hotelfile, "r", encoding="UTF-8") as fp:
            for line in fp:
                items = json.loads(line)
                for item in items:
                    hotels[item['id']] = {"name": item["name"],
                                          "id": item["id"],
                                          "city_name": self.cityname,
                                          "star": item["star"],
                                          }
        return hotels

    # load the city list
    def loadCitys(self):
        citys = {}
        with open("citys.txt", "r", encoding="UTF-8") as f:
            for line in f:
                line = line.split(",")
                citys[line[0].strip()] = line[1].strip()
        return citys

    # start the browser
    def loadDriver(self):
        option = webdriver.FirefoxOptions()
        # headless mode
        option.add_argument("-headless")
        # do not load images
        option.set_preference('permissions.default.image', 2)
        # do not load CSS stylesheets
        option.set_preference('permissions.default.stylesheet', 2)
        self.firefox = webdriver.Firefox(options=option)
        # page-load timeout: past this an exception is thrown and the code
        # after it runs; without it get() can hang forever
        self.firefox.set_page_load_timeout(5)
        self.firefox.set_script_timeout(5)
        try:
            # the timeout throws, so wrap the get in a try
            self.firefox.get("https://hotels.ctrip.com/hotel/479628.html")
        except Exception:
            pass

    def parseInfo(self, strw):
        allcomments = []
        # parse with lxml
        elements = etree.HTML(strw)
        e_cmt_check_in_dates = elements.xpath("//span[@class='date']/text()")
        e_cmt_devices = elements.xpath("//p[@class='comment_bar_info']/i/@class")
        e_cmt_causes = elements.xpath("//span[@class='type']/text()")
        e_comments = elements.xpath("//div[@class='J_commentDetail']/text()")
        e_cmt_room_types = elements.xpath("//a[contains(@class,'room') and contains(@class,'J_baseroom_link')]/text()")
        # rooms besides the recommended ones
        e_prices = elements.xpath("//tr[@class!='tr-recommend last_room']/td/div/span[@class='base_price']/text()")
        prices = []
        for price in e_prices:
            prices.append({"LowPrice": price, "RoomTotalNum": 1})
        for cmt_check_in_date, cmt_device, cmt_cause, comment, cmt_room_type in zip(
                e_cmt_check_in_dates, e_cmt_devices, e_cmt_causes, e_comments, e_cmt_room_types):
            result = {}
            result["cmt_check_in_date"] = cmt_check_in_date
            result["cmt_device"] = cmt_device
            result["cmt_cause"] = cmt_cause
            result["comment"] = comment
            result["cmt_room_type"] = cmt_room_type
            result["price"] = prices
            allcomments.append(result)
        return allcomments

    def getcomment(self, hotel):
        # comment URL for the hotel; this only fetches the first page of comments
        url = 'http://hotels.ctrip.com/hotel/dianping/{}_p0t1.html'.format(hotel["id"])
        outpath = os.path.join(self.outdir, hotel["id"] + ".txt")
        print("outpath:", outpath)
        if os.path.exists(outpath) and os.path.getsize(outpath) != 0:
            print("skip :", outpath)
            return
        time.sleep(3)
        try:
            self.firefox.get(url)
        except Exception:
            pass
        # grab the rendered page source
        strw = self.firefox.page_source
        self.savestring("html/{}.html".format(hotel["id"]), strw)
        comments = self.parseInfo(strw)
        print("comment len:", len(comments))
        with open(outpath, "w", encoding="UTF-8") as file:
            for comment in comments:
                # dict merge; update() modifies comment in place
                comment.update(hotel)
                self.savedata(file, comment)

    def savedata(self, file, item):
        print("save:", item)
        strw = json.dumps(item, ensure_ascii=False)
        file.write(strw)
        file.write("\n")

    def craw(self):
        hotels = self.loadhotel()
        print("number of hotels:", len(hotels))
        for key, value in hotels.items():
            self.getcomment(value)

    def start(self):
        self.loadDriver()
        citys = self.loadCitys()
        # the raw pages are saved under html/, make sure the directory exists
        os.makedirs("html", exist_ok=True)
        craw_citys = ["北京", "南宁", "广州"]
        for city in craw_citys:
            print("crawling city:", str(citys[city]))
            self.hotelfile = "hotel/hotel_{}.txt".format(citys[city])
            self.cityname = city
            # output directory, one per city
            self.outdir = self.cityname
            if not os.path.exists(self.outdir):
                print("creating the output directory")
                os.mkdir(self.outdir)
            self.craw()

if __name__ == "__main__":
    xc = xc_comment()
    xc.start()
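Since getcomment only fetches _p0t1, extending it to more pages mostly means looping over the number between _p and t1 (the URL in the comment at the top of the script shows _p3t1 for a later page). A rough sketch of such a method, not part of the original script; max_pages is an arbitrary cap, and the assumption that an empty page marks the end is mine:

def getcomments_paged(self, hotel, max_pages=5):
    all_comments = []
    for page in range(max_pages):
        # follow the _p{page}t1 URL pattern seen above
        url = 'http://hotels.ctrip.com/hotel/dianping/{}_p{}t1.html'.format(hotel["id"], page)
        try:
            self.firefox.get(url)
        except Exception:
            pass
        comments = self.parseInfo(self.firefox.page_source)
        if not comments:
            break  # assume no parsed comments means no more pages
        all_comments.extend(comments)
        time.sleep(3)  # keep the same throttle as getcomment
    return all_comments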