网站设置了不允许爬虫时,用 Selenium 模拟浏览器爬取,并禁止加载图片和 css

在爬取某网站的评论时,评论等数据是 ajax 请求后返回的,并且不是很好找;另外有个 eleven 参数(想了解的可以百度"eleven参数")很难搞定(以前搞出过一次)。

所以改用 Selenium,只要注意下面几点即可(目前的技术也只能用这个)。

由于评论是ajax请求后再添加到页面的,就是在客户端上渲染的,若是以前使用requests的get获取到的只能是渲染前的,没有评论数据

Selenium get 后能够 用 page_source 来获取源码,这里的page_source是js在本地渲染完后的,能够直接获取后用xpath解析

缺点:

速度不快(网站有验证,爬太快反而被封)

Selenium 坑总结

浏览器设置无头模式(headless)

设置浏览器不加载图片,不加载css(提升速度)

设置页面加载和脚本执行的超时时间

爬虫代码,加载浏览器部分

# Load the browser.
def loadDriver(self, url="https://hotels.ctrip.com/hotel/479628.html"):
    """Create a headless Firefox driver and warm it up by opening *url*.

    The driver is configured to skip images and stylesheets (we only need
    the rendered DOM for parsing) and is stored on ``self.firefox``.

    Parameters:
        url: warm-up page to open first; defaults to the original
             hard-coded hotel page, so existing callers are unaffected.
    """
    option = webdriver.FirefoxOptions()
    # Headless mode: no visible browser window.
    option.add_argument("-headless")
    # 2 == "block" in Firefox's permissions.default.* preferences.
    # Disable image loading.
    option.set_preference('permissions.default.image', 2)
    # Disable stylesheet loading.
    option.set_preference('permissions.default.stylesheet', 2)
    self.firefox = webdriver.Firefox(options=option)
    # Abort page loads / scripts after 5 seconds instead of hanging
    # forever; a timeout raises an exception and the code continues.
    self.firefox.set_page_load_timeout(5)
    self.firefox.set_script_timeout(5)
    try:
        # A timeout here is expected and raises; the partially rendered
        # page is still usable, so this is deliberately best-effort.
        self.firefox.get(url)
    except Exception:
        pass

这里的 permissions.default.image 参数是能够在火狐浏览器的 config 里找到的,但 permissions.default.stylesheet 这个是没找到的。

a98328b87f4c48d3b44670f231eaa59a.gif

代码里的citys.txt和对应的hotel_id.txt,是以前爬的,这些均可以直接爬,没有什么反爬(city是爬取移动版网页时候的)。

爬虫代码

from selenium import webdriver

import os

import json

import time

import json

from lxml import etree

# Example comment page: http://hotels.ctrip.com/hotel/dianping/374783_p3t1.html
# Comment crawler.
class xc_comment:
    """Crawl the first page of hotel comments from Ctrip.

    Uses a headless Firefox driver (selenium) to render the ajax-loaded
    comment pages and lxml/xpath to parse them.  Output is one JSON line
    per comment, written into a per-city directory.
    """

    def __init__(self):
        pass

    def savestring(self, path, data):
        """Write *data* to *path* as UTF-8 text, overwriting any old file."""
        with open(path, "w", encoding="UTF-8") as fp:
            fp.write(data)

    # Load hotel data: each line of self.hotelfile is a JSON array of hotels.
    def loadhotel(self):
        """Return ``{hotel_id: {name, id, city_name, star}}`` from self.hotelfile."""
        hotels = {}
        with open(self.hotelfile, "r", encoding="UTF-8") as fp:
            for line in fp:
                for item in json.loads(line):
                    hotels[item['id']] = {
                        "name": item["name"],
                        "id": item["id"],
                        "city_name": self.cityname,
                        "star": item["star"],
                    }
        return hotels

    # Load cities: each line of citys.txt is "city_name,city_id".
    def loadCitys(self):
        """Return ``{city_name: city_id}`` parsed from citys.txt."""
        citys = {}
        with open("citys.txt", "r", encoding="UTF-8") as f:
            for line in f:
                parts = line.split(",")
                citys[parts[0].strip()] = parts[1].strip()
        return citys

    # Load the browser.
    def loadDriver(self):
        """Create a headless Firefox driver (no images/CSS) on ``self.firefox``."""
        option = webdriver.FirefoxOptions()
        # Headless mode: no visible browser window.
        option.add_argument("-headless")
        # 2 == "block"; skip images to speed up page loads.
        option.set_preference('permissions.default.image', 2)
        # Skip stylesheets as well -- only the DOM is needed for parsing.
        option.set_preference('permissions.default.stylesheet', 2)
        self.firefox = webdriver.Firefox(options=option)
        # Abort page loads / scripts after 5 seconds instead of hanging
        # forever; a timeout raises an exception and the code continues.
        self.firefox.set_page_load_timeout(5)
        self.firefox.set_script_timeout(5)
        try:
            # Deliberate best-effort warm-up: a timeout raises, but the
            # partially rendered page is still usable.
            self.firefox.get("https://hotels.ctrip.com/hotel/479628.html")
        except Exception:
            pass

    def parseInfo(self, strw):
        """Parse one rendered comment page (*strw* HTML) into a list of dicts."""
        allcomments = []
        # Parse with lxml.
        elements = etree.HTML(strw)
        e_cmt_check_in_dates = elements.xpath("//span[@class='date']/text()")
        e_cmt_devices = elements.xpath("//p[@class='comment_bar_info']/i/@class")
        e_cmt_causes = elements.xpath("//span[@class='type']/text()")
        e_comments = elements.xpath("//div[@class='J_commentDetail']/text()")
        e_cmt_room_types = elements.xpath(
            "//a[contains(@class,'room') and contains(@class,'J_baseroom_link')]/text()")
        # Room prices, excluding the "recommended" row.
        e_prices = elements.xpath(
            "//tr[@class!='tr-recommend last_room']/td/div/span[@class='base_price']/text()")
        prices = [{"LowPrice": price, "RoomTotalNum": 1} for price in e_prices]
        # zip() truncates to the shortest list, silently dropping any
        # comment whose fields did not all match the xpath expressions.
        for cmt_check_in_date, cmt_device, cmt_cause, comment, cmt_room_type in zip(
                e_cmt_check_in_dates, e_cmt_devices, e_cmt_causes,
                e_comments, e_cmt_room_types):
            allcomments.append({
                "cmt_check_in_date": cmt_check_in_date,
                "cmt_device": cmt_device,
                "cmt_cause": cmt_cause,
                "comment": comment,
                "cmt_room_type": cmt_room_type,
                # Note: the same price list is shared by every comment dict.
                "price": prices,
            })
        return allcomments

    def getcomment(self, hotel):
        """Fetch and save the first comment page for one *hotel* dict.

        Skips hotels whose output file already exists and is non-empty.
        """
        # The hotel comment url; _p0t1 means only the first page is crawled.
        url = 'http://hotels.ctrip.com/hotel/dianping/{}_p0t1.html'.format(hotel["id"])
        outpath = os.path.join(self.outdir, hotel["id"] + ".txt")
        print("outpath:", outpath)
        if os.path.exists(outpath) and os.path.getsize(outpath) != 0:
            print("skip :", outpath)
            return
        # Throttle: the site blocks crawlers that request too fast.
        time.sleep(3)
        try:
            # Best-effort: a page-load timeout raises, but the partially
            # rendered page is still parsed below.
            self.firefox.get(url)
        except Exception:
            pass
        # page_source is the DOM after the browser ran the page's js.
        strw = self.firefox.page_source
        self.savestring("html/{}.html".format(hotel["id"]), strw)
        comments = self.parseInfo(strw)
        print("comment len:", len(comments))
        # Open with a context manager so the file is always closed
        # (the original leaked the handle), and only after a successful
        # fetch so failed hotels are retried on the next run.
        with open(outpath, "w", encoding="UTF-8") as file:
            for comment in comments:
                # Dict merge: update() mutates comment in place.
                comment.update(hotel)
                self.savedata(file, comment)

    def savedata(self, file, item):
        """Append *item* to *file* as a single JSON line (UTF-8, not escaped)."""
        print("save:", item)
        file.write(json.dumps(item, ensure_ascii=False))
        file.write("\n")

    def craw(self):
        """Crawl comments for every hotel loaded from ``self.hotelfile``."""
        hotels = self.loadhotel()
        print("酒店数量:", len(hotels))
        for hotel in hotels.values():
            self.getcomment(hotel)

    def start(self):
        """Entry point: set up the driver, then crawl a fixed list of cities."""
        self.loadDriver()
        citys = self.loadCitys()
        craw_citys = ["北京", "南宁", '广州']
        for city in craw_citys:
            print("开始爬取城市:", str(citys[city]))
            self.hotelfile = "hotel/hotel_{}.txt".format(citys[city])
            self.cityname = city
            # One output directory per city, named after the city.
            self.outdir = self.cityname
            if not os.path.exists(self.outdir):
                print("创建输出文件夹")
                os.mkdir(self.outdir)
            self.craw()

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    xc = xc_comment()
    xc.start()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值