Selenium 模拟浏览器爬虫禁止加载图片和 css

最新推荐文章于 2023-05-26 13:32:37 发布

FormatFa

最新推荐文章于 2023-05-26 13:32:37 发布

阅读量7.4k

点赞数 3

分类专栏： Python爬虫

本文链接：https://blog.csdn.net/FormatFa/article/details/88680303

版权

Python爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

在爬取某网站的评论时，评论和其他一些数据是通过ajax请求加载的，而且这些请求不是很好找，因为其中有个eleven参数（想了解的可以百度"eleven参数"）很难搞（之前搞出过一次）。

用Selenium的原因主要有下面几点（以我目前的技术也只能用这个）：

因为评论是ajax请求后再添加到页面的，就是在客户端上渲染的，如果之前使用requests的get获取到的只能是渲染前的，没有评论数据
Selenium get 后可以用 page_source 来获取源码，这里的page_source是js在本地渲染完后的,可以直接获取后用xpath解析

缺点:

速度不快（网站有验证，爬太快反而被封）

Selenium 坑总结

浏览器设置无头模式(headless)
设置浏览器不加载图片，不加载css（提高速度）
设置页面加载和脚本执行的超时时间

爬虫代码，加载浏览器部分

 #加载浏览器
    def loadDriver(self):
        """Start a headless Firefox tuned for fast scraping and warm it up.

        The browser is stored on ``self.firefox``. Images and CSS are
        disabled through Firefox preferences to speed up page loads, and
        page-load / script timeouts are set so a slow page raises instead
        of blocking forever.

        NOTE(review): the article text following this snippet says that
        ``permissions.default.stylesheet`` could not be found in
        about:config -- confirm it has an effect on the Firefox in use.
        """
        option = webdriver.FirefoxOptions()
        # Headless mode: run without a visible browser window.
        option.add_argument("-headless")
        # Do not load images.
        option.set_preference('permissions.default.image', 2)
        # Do not load CSS style sheets.
        option.set_preference('permissions.default.stylesheet', 2)
        self.firefox = webdriver.Firefox(options=option)
        # Page-load timeout: once exceeded, get() raises so the code after
        # it can run instead of hanging on a page that never finishes.
        self.firefox.set_page_load_timeout(5)
        self.firefox.set_script_timeout(5)
        try:
            # A timeout raises here; the warm-up visit is best-effort.
            self.firefox.get("https://hotels.ctrip.com/hotel/479628.html")
        except Exception as e:
            # Deliberately not re-raised, but reported so that a real
            # failure (not just a timeout) is visible in the output.
            print("warm-up page load interrupted:", e)

这里的 permissions.default.image 参数是可以在火狐浏览器的 about:config 里找到的，但 permissions.default.stylesheet 这个参数在里面没有找到。

代码里的citys.txt和对应的hotel_id.txt，是之前爬的，这些都可以直接爬，没有什么反爬（city是爬取移动版网页时候的）

爬虫代码

from selenium import webdriver
import os
import json
import time
import json
from lxml import etree



#http://hotels.ctrip.com/hotel/dianping/374783_p3t1.html



#评论爬取
class xc_comment:
    """Crawl the first review page of Ctrip hotels with headless Firefox.

    ``start`` drives the whole run: it launches the browser, then for each
    configured city reads ``hotel/hotel_<code>.txt`` and writes one
    JSON-lines file of reviews per hotel into a directory named after the
    city. The raw HTML of every visited page is also saved under ``html/``.
    """

    def __init__(self):
        # All of these are filled in by start() / loadDriver(); they are
        # declared here so the state the other methods read is visible.
        self.firefox = None
        self.hotelfile = None
        self.cityname = None
        self.outdir = None

    def savestring(self, path, data):
        """Write the string ``data`` to ``path`` as UTF-8, overwriting it."""
        with open(path, "w", encoding="UTF-8") as fp:
            fp.write(data)

    def loadhotel(self):
        """Load the hotels of the current city from ``self.hotelfile``.

        Each non-blank line of the file is a JSON array of hotel objects
        carrying at least ``id``, ``name`` and ``star``.

        Returns:
            dict mapping hotel id to a dict with the keys ``name``, ``id``,
            ``city_name`` (taken from ``self.cityname``) and ``star``.
        """
        hotels = {}
        # NOTE(review): the file is read with the platform default encoding,
        # as before; confirm which encoding the hotel files were written in.
        with open(self.hotelfile, "r") as fp:
            for line in fp:
                # Blank lines (e.g. a trailing newline) are not valid JSON.
                if not line.strip():
                    continue
                for item in json.loads(line):
                    hotels[item["id"]] = {
                        "name": item["name"],
                        "id": item["id"],
                        "city_name": self.cityname,
                        "star": item["star"],
                    }
        return hotels

    def loadCitys(self):
        """Load the city table from ``citys.txt`` in the working directory.

        Every line is ``<first field>,<second field>[,...]``; lines with
        fewer than two comma-separated fields (such as blank lines) are
        skipped.

        Returns:
            dict mapping the stripped first field to the stripped second
            field of each line.
        """
        citys = {}
        with open("citys.txt", "r") as f:
            for line in f:
                fields = line.split(",")
                if len(fields) < 2:
                    continue
                citys[fields[0].strip()] = fields[1].strip()
        return citys

    def loadDriver(self):
        """Start a headless Firefox tuned for fast scraping and warm it up.

        The browser is stored on ``self.firefox``. Images and CSS are
        disabled through Firefox preferences, and page-load / script
        timeouts are set so a slow page raises instead of blocking forever.

        NOTE(review): confirm that the Firefox version in use still honors
        ``permissions.default.stylesheet``.
        """
        option = webdriver.FirefoxOptions()
        # Headless mode: run without a visible browser window.
        option.add_argument("-headless")
        # Do not load images.
        option.set_preference('permissions.default.image', 2)
        # Do not load CSS style sheets.
        option.set_preference('permissions.default.stylesheet', 2)
        self.firefox = webdriver.Firefox(options=option)
        # Page-load timeout: once exceeded, get() raises so the code after
        # it can run instead of hanging on a page that never finishes.
        self.firefox.set_page_load_timeout(5)
        self.firefox.set_script_timeout(5)
        try:
            self.firefox.get("https://hotels.ctrip.com/hotel/479628.html")
        except Exception as e:
            # Best-effort warm-up: report the problem but keep going.
            print("warm-up page load interrupted:", e)

    def parseInfo(self, strw):
        """Extract the reviews from the HTML source of a review page.

        Args:
            strw: page source as a string.

        Returns:
            list of dicts, one per review, with the keys
            ``cmt_check_in_date``, ``cmt_device``, ``cmt_cause``,
            ``comment``, ``cmt_room_type`` and ``price``. ``price`` is the
            same list object for every review of the page: one
            ``{"LowPrice": ..., "RoomTotalNum": 1}`` entry per price found.
        """
        # Parse with lxml.
        elements = etree.HTML(strw)

        e_cmt_check_in_dates = elements.xpath("//span[@class='date']/text()")
        # The device is read from the class attribute of the <i> element.
        e_cmt_devices = elements.xpath("//p[@class='comment_bar_info']/i/@class")
        e_cmt_causes = elements.xpath("//span[@class='type']/text()")
        e_comments = elements.xpath("//div[@class='J_commentDetail']/text()")
        e_cmt_room_types = elements.xpath("//a[contains(@class,'room') and contains(@class,'J_baseroom_link')]/text()")

        # Prices of every row except the recommended one.
        e_prices = elements.xpath("//tr[@class!='tr-recommend last_room']/td/div/span[@class='base_price']/text()")

        prices = [{"LowPrice": price, "RoomTotalNum": 1} for price in e_prices]

        # zip() pairs the columns by position and stops at the shortest
        # one, so a review missing any field shortens/misaligns the result.
        allcomments = []
        for check_in_date, device, cause, comment, room_type in zip(
                e_cmt_check_in_dates, e_cmt_devices, e_cmt_causes,
                e_comments, e_cmt_room_types):
            allcomments.append({
                "cmt_check_in_date": check_in_date,
                "cmt_device": device,
                "cmt_cause": cause,
                "comment": comment,
                "cmt_room_type": room_type,
                "price": prices,
            })
        return allcomments

    def getcomment(self, hotel):
        """Fetch, parse and store the first review page of one hotel.

        Nothing is done when ``<self.outdir>/<id>.txt`` already exists and
        is non-empty. Otherwise the page is loaded (after a 3 second
        pause), its source is saved to ``html/<id>.html``, and every parsed
        review, merged with the hotel's own fields, is written as one JSON
        line to the output file (UTF-8).

        Args:
            hotel: dict as produced by ``loadhotel``; must contain ``id``.
        """
        # str() so that numeric ids from the JSON input work as well.
        hotel_id = str(hotel["id"])
        # URL of the hotel's review page; only the first page is crawled.
        url = 'http://hotels.ctrip.com/hotel/dianping/{}_p0t1.html'.format(hotel_id)
        outpath = os.path.join(self.outdir, hotel_id + ".txt")
        print("outpath:", outpath)

        if os.path.exists(outpath) and os.path.getsize(outpath) != 0:
            print("skip :", outpath)
            return

        time.sleep(3)
        try:
            self.firefox.get(url)
        except Exception as e:
            # Typically the page-load timeout: whatever has been rendered
            # so far is still used below, so this is reported, not raised.
            print("page load interrupted:", url, e)

        # Page source after client-side rendering.
        strw = self.firefox.page_source
        # savestring() does not create directories itself.
        os.makedirs("html", exist_ok=True)
        self.savestring("html/{}.html".format(hotel_id), strw)
        comments = self.parseInfo(strw)
        print("comment len:", len(comments))

        # The context manager closes the file for every hotel; the explicit
        # encoding keeps the non-ASCII JSON writable on any platform.
        with open(outpath, "w", encoding="UTF-8") as file:
            for comment in comments:
                # dict.update merges in place, adding the hotel fields.
                comment.update(hotel)
                self.savedata(file, comment)

    def savedata(self, file, item):
        """Append ``item`` to ``file`` as one JSON line (non-ASCII kept).

        Args:
            file: writable text file object.
            item: JSON-serializable object.
        """
        print("save:", item)
        file.write(json.dumps(item, ensure_ascii=False) + "\n")

    def craw(self):
        """Crawl the reviews of every hotel of the current city."""
        hotels = self.loadhotel()
        print("酒店数量:", len(hotels))
        for hotel in hotels.values():
            self.getcomment(hotel)

    def start(self):
        """Run the crawl for the hard-coded list of cities.

        Launches the browser, crawls each city into a directory named
        after it (created when missing), and always quits the browser at
        the end so no Firefox process is left behind.
        """
        self.loadDriver()
        try:
            citys = self.loadCitys()
            craw_citys = ["北京", "南宁", '广州']
            for city in craw_citys:
                print("开始爬取城市:", str(citys[city]))
                self.hotelfile = "hotel/hotel_{}.txt".format(citys[city])
                self.cityname = city
                # Output directory for this city.
                self.outdir = self.cityname
                if not os.path.exists(self.outdir):
                    print("建立输出文件夹")
                    os.mkdir(self.outdir)
                self.craw()
        finally:
            self.firefox.quit()


# Script entry: build the crawler and run the whole crawl immediately.
# These statements execute at import time as well as when run as a script.
xc = xc_comment()

xc.start()