使用selenium配合lxml进行爬取携程翻页

链接:https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9

无论页面是动态加载还是静态渲染,我们都用 selenium 把渲染后的源码交给 lxml 进行解析。selenium 的缺点是速度偏慢。由于还要拿到详情页的链接并自行拼接翻页链接,这里就不使用模拟点击翻页的方法了。

在这里插入图片描述

代码实现如下:
from selenium import webdriver
from lxml import etree
import json


class Xiecheng(object):
    """Scrape Ctrip (携程) vacation-package listings with Selenium + lxml.

    Workflow (driven by ``run``):
      1. Render a listing page in headless Chrome (``get_data``).
      2. Parse each product card, then visit its detail page for extra
         fields (``parse_data``).
      3. Append every record as one JSON line to ``携程.json`` (``save_data``).
      4. Follow the pagination link until there is no next page (``next_page``).
    """

    def __init__(self):
        # Current listing-page URL; advanced to the next page by next_page().
        self.url = 'https://vacations.ctrip.com/list/whole/sc475.html?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9'
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(
            executable_path='C:\\Users\\Administrator\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe',
            options=options)
        # Set the implicit wait once; it applies to every later page load /
        # element lookup (the original re-set it after each driver.get()).
        self.driver.implicitly_wait(10)

    def _render(self, url):
        """Load *url*, scroll to the bottom to trigger lazy-loaded content,
        and return the rendered page source."""
        self.driver.get(url)
        self.driver.execute_script('var q=document.documentElement.scrollTop=10000')
        return self.driver.page_source

    def get_data(self):
        """Return the rendered HTML of the current listing page."""
        return self._render(self.url)

    def parse_data(self, data):
        """Parse one listing page.

        For every product card, extract the summary fields, then fetch its
        detail page and extract the detail fields.  Returns a list of dicts
        (possibly empty on the last page).
        """
        html = etree.HTML(data)
        node_list = html.xpath('//div[@class="list_product_item_border"]')
        data_list = []
        for node in node_list:
            temp = {}
            temp['标题'] = node.xpath('./div/div[2]/p/span/text()')
            id_list = node.xpath('./parent::div[@class="list_product_box js_product_item"]/@data-track-product-id')
            if not id_list:
                # No product id -> no detail URL can be built; the original
                # code raised KeyError here.  Skip the card instead.
                continue
            # Original looped over id_list keeping only the last assignment.
            temp['详情页链接'] = 'https://vacations.ctrip.com/travel/detail/p' + id_list[-1] + '/?city=475'
            price = node.xpath(
                './div/div[2]/div/div[1]/div[2]/span/strong/text()|./div/div[2]/div/div[1]/div/span/strong/text()')
            if price:
                temp['价格'] = '¥' + price[-1]
            temp['描述信息'] = node.xpath('./div/div[2]/div/div[2]/div[2]/div/p/text()')
            temp['供应商'] = node.xpath('./div/div[2]/div/div[2]/p/text()')
            # Render the detail page (do NOT clobber self.url: it must keep
            # pointing at the listing page for pagination).
            hr = etree.HTML(self._render(temp['详情页链接']))
            temp['详情描述'] = hr.xpath('//p[@class="detail_title_subhead"]/text()')
            temp['服务'] = hr.xpath('//p[@class="detail_title_subhead"]/text()')
            temp['特色'] = hr.xpath('//div[@class="rich_content_view_20191129 '
                                    'detail_description_content_view"]/p/text()')
            temp['日程安排'] = hr.xpath('//div[@class="day_title"]/div[2]/text()')
            temp['景点描述'] = hr.xpath('//div[@id="grp-103047-schedule-poi-0"]/div/text()')
            temp['酒店'] = hr.xpath('//a[@class="itinerary_hotel_item js_Expose_Point js_mapPointHook"]/text()')
            data_list.append(temp)
        return data_list

    def next_page(self, data):
        """Advance ``self.url`` to the next listing page.

        Returns the next page number (str) or None when there is no further
        page.  The original used a bare ``except: pass`` to detect the end;
        an explicit empty-result check replaces it.
        """
        html = etree.HTML(data)
        pages = html.xpath('//*[@id="root"]/div/div[1]/div[8]/div[31]/a[last()]/@data-page')
        if not pages:
            return None
        page_id = pages[0]
        self.url = ('https://vacations.ctrip.com/list/whole/sc475.html'
                    '?st=%E5%91%A8%E8%BE%B9&startcity=475&sv=%E5%91%A8%E8%BE%B9&p=' + page_id)
        print(self.url)
        return page_id

    def save_data(self, data_list):
        """Append each record to 携程.json as one JSON object per line.

        The file is opened once per page (the original reopened it for every
        record).  Text mode + utf-8 matches the original bytes output.
        """
        with open('携程.json', 'a', encoding='utf-8') as f:
            for item in data_list:
                f.write(json.dumps(item, ensure_ascii=False) + ',\n')

    def run(self):
        """Crawl listing pages until the last one, always quitting Chrome."""
        try:
            while True:
                data = self.get_data()
                data_list = self.parse_data(data)
                if not data_list:
                    # Empty page -> nothing left to scrape.  (parse_data can
                    # never return None, so the original `is None` check was dead.)
                    print('空值')
                    break
                self.save_data(data_list)
                if self.next_page(data) is None:
                    break
        finally:
            # Release the browser process even when parsing raises.
            self.driver.quit()


if __name__ == '__main__':
    # Entry point: build the scraper and crawl every listing page.
    Xiecheng().run()

运行效果如下:

在这里插入图片描述

保存的数据是json,数据也方便使用
代码实现比较简单,如果有不明白的留言或私信我!!!!!
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值