Using Selenium

The anti-scraping on the xxx site is quite good: going the usual requests route, the crawler gets flagged as soon as anything in the request headers looks off. With Selenium, though, we can let a real browser render the page and pull the page source straight out of it, so that's the plan.
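
Besides the ChromeOptions flags used in the crawler class below, a common extra anti-detection step is to hide the navigator.webdriver flag through the Chrome DevTools Protocol before the first page load. A minimal sketch, with make_stealth_driver as a hypothetical helper that is not part of the original code:

from selenium import webdriver

def make_stealth_driver(driver_path):
    # Same anti-detection options as the crawler class below
    option = webdriver.ChromeOptions()
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(executable_path=driver_path, options=option)
    # Extra (assumed) step: make navigator.webdriver read as undefined on every new document
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    })
    return driver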

I scraped the listings for Python positions.

import random
import time
from threading import Thread

import pymysql
from lxml import etree
from selenium import webdriver


class LaGou(object):
    # Anti-detection options for Chrome
    option = webdriver.ChromeOptions()
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver_path = r'E:/chromedriver/chromedriver.exe'

    # Constructor
    def __init__(self, url):
        self.driver = webdriver.Chrome(executable_path=LaGou.driver_path, options=LaGou.option)
        self.url = url
        self.__host = 'localhost'
        self.__port = 3306
        self.__user = 'root'
        self.__password = 'xxxxx'
        self.__db = 'xxxx'
        self.__charset = 'utf8'

    # Connect to MySQL
    def connect(self):
        self.__conn = pymysql.connect(
            host=self.__host,
            port=self.__port,
            user=self.__user,
            password=self.__password,
            db=self.__db,
            charset=self.__charset
        )

First, request the listing page with Selenium, then parse the detail-page URLs out of it.

    # Request the listing page
    def get_html(self):
        page_list = []
        self.driver.get(self.url)

        time.sleep(random.randint(1, 3))
        for page in range(1, 2):
            # Click the pager element via JavaScript, then grab the rendered page source
            a = self.driver.find_element_by_xpath('//*[@id="s_position_list"]/div[2]/div/span[6]')
            self.driver.execute_script('arguments[0].click();', a)
            time.sleep(random.randint(1, 3))
            page_text = self.driver.page_source
            page_list.append(page_text)
        return page_list

    # Parse the detail-page URLs out of the listing page
    def parse_detail_url(self, html):
        tree = etree.HTML(html)
        # Every job card <li> carries a link to its detail page
        detail_url = tree.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/@href')
        return detail_url
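
The fixed time.sleep calls work, but they are fragile. An explicit wait that blocks until the listing container actually exists is usually more reliable; a minimal sketch, assuming the same s_position_list container and a hypothetical wait_for_listing helper:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_listing(driver, timeout=10):
    # Block until the job-listing container is present in the DOM, or raise TimeoutException
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, 's_position_list'))
    )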

Once the detail-page URLs have been parsed out, use Selenium again to request each detail page and pull down its data.

    # Request a detail page
    def get_detail_data(self, url):
        detail_data_list = []
        self.driver.get(url)
        time.sleep(random.randint(1, 3))
        detail_data = self.driver.page_source
        detail_data_list.append(detail_data)
        return detail_data_list

    # Parse the fields we want out of a detail page
    def parse_detail_data(self, html):
        detail_data = etree.HTML(html)
        div_list = detail_data.xpath('//*[@id="__next"]/div[2]')
        for all_data in div_list:
            # Position name
            position = all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[1]/text()')[0]
            # Salary
            pay = all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[2]/text()')[0]
            # Required experience
            experience = all_data.xpath('./div[1]/div/div[1]/dd/h3/span[2]/text()')[0]
            # Education
            education = all_data.xpath('./div[1]/div/div[1]/dd/h3/span[3]/text()')[0].replace('/', '')
            # Job category
            job_title = all_data.xpath('./div[1]/div/div[1]/dd/h3/div/span[2]/text()')[0]
            # Job description
            job_description = all_data.xpath('./div[2]/div[1]/dl[1]/dd[2]/div/text()')
            t_job_description = ''.join(job_description).replace('\n', '')

            all_data_dict = {
                'position': position,
                'pay': pay,
                'experience': experience,
                'education': education,
                'job_title': job_title,
                't_job_description': t_job_description
            }
            self.save_mysql(position, pay, experience, education, job_title, t_job_description)

Since xpath() always returns a list, we just take element [0] to get the text node we want.
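
Indexing with [0] does raise an IndexError whenever an XPath matches nothing (for example after a layout change), so a small guard can make the parser more forgiving. A sketch with a hypothetical first_or_default helper, not part of the original code:

def first_or_default(nodes, default=''):
    # xpath() returns a list; take its first item, or a default when nothing matched
    return nodes[0] if nodes else default

# e.g. position = first_or_default(all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[1]/text()'))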

Save the scraped data to MySQL.

    # Persist a record to MySQL
    def save_mysql(self, position, pay, experience, education, job_title, t_job_description):
        self.connect()
        cursor = self.__conn.cursor()
        sql = ('insert into python_lagou(position, pay, experience, education, job_title, t_job_description) '
               'values (%s, %s, %s, %s, %s, %s)')
        params = (position, pay, experience, education, job_title, t_job_description)
        try:
            cursor.execute(sql, params)
            self.__conn.commit()
        except Exception as e:
            print('Error:', e)
            self.__conn.rollback()
        finally:
            cursor.close()
            self.__conn.close()
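
The insert assumes a python_lagou table already exists. A schema along these lines would match the statement above; the column types are an assumption, not taken from the original post:

import pymysql

# One-off table creation, run once before starting the crawler (column types are assumed)
create_sql = (
    'create table if not exists python_lagou ('
    ' id int primary key auto_increment,'
    ' position varchar(255),'
    ' pay varchar(64),'
    ' experience varchar(64),'
    ' education varchar(64),'
    ' job_title varchar(255),'
    ' t_job_description text'
    ')'
)
conn = pymysql.connect(host='localhost', port=3306, user='root', password='xxxxx', db='xxxx', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(create_sql)
conn.commit()
conn.close()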

Finally, run the main method. The thread setup sits at module level rather than inside main(), so that each thread runs one crawler's main() without spawning further threads.

    # main method: crawl the listing pages, then every detail page
    def main(self):
        html_list = self.get_html()
        for html in html_list:
            for url in self.parse_detail_url(html):
                for data in self.get_detail_data(url):
                    self.parse_detail_data(data)
        # Release the browser once this crawler is done
        self.driver.quit()


if __name__ == '__main__':
    # Start several crawlers, one thread (and one browser) each
    t_list = []
    for i in range(10):
        t_list.append(LaGou('https://www.xxx.com'))

    thread_list = []
    for crawler in t_list:
        thread_list.append(Thread(target=crawler.main))

    for t in thread_list:
        t.start()

    for t in thread_list:
        t.join()

Complete code

import random
import time
from threading import Thread

import pymysql
from lxml import etree
from selenium import webdriver


class LaGou(object):
    # Anti-detection options for Chrome
    option = webdriver.ChromeOptions()
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver_path = r'E:/chromedriver/chromedriver.exe'

    # Constructor
    def __init__(self, url):
        self.driver = webdriver.Chrome(executable_path=LaGou.driver_path, options=LaGou.option)
        self.url = url
        self.__host = 'localhost'
        self.__port = 3306
        self.__user = 'root'
        self.__password = 'xxx'
        self.__db = 'xxx'
        self.__charset = 'utf8'

    # Connect to MySQL
    def connect(self):
        self.__conn = pymysql.connect(
            host=self.__host,
            port=self.__port,
            user=self.__user,
            password=self.__password,
            db=self.__db,
            charset=self.__charset
        )

    # Request the listing page
    def get_html(self):
        page_list = []
        self.driver.get(self.url)

        time.sleep(random.randint(1, 3))
        for page in range(1, 2):
            # Click the pager element via JavaScript, then grab the rendered page source
            a = self.driver.find_element_by_xpath('//*[@id="s_position_list"]/div[2]/div/span[6]')
            self.driver.execute_script('arguments[0].click();', a)
            time.sleep(random.randint(1, 3))
            page_text = self.driver.page_source
            page_list.append(page_text)
        return page_list

    # Parse the detail-page URLs out of the listing page
    def parse_detail_url(self, html):
        tree = etree.HTML(html)
        # Every job card <li> carries a link to its detail page
        detail_url = tree.xpath('//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/@href')
        return detail_url

    # Request a detail page
    def get_detail_data(self, url):
        detail_data_list = []
        self.driver.get(url)
        time.sleep(random.randint(1, 3))
        detail_data = self.driver.page_source
        detail_data_list.append(detail_data)
        return detail_data_list

    # Parse the fields we want out of a detail page
    def parse_detail_data(self, html):
        detail_data = etree.HTML(html)
        div_list = detail_data.xpath('//*[@id="__next"]/div[2]')
        for all_data in div_list:
            # Position name
            position = all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[1]/text()')[0]
            # Salary
            pay = all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[2]/text()')[0]
            # Required experience
            experience = all_data.xpath('./div[1]/div/div[1]/dd/h3/span[2]/text()')[0]
            # Education
            education = all_data.xpath('./div[1]/div/div[1]/dd/h3/span[3]/text()')[0].replace('/', '')
            # Job category
            job_title = all_data.xpath('./div[1]/div/div[1]/dd/h3/div/span[2]/text()')[0]
            # Job description
            job_description = all_data.xpath('./div[2]/div[1]/dl[1]/dd[2]/div/text()')
            t_job_description = ''.join(job_description).replace('\n', '')

            all_data_dict = {
                'position': position,
                'pay': pay,
                'experience': experience,
                'education': education,
                'job_title': job_title,
                't_job_description': t_job_description
            }
            self.save_mysql(position, pay, experience, education, job_title, t_job_description)

    # Persist a record to MySQL
    def save_mysql(self, position, pay, experience, education, job_title, t_job_description):
        self.connect()
        cursor = self.__conn.cursor()
        sql = ('insert into python_lagou(position, pay, experience, education, job_title, t_job_description) '
               'values (%s, %s, %s, %s, %s, %s)')
        params = (position, pay, experience, education, job_title, t_job_description)
        try:
            cursor.execute(sql, params)
            self.__conn.commit()
        except Exception as e:
            print('Error:', e)
            self.__conn.rollback()
        finally:
            cursor.close()
            self.__conn.close()

    # main method: crawl the listing pages, then every detail page
    def main(self):
        html_list = self.get_html()
        for html in html_list:
            for url in self.parse_detail_url(html):
                for data in self.get_detail_data(url):
                    self.parse_detail_data(data)
        # Release the browser once this crawler is done
        self.driver.quit()


if __name__ == '__main__':
    # Start several crawlers, one thread (and one browser) each
    t_list = []
    for i in range(10):
        t_list.append(LaGou('https://www.xxx.com'))

    thread_list = []
    for crawler in t_list:
        thread_list.append(Thread(target=crawler.main))

    for t in thread_list:
        t.start()

    for t in thread_list:
        t.join()
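
One caveat: find_element_by_xpath and the executable_path argument are deprecated and have been removed in recent Selenium 4 releases. If you run a current Selenium version, the equivalent calls look roughly like this (a sketch using the same chromedriver path and options as above):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

option = webdriver.ChromeOptions()
option.add_experimental_option('useAutomationExtension', False)
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# Selenium 4: the driver path goes through a Service object instead of executable_path
driver = webdriver.Chrome(service=Service(r'E:/chromedriver/chromedriver.exe'), options=option)
# Selenium 4: elements are located with find_element(By.XPATH, ...)
a = driver.find_element(By.XPATH, '//*[@id="s_position_list"]/div[2]/div/span[6]')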

And that's it!
