A complete Selenium crawler for nationwide Java positions on Lagou (lagou.com)

The spider below walks the Lagou search results for Java positions nationwide, opens every job detail page in a new browser window, and parses each posting with lxml:

from selenium import webdriver
from lxml import etree
import re
import time


class LagouSpider(object):
    driver_path = r'D:\chromedriver\chromedriver.exe'  # local path to the ChromeDriver executable

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_java?labelWords=&fromSearch=true&suginput='

    def run(self):
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            self._parse_list_page(source)
            time.sleep(3)

            # locate the "next page" button; keep paging until its class marks it as disabled
            next_btn = self.driver.find_element_by_xpath("//*[@id='s_position_list']/div[2]/div/span[last()]")
            if 'pager_next pager_next_disabled' not in next_btn.get_attribute('class'):
                next_btn.click()
                time.sleep(2)
            else:
                break
        time.sleep(10)
        self.driver.quit()  # quit() closes the window and also stops the driver process

    def _parse_list_page(self, source):
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")  # detail-page links on the current list page
        for link in links:
            result = self._parse_detail_page(link)  # request each detail page
            self._show_or_store(result)  # display/store the parsed data

    def _parse_detail_page(self, link):  # crawl a detail page by switching to a new window
        # self.driver.get(link)  # alternative: navigate in the same tab instead of opening a new one
        self.driver.execute_script("window.open('%s')" % link)
        self.driver.switch_to.window(self.driver.window_handles[1])
        print('Current page: %s' % self.driver.current_url)
        time.sleep(6)   # wait for each detail page to load
        source = self.driver.page_source
        html = etree.HTML(source)
        position_name = html.xpath("//div[@class='job-name']/@title")[0]
        company = html.xpath("//*[@id='job_company']/dt/a/div/h3/em/text()")[0].strip()
        temp = []
        result = []
        # the five header spans map to salary, address, work_year, education and desc
        for i in range(1, 6):
            xpath_url = "/html/body/div[4]/div/div[1]/dd/h3/span[%d]/text()" % i
            item = html.xpath(xpath_url)[0]
            temp.append(re.sub(r'/', '', item))
        position = {
            'name': position_name, 'company': company, 'salary': temp[0], 'address': temp[1],
            'work_year': temp[2], 'education': temp[3], 'desc': temp[4]
        }
        result.append(position)
        self.driver.close()  # close the current detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])  # switch back to the list page
        return result

    def _show_or_store(self, result):  # currently only prints each result; see the storage sketch below
        for i in result:
            print(i)
            print('###' * 40)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
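
The _show_or_store hook above only prints each result. As a minimal sketch of the "store" half, assuming CSV output is acceptable, the helper below appends the parsed dicts to a file; the name store_to_csv and the file name are illustrative choices, not part of the original spider.

import csv
import os


def store_to_csv(results, filename='lagou_java.csv'):
    # append each parsed position dict to a CSV file, writing the header row only once
    fieldnames = ['name', 'company', 'salary', 'address', 'work_year', 'education', 'desc']
    write_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerows(results)

Inside _show_or_store, the print loop could then be followed by a single store_to_csv(result) call.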
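
One compatibility note: the spider is written against Selenium 3, where webdriver.Chrome(executable_path=...) and find_element_by_xpath still exist; both were removed in Selenium 4. If you run a current Selenium release, the equivalent calls look roughly like this sketch (the driver path is the same assumption as in the class above):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 style: the driver path is wrapped in a Service object
driver = webdriver.Chrome(service=Service(r'D:\chromedriver\chromedriver.exe'))
driver.get('https://www.lagou.com/jobs/list_java?labelWords=&fromSearch=true&suginput=')

# element lookups use find_element(By.XPATH, ...) instead of find_element_by_xpath(...)
next_btn = driver.find_element(By.XPATH, "//*[@id='s_position_list']/div[2]/div/span[last()]")
driver.quit()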