selenium爬虫boss直聘

最新推荐文章于 2024-07-23 09:28:17 发布

ShirleyQueen321

最新推荐文章于 2024-07-23 09:28:17 发布

阅读量821

点赞数

分类专栏： python 爬虫

本文链接：https://blog.csdn.net/weixin_40569991/article/details/82023657

版权

python 同时被 2 个专栏收录

30 篇文章 0 订阅

订阅专栏

爬虫

18 篇文章 0 订阅

订阅专栏

import time
from urllib.parse import urljoin

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class Boss_spydier(object):
    """Selenium-driven crawler for a zhipin.com job search result page.

    ``run()`` loads the search page at ``base_url``, opens every job link
    found on it in a separate browser tab, extracts a few fields from the
    detail page and appends them as a dict to ``positions``.

    The class name (including its spelling) is kept unchanged so existing
    callers keep working.
    """

    # Root used to turn the hrefs found on the list page into absolute URLs.
    SITE_ROOT = "https://www.zhipin.com/"

    def __init__(self):
        """Start a Chrome session and set the search URL to crawl."""
        self.driver = webdriver.Chrome()
        self.base_url = "https://www.zhipin.com/job_detail/?query=python&scity=101010100&industry=&position="
        # One dict per parsed job detail page, in crawl order.
        self.positions = []

    @staticmethod
    def _absolute_url(href):
        """Return ``href`` resolved against the site root.

        ``urljoin`` copes with root-relative (``/job_detail/x.html``),
        relative and already-absolute hrefs, whereas plain concatenation
        produced ``https://www.zhipin.com//job_detail/...`` for the first
        form and a broken URL for the last.
        """
        return urljoin(Boss_spydier.SITE_ROOT, href)

    @staticmethod
    def _text_at(values, index):
        """Return ``values[index]`` stripped, or ``''`` when it is missing.

        A detail page lacking one field should yield an empty value for
        that field instead of aborting the whole crawl with ``IndexError``.
        """
        if 0 <= index < len(values):
            return values[index].strip()
        return ""

    def run(self):
        """Load the search result page and crawl every job listed on it."""
        self.driver.get(self.base_url)
        source = self.driver.page_source
        self.parse_list_page(source)

    def parse_list_page(self, source):
        """Extract the job links from the list page HTML and visit each one.

        :param source: HTML text of the search result page.
        """
        html = etree.HTML(source)
        links = html.xpath(".//div[@class='info-primary']/h3[@class='name']/a/@href")
        for link in links:
            self.request_detail_page(self._absolute_url(link))
            # Pause between detail pages, as the original crawl did.
            time.sleep(1)

    def request_detail_page(self, url):
        """Open ``url`` in a new tab, parse it, then return to the list tab.

        :param url: absolute URL of a job detail page.

        The detail tab is always closed and focus is always handed back to
        the list tab, even when waiting for the page or parsing it raises;
        the exception itself still propagates to the caller.
        """
        list_handle = self.driver.current_window_handle
        known_handles = set(self.driver.window_handles)

        # Pass the URL as a script argument rather than splicing it into the
        # JavaScript text, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", url)

        opened = [h for h in self.driver.window_handles if h not in known_handles]
        if not opened:
            # No new tab appeared (for example the pop-up was blocked):
            # there is nothing to parse and nothing to close.
            print("could not open detail page: %s" % url)
            return

        self.driver.switch_to.window(opened[0])
        try:
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='main']"))
            )
            source = self.driver.page_source
            self.parse_detail_page(source)
            time.sleep(2)
        finally:
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    def parse_detail_page(self, source):
        """Extract the job fields from a detail page and record them.

        :param source: HTML text of a job detail page.

        The resulting dict is appended to ``self.positions`` and printed.
        Fields that cannot be found on the page are stored as ``''``.
        """
        html = etree.HTML(source)

        # city, work_year and education are items 0, 1 and 2 of the same
        # XPath result, so evaluate it once.
        summary = html.xpath(".//div[@class='info-primary']/p/text()")

        # TODO: the job description / requirements text is not extracted yet.
        position = {
            'job_name': self._text_at(html.xpath(".//div[@class='name']/h1/text()"), 0),
            'salary': self._text_at(html.xpath(".//span[@class='badge']/text()"), 0),
            'city': self._text_at(summary, 0),
            'work_year': self._text_at(summary, 1),
            'education': self._text_at(summary, 2),
            'company': self._text_at(
                html.xpath(".//div[@class='info-company']/h3[@class='name']/a/text()"), 0
            ),
        }

        self.positions.append(position)
        print(position)
        print("#" * 40)


if __name__ == '__main__':
    # Script entry point: crawl once, then always shut the browser down so
    # the Chrome session started by the spider is not left running, whether
    # the crawl finished or raised.
    boss_spider = Boss_spydier()
    try:
        boss_spider.run()
    finally:
        boss_spider.driver.quit()

待完善……

ShirleyQueen321

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
selenium爬虫boss直聘

import time; from selenium import webdriver; from lxml import etree; from selenium.webdriver.common.by import By; from selenium.webdriver.support.wait import WebDriverWait; from selenium.webdriver.suppo...
复制链接

扫一扫

专栏目录