Scraping Boss Zhipin (boss直聘) job listings with Selenium
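
The script below drives Chrome through Selenium: it opens the Boss Zhipin search results for the keyword python, collects the detail-page links from the result list, opens each link in a new tab, and extracts the job title, salary, city, required experience, education and company name with lxml XPath queries. The XPath expressions match the page layout at the time of writing and will need adjusting if the site markup changes.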

import time
from urllib.parse import urljoin

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class BossSpider(object):
    def __init__(self):
        self.driver = webdriver.Chrome()
        self.base_url = "https://www.zhipin.com/job_detail/?query=python&scity=101010100&industry=&position="
        # Collected job postings, one dict per position.
        self.positions = []

    def run(self):
        self.driver.get(self.base_url)
        # Wait until at least one result card has rendered before grabbing the page source.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='info-primary']"))
        )
        source = self.driver.page_source
        self.parse_list_page(source)

    def parse_list_page(self, source):
        html = etree.HTML(source)
        links = html.xpath(".//div[@class='info-primary']/h3[@class='name']/a/@href")
        for link in links:
            # The hrefs in the result list are relative, so resolve them against the site root.
            link = urljoin('https://www.zhipin.com/', link)
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        # Open the detail page in a new tab and switch to it.
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='main']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        time.sleep(2)
        # Close the detail tab and switch back to the list page.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        job_name = html.xpath(".//div[@class='name']/h1/text()")[0].strip()
        salary = html.xpath(".//span[@class='badge']/text()")[0].strip()
        city = html.xpath(".//div[@class='info-primary']/p/text()")[0].strip()
        work_year = html.xpath(".//div[@class='info-primary']/p/text()")[1].strip()
        education = html.xpath(".//div[@class='info-primary']/p/text()")[2].strip()
        company = html.xpath(".//div[@class='info-company']/h3[@class='name']/a/text()")[0].strip()
        # infos = html.xpath(".//div[@class='text']/text()")
        # desc = ""
        # yaoqiu = ""
        # tuandui = ""
        # for index, info in enumerate(infos):
        #     if info.startswith("【职位描述】"):
        #         for x in range(index + 1, len(infos)):
        #             desc = infos[x].strip()
        #             if desc.startswith("【"):
        #                 break
        #             print(desc, "@@@@")
        #     elif info.startswith("【职位要求】"):
        #         for x in range(index + 1, len(infos)):
        #             yaoqiu = infos[x].strip()
        #             if yaoqiu.startswith("【"):
        #                 break
        #     elif info.startswith("团队介绍"):
        #         for x in range(index + 1, len(infos)):
        #             tuandui = infos[x].strip()
        #             if tuandui.startswith("公司介绍"):
        #                 break

        position = {
            'job_name': job_name,
            'salary': salary,
            'city': city,
            'work_year': work_year,
            'education': education,
            'company': company,
        }

        self.positions.append(position)
        print(position)
        print("#" * 40)


if __name__ == '__main__':
    boss_spider = BossSpider()
    boss_spider.run()

Still to be completed: the commented-out parsing of the description / requirements / team sections, and crawling beyond the first page of results. A sketch of the section parsing follows.
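
Here is a minimal sketch of how that commented-out block could be finished. It assumes, as the draft does, that the posting body is rendered as flat text lines inside a div with class text and that each section begins with a 【...】 header line; both the selector and the header handling may need adjusting to the current markup.

from lxml import etree


def parse_description_sections(source):
    """Group the posting text lines under their 【...】 section headers."""
    html = etree.HTML(source)
    # Assumption: the posting body is plain text lines inside <div class="text">,
    # the same selector the commented-out draft targets.
    lines = [line.strip() for line in html.xpath("//div[@class='text']/text()")]
    lines = [line for line in lines if line]

    sections = {}
    current = None
    for line in lines:
        # A line such as "【职位描述】" opens a new section.
        if line.startswith("【") and line.endswith("】"):
            current = line.strip("【】")
            sections[current] = []
        elif current is not None:
            sections[current].append(line)

    # Join each section's lines back into a single string.
    return {name: "\n".join(body) for name, body in sections.items()}

Inside parse_detail_page the result could then be merged into the position dict, for example position.update(parse_description_sections(source)).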
