import csv
import random
import time
from lxml import etree
from selenium import webdriver
# Options class used to evade automation detection
from selenium.webdriver import ChromeOptions
# Options class used to run the browser headless (no visible window)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Build ONE Options object with every Chrome flag.
# FIX: the original created two option objects and passed both
# `chrome_options=` and `options=` to webdriver.Chrome; the deprecated
# `chrome_options` keyword replaces `options` in Selenium 3 (and raises a
# TypeError in Selenium 4), silently discarding the headless, GPU and
# user-agent settings.  Merging them keeps all flags active.
options = Options()
# Hide the "Chrome is being controlled by automated test software" infobar
# and the Blink automation fingerprint used by anti-bot checks.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('detach', True)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument('--headless')     # run without a visible window
options.add_argument('--disable-gpu')  # GPU not needed in headless mode
# Spoof a regular desktop-Chrome user agent.
options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
# Launch the browser (driver binary resolved from PATH).
web = webdriver.Chrome(options=options)
# Inject stealth.min.js before each document loads to mask webdriver traces.
with open('./stealth.min.js', encoding='utf-8') as fp:
    js = fp.read()
web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
def modify_detail(job_details):
    """Split a job posting's detail lines into duties and qualifications.

    The first line (the page header) is discarded, blank lines are stripped,
    and the remainder is split at the line containing one of the known
    "requirements" section headers.  When several headers appear, the last
    one wins (mirrors the original elif chain, which kept overwriting).

    Args:
        job_details: raw text fragments extracted from the detail page
            (output of an lxml ``text()`` xpath).

    Returns:
        ``(duties, qualifications)`` — two lists of stripped lines.
        FIX: when no section header is found, all lines are returned as
        duties instead of ``([], [])``, so the posting text is not lost
        (this was the intent of the commented-out ``else`` in the original).
    """
    # Section headers that mark the start of the qualifications part.
    headers = ('任职资格', '职位要求', 'Note', '任职要求',
               '岗位任职条件', '岗位要求', '招聘条件', '应聘要求')
    # Drop the page header line, then strip and discard blank fragments.
    lines = [s.strip() for s in job_details[1:] if s.strip()]
    # Fallback: no header found -> everything counts as duties.
    duties, quals = lines, []
    for k, line in enumerate(lines):
        if any(h in line for h in headers):
            # Last matching header wins, as in the original elif chain.
            duties, quals = lines[:k], lines[k + 1:]
    print(duties)
    print(quals)
    return duties, quals
#利用xpath和css选择器提取数据
# Output CSV, opened in append mode so repeated runs keep earlier rows.
# NOTE(review): a header row is written on EVERY run, so an existing file
# accumulates duplicate header lines — consider checking file existence.
fieldnames = [
    '岗位名称',   # job title
    '薪资',       # salary
    '公司名称',   # company name
    '公司规模',   # company size
    '所属行业',   # industry
    '工作地点',   # work location
    '工作经验',   # required experience
    '学历要求',   # required education
    '关键词',     # keyword / benefit tags
    '发布日期',   # publish date
    '招聘详情',   # detail-page URL
    '岗位职责',   # responsibilities
    '任职资格',   # qualifications
    # '上班地址',
    # '联系方式'
]
f = open('金融招聘.csv', mode='a', encoding='utf-8', newline='')
csv_write = csv.DictWriter(f, fieldnames=fieldnames)
csv_write.writeheader()  # write the header row
#找到职位框,输入搜索的内容
# One full search per keyword; each search crawls up to 49 result pages.
job_names = ['金融科技','区块链金融','量化投资','金融大数据分析','金融风险分析','金融产品用户运营','金融用户体验']
for j in job_names:
    # Load the 51job home page and run a fresh search for this keyword.
    web.get("https://www.51job.com/")
    time.sleep(5)  # wait for the page to render
    web.find_element(By.XPATH, '//*[@id="kwdselectid"]').click()
    web.find_element(By.XPATH, '//*[@id="kwdselectid"]').clear()
    # Type the job title into the keyword box.
    web.find_element(By.XPATH, '//*[@id="kwdselectid"]').send_keys(j)
    web.find_element(By.XPATH, '/html/body/div[3]/div/div[1]/div/button').click()  # click the search button
    time.sleep(10)
    for page in range(1,50):
        print(f'==============正在爬取{page}页信息==================')
        time.sleep(10)
        # Jump straight to the target page via the page-jump input; the
        # short randomized pauses (1.0-4.0 s) mimic human typing cadence.
        web.find_element(By.XPATH, '//*[@id="jump_page"]').click()
        time.sleep(random.randint(10, 30) * 0.1)
        web.find_element(By.XPATH, '//*[@id="jump_page"]').clear()
        time.sleep(random.randint(10, 40) * 0.1)
        web.find_element(By.XPATH, '//*[@id="jump_page"]').send_keys(page)
        time.sleep(random.randint(10, 30) * 0.1)
        web.find_element(By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[3]/div/div/span[3]').click()
        # Collect every job card on the current result page.
        time.sleep(5)
        jobData = web.find_elements(By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[2]/div[1]/div')
        #print(jobData)
        # Walk each job card: scrape the list fields, then open the detail
        # page and split responsibilities from qualifications.
        for job in jobData:
            # NOTE(review): By.CLASS_NAME with a dotted value like 'jname.at'
            # relies on the driver treating it as a compound class selector —
            # confirm this works on the Selenium version in use.
            jobName = job.find_element(By.CLASS_NAME, 'jname.at').text  # job title
            time.sleep(random.randint(5, 15) * 0.1)
            jobSalary = job.find_element(By.CLASS_NAME, 'sal').text  # salary
            time.sleep(random.randint(5, 15) * 0.1)
            jobCompany = job.find_element(By.CLASS_NAME, 'cname.at').text  # company name
            time.sleep(random.randint(5, 15) * 0.1)
            company_type_size = job.find_element(By.CLASS_NAME, 'dc.at').text  # company size
            time.sleep(random.randint(5, 15) * 0.1)
            company_status = job.find_element(By.CLASS_NAME, 'int.at').text  # industry
            time.sleep(random.randint(5, 15) * 0.1)
            # Combined "location | experience | education" field.
            address_experience_education = job.find_element(By.CLASS_NAME, 'd.at').text
            print(address_experience_education)
            length = len(address_experience_education.split('|'))
            if length == 3:
                address = address_experience_education.split('|')[0]  # work location
                experience = address_experience_education.split('|')[1]  # required experience
                edu = address_experience_education.split('|')[2]  # required education
            else:
                # Fewer than three segments: only the location is present,
                # so fall back to default experience/education labels.
                address = address_experience_education.split('|')[0]  # work location
                experience = '无需经验'
                edu = '学历不限'
            time.sleep(random.randint(5, 15) * 0.1)
            try:
                job_welf = job.find_element(By.CLASS_NAME, 'tags').get_attribute('title')  # keyword/benefit tags
            except:
                # Some cards carry no tag element; record a placeholder.
                job_welf = '无数据'
            time.sleep(random.randint(5, 15) * 0.1)
            update_date = job.find_element(By.CLASS_NAME, 'time').text  # publish date
            time.sleep(random.randint(5, 15) * 0.1)
            job_href = job.find_element(By.CLASS_NAME, 'el').get_attribute('href')  # detail-page URL
            #print(job_href)
            # Remember the current (result-list) window handle A.
            handle = web.current_window_handle
            try:
                job.click()  # open the job detail page (new window/tab)
            except:
                continue  # detail page would not open — skip this job card
            # All window handles now open (list window A + detail window B).
            handles = web.window_handles
            time.sleep(5)  # wait for the detail window to load
            for newhandle in handles:
                # Pick out the newly opened detail window B.
                if newhandle != handle:
                    web.switch_to.window(newhandle)  # switch to detail window B
                    d_html = web.page_source
                    html = etree.HTML(d_html)
                    # dt = html.xpath('//div[@class=bmsg.job_msg.inbox]')
                    job_details = html.xpath('/html/body/div[2]/div[2]/div[3]/div[1]/div/text()')
                    # Split raw detail text into duties / qualifications.
                    job_res, job_zige = modify_detail(job_details)
                    web.close()
                    # Row for the CSV; keys must match the DictWriter fieldnames.
                    dit = {
                        "岗位名称": jobName,
                        "公司名称": jobCompany,
                        "薪资": jobSalary,
                        "公司规模": company_type_size,
                        "所属行业": company_status,
                        "工作地点": address,
                        "工作经验":experience,
                        "学历要求":edu,
                        "关键词": job_welf,
                        "发布日期": update_date,
                        "招聘详情": job_href,
                        "岗位职责": job_res,
                        "任职资格": job_zige
                    }
                    print(f'正在爬取{jobCompany}公司')
                    print(jobName, jobSalary, jobCompany, company_type_size, company_status, address, experience, edu,
                          job_welf,update_date,job_href,job_res,job_zige)
                    csv_write.writerow(dit)
            # Return to the result-list window before the next card.
            # NOTE(review): assumes handles[0] is the list window — usually
            # true, but `handle` would be the safer target; confirm.
            web.switch_to.window(handles[0])
            time.sleep(10)
# --- Trailing copy/paste residue from the source blog post (commented out;
# --- the bare text was not valid Python and raised a SyntaxError):
# Selenium爬取前程无忧51job招聘信息
# 最新推荐文章于 2023-06-02 06:39:05 发布