Scraping 51job (前程无忧) Job Listings with Selenium

import csv
import random
import time
from lxml import etree
from selenium import webdriver
# ChromeOptions is used both to evade automation detection and to run headless
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By

# Hide automation fingerprints and the "Chrome is being controlled by automated test software" banner
chrome_options = ChromeOptions()
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('detach', True)
chrome_options.add_argument("--disable-blink-features=AutomationControlled")

# Headless mode, no GPU, and a desktop user agent, all on the same options object
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')

# Instantiate the browser object (chromedriver must be installed and on PATH);
# Chrome() accepts only one options object, so everything lives on chrome_options
web = webdriver.Chrome(options=chrome_options)
# Inject stealth.min.js into every new document before page scripts run
with open('./stealth.min.js') as f:
    js = f.read()
web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
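# stealth.min.js is assumed to sit next to this script; it can be generated
# with `npx extract-stealth-evasions` (from puppeteer-extra-plugin-stealth).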

# Keyword markers that separate job duties from job requirements on a detail page
SPLIT_KEYWORDS = ['任职资格', '职位要求', 'Note', '任职要求',
                  '岗位任职条件', '岗位要求', '招聘条件', '应聘要求']

def modify_detail(job_details):
    """Split detail-page text into (duties, requirements) at a keyword line."""
    job_details = job_details[1:]  # drop the leading heading line
    job_details = [x.strip() for x in job_details if x.strip() != '']
    new1_list = []
    new2_list = []
    for k in range(len(job_details)):
        # the last matching line wins, same as the original elif chain
        if any(kw in job_details[k] for kw in SPLIT_KEYWORDS):
            new1_list = job_details[:k]      # duties: everything before the marker
            new2_list = job_details[k + 1:]  # requirements: everything after it
    print(new1_list)
    print(new2_list)
    return new1_list, new2_list
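# Example (hypothetical input):
# modify_detail(['职位信息', '负责量化策略研发', '任职资格:', '硕士及以上学历'])
# returns (['负责量化策略研发'], ['硕士及以上学历'])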

# Open the CSV output file in append mode and define its columns
f = open('金融招聘.csv', mode='a', encoding='utf-8', newline='')
csv_write = csv.DictWriter(f,fieldnames=[
        '岗位名称',
        '薪资',
        '公司名称',
        '公司规模',
        '所属行业',
        '工作地点',
        '工作经验',
        '学历要求',
        '关键词',
        '发布日期',
        '招聘详情',
        '岗位职责',
        '任职资格'
        # '上班地址',
        # '联系方式'
])
csv_write.writeheader()  # write the header row
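# Note: append mode plus writeheader() means rerunning the script writes a
# second header row into the same file; use mode='w' for a fresh crawl.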

# For each job keyword, search 51job and walk through the result pages
job_names = ['金融科技','区块链金融','量化投资','金融大数据分析','金融风险分析','金融产品用户运营','金融用户体验']
for j in job_names:
    # Load the home page and locate the search box
    web.get("https://www.51job.com/")
    time.sleep(5)  # wait for the page to finish loading
    web.find_element(By.XPATH, '//*[@id="kwdselectid"]').click()
    web.find_element(By.XPATH, '//*[@id="kwdselectid"]').clear()
    # Type the keyword and click the search button
    web.find_element(By.XPATH, '//*[@id="kwdselectid"]').send_keys(j)
    web.find_element(By.XPATH, '/html/body/div[3]/div/div[1]/div/button').click()  # click search

    time.sleep(10)
    for page in range(1, 50):
        print(f'============== scraping page {page} ==============')
        time.sleep(10)
        # Jump straight to the target page number via the page-jump input
        web.find_element(By.XPATH, '//*[@id="jump_page"]').click()
        time.sleep(random.randint(10, 30) * 0.1)  # short random pauses to look human
        web.find_element(By.XPATH, '//*[@id="jump_page"]').clear()
        time.sleep(random.randint(10, 40) * 0.1)
        web.find_element(By.XPATH, '//*[@id="jump_page"]').send_keys(page)
        time.sleep(random.randint(10, 30) * 0.1)
        web.find_element(By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[3]/div/div/span[3]').click()  # confirm the jump

        # Locate every job card on the current result page
        time.sleep(5)
        jobData = web.find_elements(By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[2]/div[1]/div')

        for job in jobData:
            jobName = job.find_element(By.CLASS_NAME, 'jname.at').text                   # job title
            time.sleep(random.randint(5, 15) * 0.1)
            jobSalary = job.find_element(By.CLASS_NAME, 'sal').text                      # salary
            time.sleep(random.randint(5, 15) * 0.1)
            jobCompany = job.find_element(By.CLASS_NAME, 'cname.at').text                # company name
            time.sleep(random.randint(5, 15) * 0.1)
            company_type_size = job.find_element(By.CLASS_NAME, 'dc.at').text            # company size
            time.sleep(random.randint(5, 15) * 0.1)
            company_status = job.find_element(By.CLASS_NAME, 'int.at').text              # industry
            time.sleep(random.randint(5, 15) * 0.1)
            address_experience_education = job.find_element(By.CLASS_NAME, 'd.at').text  # location | experience | education
            print(address_experience_education)
            # e.g. '上海-浦东新区 | 3-4年经验 | 本科' splits into location, experience, education
            parts = address_experience_education.split('|')
            if len(parts) == 3:
                address = parts[0]     # work location
                experience = parts[1]  # experience required
                edu = parts[2]         # education required
            else:
                # fewer than three fields: keep the location, fall back to defaults
                address = parts[0]
                experience = '无需经验'
                edu = '学历不限'
            time.sleep(random.randint(5, 15) * 0.1)
            try:
                job_welf = job.find_element(By.CLASS_NAME, 'tags').get_attribute('title')  # keywords / perks
            except:
                job_welf = '无数据'
            time.sleep(random.randint(5, 15) * 0.1)
            update_date = job.find_element(By.CLASS_NAME, 'time').text              # posting date
            time.sleep(random.randint(5, 15) * 0.1)
            job_href = job.find_element(By.CLASS_NAME, 'el').get_attribute('href')  # detail-page URL

            # Remember the current window handle (window A)
            handle = web.current_window_handle
            try:
                job.click()  # open the job detail page in a new window
            except:
                continue  # skip this company if the detail page cannot be opened
            # Collect all window handles (windows A and B)
            handles = web.window_handles
            time.sleep(5)  # wait for the new window to load
            for newhandle in handles:
                # Pick out the newly opened window B
                if newhandle != handle:
                    web.switch_to.window(newhandle)  # switch to window B
                    d_html = web.page_source
                    html = etree.HTML(d_html)
                    job_details = html.xpath('/html/body/div[2]/div[2]/div[3]/div[1]/div/text()')
                    # Split the detail text into duties and requirements
                    job_res, job_zige = modify_detail(job_details)
                    web.close()
                    dit = {
                        "岗位名称": jobName,
                        "公司名称": jobCompany,
                        "薪资": jobSalary,
                        "公司规模": company_type_size,
                        "所属行业": company_status,
                        "工作地点": address,
                        "工作经验": experience,
                        "学历要求": edu,
                        "关键词": job_welf,
                        "发布日期": update_date,
                        "招聘详情": job_href,
                        "岗位职责": job_res,
                        "任职资格": job_zige
                    }
                    print(f'scraping company {jobCompany}')
                    print(jobName, jobSalary, jobCompany, company_type_size, company_status, address, experience, edu,
                          job_welf, update_date, job_href, job_res, job_zige)
                    csv_write.writerow(dit)
                web.switch_to.window(handles[0])  # back to the results window A
            time.sleep(10)
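A note on waits: the script above paces itself entirely with fixed `time.sleep` calls, which are slow on fast pages and fragile on slow ones. A minimal sketch of the same search-box step using Selenium's explicit waits (assuming the Selenium 4 API; `web` and `By` are the objects defined in the script above):

```python
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 15 seconds for the search box instead of a fixed sleep;
# until() returns as soon as the element becomes clickable.
wait = WebDriverWait(web, 15)
search_box = wait.until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="kwdselectid"]'))
)
search_box.click()
```

Because the wait returns as soon as the condition holds, pages that load quickly are not penalized with the full delay.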


Below is an alternative example of scraping multiple pages of 51job listings with Python and Selenium, paging via the next-page button instead of the jump box:

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

# Path to the chromedriver executable
driver_path = '/path/to/chromedriver'

# Create the Chrome browser instance
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless mode
driver = webdriver.Chrome(service=Service(driver_path), options=options)

# Open the 51job home page
driver.get('https://www.51job.com/')

# Find the search box and submit a keyword
search_box = driver.find_element(By.ID, 'kwdselectid')
search_box.send_keys('Python')
search_box.send_keys(Keys.RETURN)

# Walk through the result pages and extract the listings
page_num = 1
while True:
    print(f'Scraping page {page_num}...')
    job_list = driver.find_elements(By.CLASS_NAME, 'j_joblist')
    for job in job_list:
        print(job.text)
        print('---------------------')
    # Find the next-page button and click it
    next_page = driver.find_element(By.CLASS_NAME, 'bk')
    if 'bkdisabled' in next_page.get_attribute('class'):
        break  # last page reached, stop the loop
    next_page.click()
    page_num += 1
    time.sleep(2)  # wait for the page to load

# Close the browser
driver.quit()
```

This example uses Selenium's WebDriver to simulate browser actions: opening the site, typing a keyword, submitting the search, and paging through the results. It extracts the text of each listing element on the page, then finds and clicks the next-page button, repeating until the last page is reached, and finally quits the browser.
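Compared with the `jump_page` input used in the main script, clicking the next-page button needs no fixed page-count bound, but it also cannot resume from an arbitrary page; which approach fits better depends on the run.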
