selenium 项目实战笔记-抓取51job职位薪资并分析

import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# Scrape salary strings for the keyword '自动化测试' in 上海 from 51job.com and
# store them, one per line, in salaries.txt.
driver = webdriver.Chrome()
try:
    # Implicit wait: element lookups below poll for up to 10 seconds.
    driver.implicitly_wait(10)
    driver.get("https://www.51job.com/")
    # Type the job keyword into the search box.
    driver.find_element(By.ID, 'kwdselectid').send_keys('自动化测试')
    # Open the work-location picker.
    driver.find_element(By.ID, 'work_position_click').click()
    time.sleep(1)
    # Deselect every city that is already selected.
    eles = driver.find_elements(
        By.CSS_SELECTOR, '#work_position_click_multiple_selected>span')
    for ele in eles:
        ele.click()
    driver.find_element(
        By.XPATH, '//td[@class="js_more"]/em[text()="上海"]').click()
    time.sleep(1)
    # Confirm the location choice.
    driver.find_element(By.ID, 'work_position_click_bottom_save').click()
    time.sleep(2)
    # Run the search.
    driver.find_element(By.XPATH, '//button[text()="搜索"]').click()

    # The first number in the pager text is taken as the total page count.
    pagemsg = driver.find_element(By.XPATH, '//div[@class="p_in"]/span').text
    pagenum = int(re.findall(r'\d+', pagemsg)[0])
    print(f'一个有{pagenum}页')

    job_collection = []
    # NOTE(review): the last 30 pages are skipped, so nothing is scraped when
    # pagenum <= 30 -- presumably a deliberate limit to shorten the run;
    # confirm before changing it to range(pagenum).
    for i in range((pagenum-30)):
        print("开始获取第%s页信息" %(i+1))
        # Give the result list time to render before reading it.
        time.sleep(2)
        jobs = driver.find_elements(
            By.XPATH, '//div[@class="j_joblist"]//p[@class="info"]//span[1]')
        time.sleep(3)
        for job in jobs:
            # Collapse multi-line element text into one '|'-separated line.
            job_collection.append('|'.join(job.text.split('\n')))

        if i < pagenum-1:
            # Go to the next page (last <li> of the pager).
            driver.find_element(
                By.CSS_SELECTOR, '.p_in li:nth-last-child(1)').click()
            time.sleep(1)
        # Looking up the result container makes the implicit wait block until
        # the page has loaded it.
        driver.find_element(By.CSS_SELECTOR, '.j_result')
    print("总共获取%s职位信息" %(len(job_collection)))
finally:
    # Always close the browser, even when a lookup above fails.
    driver.quit()

with open('salaries.txt', 'w', encoding='utf-8') as f:
    for job in job_collection:
        f.write(job+'\n')

import re
def convert_data(path='salaries.txt'):
    """Parse scraped salary lines into ``[low, high]`` pairs in thousands per month.

    Each line of the file is searched for a ``low-high`` numeric range and a
    unit marker.  Recognised units are normalised as follows:

    * ``万/月`` (ten-thousands per month): multiplied by 10
    * ``千/月`` (thousands per month): kept as is
    * ``万/年`` (ten-thousands per year): divided by 1.2

    Lines without a recognised unit, or without a ``low-high`` range (for
    example a single value such as ``2万/月``), are reported on stdout as
    junk data and skipped instead of raising ``IndexError``.  Blank lines
    are ignored silently.

    Args:
        path: File to read, one salary string per line (UTF-8).  Defaults to
            ``'salaries.txt'``.

    Returns:
        A list of ``[low, high]`` float pairs, in file order.
    """
    range_pattern = re.compile(r'\d+\.?\d*-\d+\.?\d*')

    with open(path, 'r', encoding="utf-8") as f:
        salaries = f.read()

    sass = []  # all normalised salary ranges
    for sa in salaries.split('\n'):
        if not sa.strip():
            # e.g. the empty string after the file's trailing newline
            continue

        found = range_pattern.search(sa)
        bounds = None
        if found is not None:
            bounds = [float(one) for one in found.group().split('-')]

        # Normalise every recognised unit to thousands per month.
        if bounds is None:
            print("垃圾数据---%s" %sa)
        elif "万/月" in sa:
            sass.append([one * 10 for one in bounds])
        elif '千/月' in sa:
            sass.append(bounds)
        elif '万/年' in sa:
            sass.append([one / 1.2 for one in bounds])
        else:
            print("垃圾数据---%s" %sa)
    return sass
def analysis_data(sass):
    """Group salary ranges into 5K-wide buckets, print the result and return it.

    A ``[low, high]`` pair is added to a bucket when either of its bounds
    lies in that bucket's half-open interval (e.g. ``5 <= x < 10`` for
    ``'5-10K'``), so one pair can appear in up to two buckets.  Pairs whose
    high bound is 30 or more go into ``'30K+'``.  Pairs entirely below 5 are
    not recorded anywhere.

    Bounds are compared numerically, so fractional values such as 6.5 are
    bucketed correctly (a ``value in range(...)`` test only matches whole
    numbers).

    Args:
        sass: Iterable of ``[low, high]`` numeric pairs.

    Returns:
        Dict mapping each bucket label to the list of pairs assigned to it.
    """
    # (label, inclusive lower edge, exclusive upper edge)
    bounded_buckets = (
        ('5-10K', 5, 10),
        ('10-15K', 10, 15),
        ('15-20K', 15, 20),
        ('20-25K', 20, 25),
        ('25-30K', 25, 30),
    )
    data_dict = {label: [] for label, _, _ in bounded_buckets}
    data_dict['30K+'] = []

    for sa in sass:
        low, high = sa[0], sa[1]
        for label, start, stop in bounded_buckets:
            if start <= low < stop or start <= high < stop:
                data_dict[label].append(sa)
        # >= so that a high bound of exactly 30 is not lost between buckets.
        if high >= 30:
            data_dict['30K+'].append(sa)

    print(data_dict)
    return data_dict
if __name__ == '__main__':
    # Keep the parsed pairs returned by convert_data() and hand them to the
    # analysis step; the name `sass` is otherwise undefined at module level.
    sass = convert_data()
    analysis_data(sass)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值