51job数据分析岗位信息爬取

对2022年51job招聘网站上数据分析岗应届生的招聘信息进行爬取

在爬取数据时,如果不登录帐户,cookie 不会发生变化;登录之后 cookie 是否会变化尚未验证。
下边是代码
导入需要的包

import json
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

用BS4对网页源代码进行解析,用正则表达式提取所需信息。

# Output table: one row per scraped job posting, pre-allocated with 1000
# empty rows that the scraping loop fills in by position.
# The column labels are passed as a flat list. A nested list ([[...]]) would
# make pandas build a one-level MultiIndex whose labels are 1-tuples, so
# ordinary string-label access would no longer address a plain column.
data = pd.DataFrame(
    columns=[
        '公司名称',
        '公司地点',
        '职位名称',
        '学历',
        '经验',
        '薪资',
        '行业',
        '公司规模',
        '是否上市',
        '职位福利',
    ],
    index=range(1000),
)
# Index of the next row of `data` to be written.
n = 0

data 为保存抓取结果的目标 DataFrame,n 为写入行索引的初始值(从 0 开始,每写入一条职位信息加 1)。

# HTTP headers sent with every search request.
# The Cookie value is assembled from adjacent string literals (one per cookie
# pair) so that it forms a single line; a quoted literal cannot contain a raw
# line break.
# NOTE(review): this is a hard-coded cookie captured from a browser session;
# it will go stale and would be better supplied from configuration.
header = {
    'Cookie': (
        'guid=ce93793161b5efe686f678f633b4a5a0; '
        '_ujz=MTk5Njc4NDU5MA%3D%3D; '
        'ps=needv%3D0; '
        '51job=cuid%3D199678459%26%7C%26cusername%3DFZrRRqyPd%252Fv5%252Bsi3IB0rF6fY2wniQApfpwFm5i1%252BQqM%253D%26%7C%26cpassword%3D%26%7C%26cname%3D2XpJWSS7j7QVRIma%252BwKcGQ%253D%253D%26%7C%26cemail%3D%26%7C%26cemailstatus%3D0%26%7C%26cnickname%3D%26%7C%26ccry%3D.0PU14bXOQA1k%26%7C%26cconfirmkey%3D%25241%2524%252FwUjwv3I%2524iY5H8jjaEYbDAXrop.ER6%252F%26%7C%26cautologin%3D1%26%7C%26cenglish%3D0%26%7C%26sex%3D0%26%7C%26cnamekey%3D%25241%2524hC0%252Ffm0B%2524grFZqVxGBVsNIq2Je5y7Q1%26%7C%26to%3D673e7d0dc8edaf405d4f99828ddd76f66189eeba%26%7C%26; '
        'adv=ad_logid_url%3Dhttps%253A%252F%252Ftrace.51job.com%252Ftrace.php%253Fpartner%253Dsem_pcbaidu7_134408%2526ajp%253DaHR0cHM6Ly9ta3QuNTFqb2IuY29tL3RnL3NlbS9MUF8yMDIwXzEuaHRtbD9mcm9tPWJhaWR1YWQ%253D%2526k%253Dd946ba049bfb67b64f408966cbda3ee9%2526bd_vid%253D8584349858831256702%26%7C%26; '
        'partner=sem_pcbaidupz_2; '
        'slife=lastlogindate%3D20211117%26%7C%26; '
        'privacy=1637140590; '
        'nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; '
        'search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA01%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA04%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA01%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA05%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA01%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21; '
        'ssxmod_itna=iqGxg7Dti=0=0QDOFDXKG7AHG=X96YOknY+R7iqGXL3DZDiqAPGhDC+bzYhG3+74he=W6G0q=dK7ic0YjF7WawYoWrBfK3DU4i8DCT+KTD4RKGwD0eG+DD4DWYqAoDexGp9uEyKGWD4qDRDAQDzMLyDG3gDYp9tqDglqDBGEdDKqGg8wUUzWidxGU5AM+d8qDM0eGXFia48TRakpDlAknbpo+KD0ps9Pmw97yeFWpuYmiDtqD9ej=DbSddyPtZFnvxWo+WSE33FGp5UYxW4tp5m7xqzGGx8A+de+GP8hwKnfvQYDDAFv+lPD; '
        'ssxmod_itna2=iqGxg7Dti=0=0QDOFDXKG7AHG=X96YOknY+R7DA6nvpdD/zbUDFg27u9Z=+UvMa48dAxl+ihH8lSRqkPHzE5h+kzpqidcWKmKFvC5BaQHv7T8cCtwzXj=2df8y8PkNjm6b1MpP2dSusm6WNq/yFelDTnjv+MxPNDRcFi/Ykk6nwNDuw0Ooxdbaqz4YE=iuKqlvxe60qt4AxCFe4nQiefFqare2s8aOkPxMXjh5mypHQzrWOwxQFLrAHVmENKCfk3Bd7aq1a/b6XoHw1i8IhMFmLXld7f/knVxIscetaMSyXpqHg86ybIEoiSW8oCxl5Fgt7T8KwdyO=oCOibQ7xkFh+U2C6gs3TdG23muDNq1/AY5gNL2NWrmDgzTmri3dAgP7QC7eTnj=owGrGw+Wddjd8OCRrGTStOCbelYsqHTwprhvnhoOhHib36ozVnADndtrPD7QixGcDG7eiDD==='
    ),
    'Host': 'search.51job.com',
    'Referer': 'https://www.51job.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
# Only the first twenty result pages are fetched; later pages contain too
# many unrelated positions.
pages = range(1, 21)

51job上数据分析岗的招聘信息有好多页,但20页往后的信息不太匹配就不进行爬取。

# Captures, in the named group "text", whatever sits between the
# "engine_jds": key and the following ,"jobid_count" key.
# DOTALL lets the lazy ".*?" run across line breaks inside that span.
regex1 = re.compile(
    r'"engine_jds":(?P<text>.*?),"jobid_count"',
    flags=re.DOTALL,
)

对源代码进行解析后发现我们所需要的数据需要使用正则表达式进行提取。

# Maps each directly-copied DataFrame column to the key that holds its value
# in a job record parsed from the search page.
_FIELD_KEYS = {
    '公司名称': 'company_name',
    '公司地点': 'workarea_text',
    '职位名称': 'job_name',
    '薪资': 'providesalary_text',
    '行业': 'companyind_text',
    '公司规模': 'companysize_text',
    '是否上市': 'companytype_text',
    '职位福利': 'jobwelf',
}


def _extract_job_list(html):
    """Return the job records embedded in one search-result page.

    Args:
        html: Text of the search-result page as returned by the server.

    Returns:
        The value decoded from the text captured by ``regex1`` (the span that
        follows the "engine_jds" key).

    Raises:
        ValueError: If the page has fewer than three
            ``<script type="text/javascript">`` tags, if the third one does
            not match ``regex1``, or if the captured text is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Every backslash is removed before parsing, exactly as the script has
    # always done.
    # NOTE(review): this also mangles any escape other than an escaped slash
    # (e.g. escaped quotes); confirm the payload never contains those.
    cleaned = html.replace('\\', '')
    soup = BeautifulSoup(cleaned, 'html.parser')

    # The job data is expected in the third text/javascript <script> tag.
    scripts = soup.find_all('script', attrs={'type': "text/javascript"})
    if len(scripts) < 3:
        raise ValueError(
            f'expected at least 3 text/javascript <script> tags, found {len(scripts)}'
        )

    match = regex1.search(str(scripts[2]))
    if match is None:
        raise ValueError('no "engine_jds" payload found in the third <script> tag')

    # json.loads rather than eval: the text comes from the network, and eval
    # would execute whatever expression the server happened to send.
    return json.loads(match.group('text'))


def _attribute(job, position):
    """Return ``job['attribute_text'][position]``, or None if the list is too short."""
    attributes = job['attribute_text']
    return attributes[position] if len(attributes) > position else None


for page in pages:
    print(f'正在抓取第{page}页数据')
    # URL of the current result page.
    url = f'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,{page}.html?lang=c&postchannel=0000&workyear=01&cotype=99&degreefrom=04%252c05&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='

    # The timeout keeps a stalled connection from hanging the run forever;
    # the with-block closes the response even if reading it fails.
    with requests.get(url, headers=header, timeout=30) as resp:
        resp.raise_for_status()
        text = resp.text

    for temp in _extract_job_list(text):
        for column, key in _FIELD_KEYS.items():
            data.loc[n, column] = temp[key]
        # Experience and education are read by position from attribute_text.
        data.loc[n, '学历'] = _attribute(temp, 2)
        data.loc[n, '经验'] = _attribute(temp, 1)

        n += 1
    print(f'第{page}页抓取完毕,停顿一分钟后开始下一次循环')
    time.sleep(60)

print('-----end-----')
  • 0
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值