对2022年51job招聘网站上数据分析岗应届生的招聘信息进行爬取
在爬取数据时,不登录账户的情况下cookie不会发生变化,不知道登录后是否会发生变化……
下边是代码
导入需要的包
import json
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
用BS4对网页源代码进行解析,用正则表达式提取所需信息。
# Output table: one row per scraped job posting, pre-allocated with 1000
# empty rows that the crawl loop fills by position.
# `columns` has to be a flat list of labels. A nested list ([[...]]) makes
# pandas build a MultiIndex, so every column label becomes a 1-tuple such as
# ('公司名称',) instead of the plain string.
data = pd.DataFrame(
    columns=[
        '公司名称',
        '公司地点',
        '职位名称',
        '学历',
        '经验',
        '薪资',
        '行业',
        '公司规模',
        '是否上市',
        '职位福利',
    ],
    index=range(1000),
)
# Row position in `data` that the next scraped posting is written to.
n = 0
data为目标输出,n为初始化索引值。
# HTTP headers sent with every search-page request.
#
# The session cookie is intentionally NOT embedded in the source any more:
# the previous literal contained account identifiers of a logged-in session
# (a credential leak once the file is shared) and was split across two
# physical lines inside a single-quoted string, which is a syntax error.
# Copy the Cookie header from your own browser session into the JOB51_COOKIE
# environment variable before running; when it is unset no Cookie header is
# sent at all.
header = {
    'Host': 'search.51job.com',
    'Referer': 'https://www.51job.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}
_cookie = os.environ.get('JOB51_COOKIE')
if _cookie:
    header['Cookie'] = _cookie
# Result pages to crawl: 1 through 20 inclusive. Later pages contain too many
# unrelated postings, so they are skipped.
pages = range(1, 20 + 1)
51job上数据分析岗的招聘信息有好多页,但20页往后的信息不太匹配就不进行爬取。
# Captures, as the named group "text", whatever sits between the
# "engine_jds": key and the following ,"jobid_count" key. DOTALL lets the
# non-greedy capture run across line breaks.
regex1 = re.compile(
    r'"engine_jds":(?P<text>.*?),"jobid_count"',
    flags=re.DOTALL,
)
对源代码进行解析后发现,我们所需的数据要用正则表达式进行提取。
# Crawl every result page listed in `pages`: download the page, pull the
# embedded job list out of one of its <script> tags with `regex1`, and copy
# the fields of interest into `data`, one row per posting. `n` is the running
# row index and keeps counting across pages.
for page in pages:
    print(f'正在抓取第{page}页数据')
    # Search URL of the current result page. The degree filter is the
    # `degreefrom` query parameter; it had been corrupted to "°reefrom"
    # (the "&deg" prefix was decoded as an HTML entity). The "&" is kept at
    # the end of the preceding fragment so the two never sit next to each
    # other in the source again.
    url = (
        'https://search.51job.com/list/000000,000000,0000,00,9,99,'
        '%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,'
        f'2,{page}.html?lang=c&postchannel=0000&workyear=01&cotype=99&'
        'degreefrom=04%252c05&jobterm=99&companysize=99&ord_field=0'
        '&dibiaoid=0&line=&welfare='
    )
    # The context manager releases the connection even when reading the body
    # fails; the timeout keeps a stalled server from hanging the crawl, and
    # raise_for_status() stops on an HTTP error page instead of failing later
    # with an unrelated IndexError.
    with requests.get(url, headers=header, timeout=30) as resp:
        resp.raise_for_status()
        text = resp.text  # page body as text
    text = text.replace('\\', '')  # strip every backslash from the page text
    soup = BeautifulSoup(text, 'html.parser')
    # The job list is read from the third text/javascript <script> tag.
    script = soup.find_all('script', attrs={'type': "text/javascript"})[2]
    # Raw text of the "engine_jds" value (the list of postings).
    string = regex1.findall(str(script))[0]
    # NOTE(security): this used to be eval(string). The text comes straight
    # from the remote server, so evaluating it would execute whatever code the
    # response contained. It is parsed as JSON instead, which also accepts
    # true/false/null literals that eval() would reject.
    job_info = json.loads(string)
    for temp in job_info:
        data.loc[n, '公司名称'] = temp['company_name']
        data.loc[n, '公司地点'] = temp['workarea_text']
        data.loc[n, '职位名称'] = temp['job_name']
        # Education and experience are taken by position from attribute_text.
        # A posting whose list is too short gets None instead of aborting the
        # whole crawl with IndexError.
        # NOTE(review): positions 2 / 1 are assumed to always mean education /
        # experience - confirm against postings that omit one of them.
        attributes = temp['attribute_text']
        data.loc[n, '学历'] = attributes[2] if len(attributes) > 2 else None
        data.loc[n, '经验'] = attributes[1] if len(attributes) > 1 else None
        data.loc[n, '薪资'] = temp['providesalary_text']
        data.loc[n, '行业'] = temp['companyind_text']
        data.loc[n, '公司规模'] = temp['companysize_text']
        data.loc[n, '是否上市'] = temp['companytype_text']
        data.loc[n, '职位福利'] = temp['jobwelf']
        n += 1
    print(f'第{page}页抓取完毕,停顿一分钟后开始下一次循环')
    # Pause between pages to keep the request rate low.
    time.sleep(60)
print('-----end-----')