# Source: CSDN blog post "招聘网站职位信息python_Python实战--爬取51job招聘网站数据分析师职位信息-CSDN博客"

# Article link: https://blog.csdn.net/weixin_39900531/article/details/111449666

import csv

import requests
from lxml import etree

def get_html(url, timeout=10):
    """Download a page and return its source decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The decoded page source, or ``None`` when the request fails, the
        status code is not 200, or the body cannot be decoded as GBK.
    """
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) '
                      'Gecko/20100101 Firefox/67.0',
    }
    try:
        response = requests.get(url, headers=header, timeout=timeout)
        if response.status_code == 200:
            return response.content.decode("gbk")
        print("响应状态码错误!")
    except (requests.RequestException, UnicodeDecodeError) as e:
        # Network problems and undecodable bodies are reported and turned
        # into a None result; anything else is a programming error and
        # should surface instead of being swallowed.
        print("请求出现错误,错误类型:%s" % e)
    return None

# (label suffix, multiplier, divisor): monthly value = number * multiplier / divisor.
# Day rates assume 23 working days per month and hourly rates 8 hours per day,
# the same conversion factors the salary column has always used.
_SALARY_UNITS = (
    ('万/月', 10000, 1),
    ('千/月', 1000, 1),
    ('万/年', 10000, 12),
    ('元/天', 23, 1),
    ('元/小时', 8 * 23, 1),
)


def _salary_bounds(text):
    """Return (low, high) monthly pay for a recognised label, else None."""
    for suffix, multiplier, divisor in _SALARY_UNITS:
        if not text.endswith(suffix):
            continue
        parts = text[:-len(suffix)].split('-')
        if len(parts) > 2:
            return None
        try:
            values = [float(part) * multiplier / divisor for part in parts]
        except ValueError:
            return None
        # A single figure ("200元/天", "1.5万/月") is both the low and high bound.
        return values[0], values[-1]
    return None


def _parse_salary(text):
    """Convert a salary label into (low, high); 0 if absent, '异常值' if unrecognised."""
    text = text.strip()
    if not text:
        return 0, 0
    bounds = _salary_bounds(text)
    if bounds is None:
        print("出现异常值:%s" % text)
        return '异常值', '异常值'
    return bounds


def _first_text(node, path):
    """Return the first text node matched by a relative XPath, or '' if none."""
    found = node.xpath(path)
    return found[0] if found else ''


def parse_html(html):
    """Extract the job postings from one search-result page.

    Each posting row is read as a unit, so a row with a missing field yields
    an empty string for that field instead of shifting the other columns or
    raising IndexError.

    Args:
        html: Decoded page source. ``None`` or an empty string (a failed
            download) produces an empty result.

    Returns:
        A list of ``[title, company, city, low, high, posted]`` rows in which
        every element is a string. ``low`` and ``high`` are monthly salaries,
        ``'0'`` when the posting shows no salary and ``'异常值'`` when the
        salary label is in an unrecognised format.
    """
    if not html:
        return []
    tree = etree.HTML(html)
    if tree is None:
        return []

    tlist = []
    for row in tree.xpath('/html/body/div[2]/div[4]/div[@class="el"]'):
        titles = row.xpath('p/span/a/text()')
        if not titles:
            # Rows without a job title are not postings.
            continue
        company = _first_text(row, 'span[1]/a/text()')
        location = _first_text(row, 'span[2]/text()')
        # string() flattens nested markup in the salary cell and gives ''
        # when the cell is missing.
        low, high = _parse_salary(row.xpath('string(span[3])'))
        posted = _first_text(row, 'span[4]/text()')
        tlist.append([
            titles[0].strip(),
            company,
            location.split("-")[0],  # keep the city, drop the district
            str(low),
            str(high),
            posted,
        ])
    return tlist

def save_to_csv(tlist):
    """Append rows to the output CSV file in the working directory.

    Fields are written through ``csv.writer`` so that a comma or quote inside
    a job title or company name is quoted instead of splitting the row into
    extra columns.

    Args:
        tlist: Iterable of rows, each a sequence of strings.
    """
    with open("51job数据分析师.csv", 'a', encoding='utf-8-sig') as f:
        # '\n' plus text-mode newline translation matches the line endings
        # of the header line that main() writes.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(tlist)

def main(pages=100):
    """Crawl the search-result pages and store the postings in a CSV file.

    The output file is recreated with a header line, then each result page is
    downloaded, parsed and appended in turn.

    Args:
        pages: Number of result pages to visit, starting from page 1.
    """
    with open("51job数据分析师.csv", 'w', encoding='utf-8-sig') as f:
        f.write('职位名称,公司名称,公司地点,最低薪酬,最高薪酬,发布时间\n')

    # The only variable part of the search URL is the page number ("{}").
    url_template = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='

    for page in range(1, pages + 1):
        start_url = url_template.format(page)
        print("正在访问%s页" % page)

        # Step 1: download the page source.
        html = get_html(start_url)
        if not html:
            # get_html has already reported the failure; move on to the next
            # page rather than handing an empty document to the parser.
            continue

        # Step 2: parse the postings out of the page.
        info = parse_html(html)

        # Step 3: append them to the CSV file.
        save_to_csv(info)

if __name__ == '__main__':
    # Run the crawl; per the original author it collects about 5000
    # data-analyst postings in total.
    main()