51job:使用 requests 获取网页内容,再用正则表达式筛选求职岗位、薪资等信息

网页展示

在这里插入图片描述

代码

import requests, re

import xlwt#excel写模块


# Step 1: build the in-memory Excel workbook that the scraper fills in.
def excel_init():
    """Create a workbook with a "51job" sheet whose first row holds the headers.

    Nothing is written to disk here; the caller is expected to call
    ``save`` on the returned workbook once all rows have been written.

    Returns:
        tuple: ``(workbook, worksheet)`` as created by ``xlwt``.
    """
    book = xlwt.Workbook(encoding='utf-8')
    # Step 2: add the single sheet that receives the scraped rows.
    sheet = book.add_sheet("51job")
    headers = ['岗位', '公司名称', '地址', '薪资', '发布时间']
    # Sheet rows and columns are both 0-based, so row 0 is the header row.
    for col, title in enumerate(headers):
        sheet.write(0, col, title)  # write(row, column, value)

    return book, sheet

# Prepare the workbook and the sheet that the crawl loop writes into.
workBook, workSheet = excel_init()

# Search-result URL for page 1 of the query; it is used to read the page count.
web_url = 'https://search.51job.com/list/020000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='

# Request headers sent with every page fetch; the User-Agent makes the
# request look like it comes from a regular browser.
user_header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/55.0.2883.87 Safari/537.36'
    ),
}
# https://search.51job.com/
# 51job

# Helper: read the total number of result pages for a search URL.
def get_pages(url):
    """Return the total page count reported by a 51job search-result page.

    The page is fetched with the module-level ``user_header`` headers, decoded
    as GBK, and searched for the embedded ``"total_page":"<n>"`` field.

    Args:
        url: Search-result URL to fetch.

    Returns:
        int: The value of the ``total_page`` field.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the response has no ``total_page`` field (for example
            when an anti-scraping page is served) or its value is not an
            integer.
    """
    # A timeout keeps the script from hanging forever on a stalled connection;
    # 10 seconds is an arbitrary but generous limit for a single page.
    resp = requests.get(url, headers=user_header, timeout=10)
    resp.raise_for_status()
    resp.encoding = 'gbk'  # decode the body as GBK before reading .text

    found = re.search(r'"total_page":"(.*?)",', resp.text, re.S)
    if found is None:
        raise ValueError('"total_page" not found in the response for %s' % url)
    return int(found.group(1).strip())


# Anti-scraping measures a crawler may run into:
#   1. Browser detection -- the remote host forcibly closes the connection;
#      worked around here by sending a browser-like User-Agent header.
#   2. CAPTCHAs
#   3. IP bans
#   4. Account bans

# Each job record sits between a "job_href" key and the next
# "is_special_job":"", marker in the page source. Abbreviated sample:
#   "job_href":"https:\/\/jobs.51job.com\/shanghai-qpq\/128395583.html?s=01&t=0",
#   "job_name":"自动化测试工程师","job_title":"自动化测试工程师",
#   "company_name":"上海宽文是风软件有限公司","providesalary_text":"1-1.5万\/月",
#   "workarea":"021600","workarea_text":"上海-青浦区","updatedate":"01-16","isIntern":"",
#   ... "is_special_job":"",
_RECORD_PATTERN = re.compile(r'"job_href"(.*?)"is_special_job":"",', re.S)

# (Excel column, pattern capturing that column's value inside one record).
# Compiled once here instead of on every loop iteration.
_FIELD_PATTERNS = (
    (0, re.compile(r'"job_name":"(.*?)","job_title"', re.S)),              # job title
    (1, re.compile(r'"company_name":"(.*?)","providesalary_text"', re.S)),  # company
    (2, re.compile(r'"workarea_text":"(.*?)","updatedate":', re.S)),        # address
    (3, re.compile(r'"providesalary_text":"(.*?)","workarea"', re.S)),      # salary
    (4, re.compile(r'"updatedate":"(.*?)","isIntern"', re.S)),              # publish date
)


def _first_field(pattern, record):
    """Return the first value captured by ``pattern`` in ``record``, or ''."""
    found = pattern.search(record)
    if found is None:
        return ''
    # The embedded data escapes "/" as "\/" (see the sample above); undo that
    # so cells read "1-1.5万/月" rather than "1-1.5万\/月".
    return found.group(1).replace('\\/', '/')


# Fetch the page count once and reuse it for both the log line and the loop.
total_pages = get_pages(web_url)
print('----------->:', total_pages)

row = 1  # next sheet row to fill; row 0 holds the column headers
for page in range(1, total_pages + 1):
    # 1. Build the URL for this page (the page number goes right before
    #    ".html") and request it.
    web_url = f'https://search.51job.com/list/020000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595,2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    # The timeout keeps one stalled page from hanging the whole crawl.
    resp = requests.get(web_url, headers=user_header, timeout=10)
    # 2. Decode the response body as GBK.
    resp.encoding = 'gbk'
    # 3. Split the page into job records and write one sheet row per record,
    #    storing each field as a plain string.
    for record in _RECORD_PATTERN.findall(resp.text):
        for col, pattern in _FIELD_PATTERNS:
            workSheet.write(row, col, _first_field(pattern, record))
        row += 1

# 4. Save the workbook to disk.
# NOTE(review): the original comment suggests the .xls file must be closed
# (not open in Excel) when this runs -- confirm.
workBook.save('D:\\pythonCourse\\51job.xls')

'''
1- https://search.51job.com/list/020000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
2- https://search.51job.com/list/020000,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=



'''

excel 保存获取结果

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值