前程无忧网招聘信息爬取

本文介绍如何使用Python进行网络爬虫,详细解析爬取前程无忧网站上的招聘信息的过程,包括设置请求头、解析HTML、提取关键数据等技术要点。
摘要由CSDN通过智能技术生成
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import json
import xlwt
# Prepare the output spreadsheet: a single sheet whose first row names
# the columns each scraped job posting will be written under.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('My Worksheet')
_HEADER_TITLES = (
    'job_name',
    'company_name',
    'company_href',
    'companytype_text',
    'workarea_text',
    'attribute_text',
    'companysize_text',
    'companyind_text',
    'salary_text',
    'jobwelf_list',
    'updatedate',
)
for _col, _title in enumerate(_HEADER_TITLES):
    worksheet.write(0, _col, _title)
# Scrape "big data" (大数据) job postings from pages 1-5 of a 51job search.
# Each results page embeds its data as JSON assigned to
# window.__SEARCH_RESULT__ inside a <script> tag; we strip that prefix,
# parse the JSON, and write one spreadsheet row per posting.
#
# NOTE(review): the tail of this block was truncated in the source page
# (cut off mid-statement by the site's paywall). The writes for columns
# 6-10 and the final workbook.save() are reconstructed from the obvious
# repeating pattern of columns 0-5 — confirm against the original article.
row = 1  # next free worksheet row; row 0 holds the column headers
for a in range(1, 6):
    url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{0}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(a)
    # A browser-like User-Agent is required; the default urllib UA is blocked.
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
    request = Request(headers=header, url=url)
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(request) as response:
        bs = BeautifulSoup(response, 'html.parser')

    for script in bs.find_all('script', type='text/javascript'):
        text = script.get_text()
        # Only the search-result script carries the JSON payload. The
        # original `len(t) > 0` test fed every non-empty script into
        # json.loads and crashed on unrelated scripts.
        if 'window.__SEARCH_RESULT__' not in text:
            continue
        payload = json.loads(text.replace('window.__SEARCH_RESULT__ = ', ' '))
        for job in payload["engine_search_result"]:
            job_name = job['job_name']
            company_name = job['company_name']
            company_href = job['company_href']
            companytype_text = job['companytype_text']
            workarea_text = job['workarea_text']
            attribute_text = job['attribute_text']
            companysize_text = job['companysize_text']
            companyind_text = job['companyind_text']
            salary_text = job['providesalary_text']
            # jobwelf_list is a JSON array; xlwt's write() rejects lists,
            # so join the welfare tags into one cell-friendly string.
            jobwelf_list = ' '.join(job['jobwelf_list'])
            updatedate = job['updatedate']
            print(job_name, company_name, company_href, companytype_text,
                  workarea_text, attribute_text, companysize_text,
                  companyind_text, salary_text, jobwelf_list, updatedate)

            # A running row counter replaces the original
            # `i + (a-1)*len(y) + 1` formula, which silently assumed every
            # page returns the same number of results.
            worksheet.write(row, 0, job_name)
            worksheet.write(row, 1, company_name)
            worksheet.write(row, 2, company_href)
            worksheet.write(row, 3, companytype_text)
            worksheet.write(row, 4, workarea_text)
            worksheet.write(row, 5, attribute_text)
            worksheet.write(row, 6, companysize_text)
            worksheet.write(row, 7, companyind_text)
            worksheet.write(row, 8, salary_text)
            worksheet.write(row, 9, jobwelf_list)
            worksheet.write(row, 10, updatedate)
            row += 1

# Persist everything scraped above. Filename reconstructed — the original
# save call was lost to the truncation; adjust as needed.
workbook.save('51job_bigdata.xls')
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值