使用 Python 爬取前程无忧(51job)的招聘信息

1,使用的模块:

import codecs
import csv
import requests
import re
import json
import pprint

2,主要爬取内容:

        '职位名称',
        '基本信息',
        '公司名字',
        '工作地点',
        '公司类型',
        '公司规模',
        '公司性质',
        '福利',
        '工资',
        '信息发布时间',
        '职位详情页',

3,URL 不写死:通过 if/elif 判断用户输入的城市,再选择对应的 URL,因此可以爬取不同城市的数据。

4,最终实现代码

import codecs
import csv

import requests
import re
import json
import pprint
# Open the output CSV in append mode so rows written by earlier runs are kept.
# newline='' leaves line-ending handling to the csv module, as the csv docs
# require for files passed to a writer.
f = open(
    '前程无忧.csv',
    mode='a',
    encoding='utf-8-sig',
    newline='')
# DictWriter maps each row dict onto these columns, in this order.
csv__ = csv.DictWriter(
    f,
    fieldnames=[
        '职位名称',
        '基本信息',
        '公司名字',
        '工作地点',
        '公司类型',
        '公司规模',
        '公司性质',
        '福利',
        '工资',
        '信息发布时间',
        '职位详情页',
    ],
)
# A file opened for appending is positioned at its end, so offset 0 means the
# file is brand new or empty. Only then write the header; otherwise every run
# would insert another header row in the middle of the data.
if f.tell() == 0:
    csv__.writeheader()
# Search URL shared by every city; only the area code in the first path field
# differs. The keyword segment is the doubly percent-encoded text "软件工程".
_SEARCH_URL_TEMPLATE = (
    'https://search.51job.com/list/{area_code},000000,0000,00,9,99,'
    '%25E8%25BD%25AF%25E4%25BB%25B6%25E5%25B7%25A5%25E7%25A8%258B'
    ',2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
)

# City name as typed by the user -> area code used in the search URL.
_CITY_AREA_CODES = {
    '成都': '090200',
    '北京': '010000',
    '上海': '020000',
    '广州': '030200',
    '深圳': '040000',
    '武汉': '180200',
    '西安': '200200',
    '杭州': '080200',
    '南京': '070200',
    '重庆': '060000',
    '东莞': '030800',
    '大连': '230300',
    '沈阳': '230200',
    '苏州': '070300',
}

print("输入你的城市:")
# Named `city` rather than `str` so the builtin is not shadowed; surrounding
# whitespace is stripped so " 成都 " still matches.
city = input().strip()
if city not in _CITY_AREA_CODES:
    # Previously an unknown city left `url` undefined and the script failed
    # later with a NameError; stop here with an actionable message instead.
    supported = '、'.join(_CITY_AREA_CODES)
    raise SystemExit('不支持的城市: {!r}。可选城市: {}'.format(city, supported))
url = _SEARCH_URL_TEMPLATE.format(area_code=_CITY_AREA_CODES[city])
# Send a browser-like User-Agent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29'
}

# Fetch the search-result page. The timeout keeps a stalled connection from
# hanging the script forever, and raise_for_status() turns HTTP 4xx/5xx
# responses into an immediate, descriptive error.
response = requests.get(
    url=url,
    headers=headers,
    timeout=10,
)
response.raise_for_status()
# Debug output: dump the raw page that was received.
print(response.text)

# The page embeds its result set as a JSON literal assigned to
# window.__SEARCH_RESULT__ inside a <script> tag; capture that literal.
_search_result = re.search(
    r'window\.__SEARCH_RESULT__ =(.*?)</script>',
    response.text)
if _search_result is None:
    # Without this check a page lacking the marker surfaced only as a bare
    # IndexError with no hint about the cause.
    raise RuntimeError(
        'window.__SEARCH_RESULT__ not found in the response; '
        'the page layout may have changed or the request was blocked')
html_data = _search_result.group(1)
# Decode the captured JSON text into Python objects.
json_data = json.loads(html_data)

# Job postings found in the parsed search result.
engine = json_data['engine_jds']
pprint.pprint(engine)

# CSV column -> key of the posting dict that supplies its value.
# NOTE(review): '公司类型' is filled from companyind_text and '公司性质' from
# companytype_text, exactly as before — confirm the labels match the data.
_COLUMN_SOURCES = (
    ('职位名称', 'job_name'),
    ('公司名字', 'company_name'),
    ('工作地点', 'workarea_text'),
    ('公司类型', 'companyind_text'),
    ('公司规模', 'companysize_text'),
    ('公司性质', 'companytype_text'),
    ('福利', 'jobwelf'),
    ('工资', 'providesalary_text'),
    ('信息发布时间', 'updatedate'),
    ('职位详情页', 'job_href'),
)

try:
    for job in engine:
        row = {column: job[key] for column, key in _COLUMN_SOURCES}
        # attribute_text is a list of strings; flatten it into one cell.
        row['基本信息'] = ' '.join(job['attribute_text'])
        csv__.writerow(row)
finally:
    # Close the CSV even if a posting is malformed, so rows already written
    # are flushed to disk and the file handle is not leaked.
    f.close()


5,结果:

 

  • 6
    点赞
  • 47
    收藏
    觉得还不错? 一键收藏
  • 5
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值