python实现对前程无忧的信息的爬取

最新推荐文章于 2024-05-01 22:08:22 发布

takumicc

最新推荐文章于 2024-05-01 22:08:22 发布

阅读量3.8k

点赞数 6

文章标签： python 开发语言 后端

本文链接：https://blog.csdn.net/weixin_43770486/article/details/122056883

版权

1，使用的模块：

import codecs
import csv
import requests
import re
import json
import pprint

2，主要爬取内容：

        '职位名称',
        '基本信息',
        '公司名字',
        '工作地点',
        '公司类型',
        '公司规模',
        '公司性质',
        '福利',
        '工资',
        '信息发布时间',
        '职位详情页',

3，URL 资源路径不固定：通过 if 和 elif 判断用户输入的城市，再选择对应的 URL，因此可以爬取不同城市的数据。

4，最终实现代码

import codecs
import csv

import requests
import re
import json
import pprint
# Open the output CSV in append mode so rows written by earlier runs are kept.
# newline='' leaves line-ending handling to the csv module; the encoding is
# UTF-8 with a signature (BOM).
f = open(
    '前程无忧.csv',
    mode='a',
    encoding='utf-8-sig',
    newline='')
# DictWriter maps each row dict onto the columns listed in fieldnames, in
# this order.
csv__ = csv.DictWriter(
    f,
    fieldnames=[
        '职位名称',
        '基本信息',
        '公司名字',
        '工作地点',
        '公司类型',
        '公司规模',
        '公司性质',
        '福利',
        '工资',
        '信息发布时间',
        '职位详情页',
    ]
)
# In append mode the stream is positioned at the end of the file, so a
# position of 0 means the file is new or empty. Write the header only then;
# writing it unconditionally would add a duplicate header row on every run.
if f.tell() == 0:
    csv__.writeheader()
# Code that identifies each supported city in the search URL. It is the only
# part of the URL that differs between cities (presumably 51job's area code
# -- confirm against the site if adding more cities).
CITY_CODES = {
    '成都': '090200',
    '北京': '010000',
    '上海': '020000',
    '广州': '030200',
    '深圳': '040000',
    '武汉': '180200',
    '西安': '200200',
    '杭州': '080200',
    '南京': '070200',
    '重庆': '060000',
    '东莞': '030800',
    '大连': '230300',
    '沈阳': '230200',
    '苏州': '070300',
}
# Search-result URL shared by every city, with a placeholder for the city
# code. The percent-encoded segment is the (double-encoded) search keyword
# and is kept exactly as it appeared in the per-city URLs.
SEARCH_URL_TEMPLATE = (
    'https://search.51job.com/list/{city_code},000000,0000,00,9,99,'
    '%25E8%25BD%25AF%25E4%25BB%25B6%25E5%25B7%25A5%25E7%25A8%258B'
    ',2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
)

print("输入你的城市:")
# Strip surrounding whitespace so a stray space or newline does not make a
# valid city name fail the lookup.
city = input().strip()
if city not in CITY_CODES:
    # Stop here with a clear message; otherwise `url` would be left undefined
    # and the script would fail later with an unrelated NameError.
    raise SystemExit(
        '不支持的城市: ' + city + '，可选城市: ' + '、'.join(CITY_CODES))
url = SEARCH_URL_TEMPLATE.format(city_code=CITY_CODES[city])
# Send a browser-like User-Agent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29'
}

# Fetch the search-result page. The timeout keeps the script from hanging
# indefinitely on an unresponsive server, and raise_for_status() stops on an
# HTTP error instead of trying to parse an error page.
response = requests.get(
    url=url,
    headers=headers,
    timeout=10
)
response.raise_for_status()
print(response.text)

# The results are expected as JSON assigned to window.__SEARCH_RESULT__ and
# terminated by the closing </script> tag; capture the first such payload.
search_result = re.search(
    r'window\.__SEARCH_RESULT__ =(.*?)</script>',
    response.text)
if search_result is None:
    # Fail with an explicit message rather than an IndexError when the page
    # does not contain the expected data.
    raise SystemExit(
        '未在页面中找到 window.__SEARCH_RESULT__ 数据，页面结构可能已变化或请求被拦截')
html_data = search_result.group(1)
# Decode the captured JSON text into Python objects.
json_data = json.loads(html_data)

# 'engine_jds' holds the collection of job entries iterated when writing rows.
engine = json_data['engine_jds']
pprint.pprint(engine)
# Write one CSV row per job entry. The try/finally guarantees the output file
# is flushed and closed even if an entry is missing an expected key.
try:
    for job in engine:
        csv__.writerow({
            '职位名称': job['job_name'],
            # attribute_text is joined with spaces into a single cell, so it
            # must be an iterable of strings.
            '基本信息': ' '.join(job['attribute_text']),
            '公司名字': job['company_name'],
            '工作地点': job['workarea_text'],
            # NOTE(review): this column is filled from companyind_text and
            # '公司性质' from companytype_text -- confirm the labels match the
            # intended meaning of those fields.
            '公司类型': job['companyind_text'],
            '公司规模': job['companysize_text'],
            '公司性质': job['companytype_text'],
            '福利': job['jobwelf'],
            '工资': job['providesalary_text'],
            '信息发布时间': job['updatedate'],
            '职位详情页': job['job_href'],
        })
finally:
    f.close()

5，结果：