1,使用的模块:
import codecs
import csv
import requests
import re
import json
import pprint
2,主要爬取内容:
'职位名称',
'基本信息',
'公司名字',
'工作地点',
'公司类型',
'公司规模',
'公司性质',
'福利',
'工资',
'信息发布时间',
'职位详情页',
3,URL 资源路径不是固定的:程序通过 if/elif 判断用户输入的城市,再选用该城市对应的搜索 URL,因此可以爬取不同城市的职位。
4,最终实现代码
import codecs
import csv
import requests
import re
import json
import pprint
# Open the output CSV in append mode so rows written by earlier runs are kept.
# 'utf-8-sig' puts a UTF-8 BOM at the start of a new file; newline='' hands
# line-ending control to the csv module, as its documentation requires.
f = open('前程无忧.csv', mode='a', encoding='utf-8-sig', newline='')

# DictWriter emits each row's values in the order of `fieldnames`.
csv__ = csv.DictWriter(
    f,
    fieldnames=[
        '职位名称',
        '基本信息',
        '公司名字',
        '工作地点',
        '公司类型',
        '公司规模',
        '公司性质',
        '福利',
        '工资',
        '信息发布时间',
        '职位详情页',
    ],
)

# An append-mode stream starts at end-of-file, so position 0 means the file is
# new or empty. Write the header only then; writing it unconditionally would
# insert a duplicate header row into the data on every later run.
if f.tell() == 0:
    csv__.writeheader()
# Supported cities mapped to the city-specific code that forms the first
# comma-separated field of the search path (presumably 51job's area code —
# the values are taken unchanged from the original per-city URLs).
_CITY_CODES = {
    '成都': '090200',
    '北京': '010000',
    '上海': '020000',
    '广州': '030200',
    '深圳': '040000',
    '武汉': '180200',
    '西安': '200200',
    '杭州': '080200',
    '南京': '070200',
    '重庆': '060000',
    '东莞': '030800',
    '大连': '230300',
    '沈阳': '230200',
    '苏州': '070300',
}

# Everything except the city code is identical for all cities.
_URL_HEAD = 'https://search.51job.com/list/'
_URL_TAIL = (
    ',000000,0000,00,9,99,'
    # Double percent-encoded search keyword "软件工程".
    '%25E8%25BD%25AF%25E4%25BB%25B6%25E5%25B7%25A5%25E7%25A8%258B'
    # The original literals read "cotype=99°reefrom=99": the "&deg" of
    # "&degreefrom" had been decoded as the HTML entity for the degree sign,
    # which dropped the degreefrom parameter. Restored here.
    ',2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
)

print("输入你的城市:")
# Named `city` rather than `str` so the builtin is not shadowed; surrounding
# whitespace is dropped so " 成都 " still matches.
city = input().strip()

# Stop with a clear message for an unsupported city; previously `url` was left
# undefined and the script died later with a NameError.
if city not in _CITY_CODES:
    raise SystemExit(
        '不支持的城市: ' + city + ',可选城市: ' + '、'.join(_CITY_CODES)
    )

url = _URL_HEAD + _CITY_CODES[city] + _URL_TAIL
# Send a desktop-browser User-Agent with the request instead of the default
# one used by the requests library.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29'
}

# Fetch the search-result page. requests has no default timeout, so without
# one an unresponsive server would block the script indefinitely.
response = requests.get(url=url, headers=headers, timeout=10)
# Fail on 4xx/5xx here rather than trying to parse an error page.
response.raise_for_status()
print(response.text)

# The page embeds its results as JSON assigned to window.__SEARCH_RESULT__
# inside a <script> tag; capture everything up to the closing tag.
found = re.search(
    r'window\.__SEARCH_RESULT__ =(.*?)</script>',
    response.text,
)
if found is None:
    # The marker is absent (for example a different page was served); report
    # that explicitly instead of failing with a bare IndexError.
    raise SystemExit('页面中未找到 window.__SEARCH_RESULT__ 数据,无法解析')
html_data = found.group(1)

# Decode the captured JSON text into Python objects.
json_data = json.loads(html_data)

# 'engine_jds' holds the job records that are written to the CSV.
engine = json_data['engine_jds']
pprint.pprint(engine)
# CSV column name -> key holding that value in each job record.
# NOTE(review): '公司类型' reads 'companyind_text' and '公司性质' reads
# 'companytype_text'; kept exactly as in the original — confirm this pairing
# is intended.
_COLUMN_KEYS = {
    '职位名称': 'job_name',
    '公司名字': 'company_name',
    '工作地点': 'workarea_text',
    '公司类型': 'companyind_text',
    '公司规模': 'companysize_text',
    '公司性质': 'companytype_text',
    '福利': 'jobwelf',
    '工资': 'providesalary_text',
    '信息发布时间': 'updatedate',
    '职位详情页': 'job_href',
}

try:
    for job in engine:
        # Copy the plain fields straight across; a missing key still raises
        # KeyError, as in the original.
        dit = {column: job[key] for column, key in _COLUMN_KEYS.items()}
        # 'attribute_text' is a sequence of strings; flatten it into one cell.
        dit['基本信息'] = ' '.join(job['attribute_text'])
        # DictWriter orders the values by its own fieldnames, so the key order
        # of `dit` does not matter.
        csv__.writerow(dit)
finally:
    # Close the CSV whether or not every row was written; the file handle was
    # previously never closed.
    f.close()
5,结果: