文章目录
一、准备
1.数据
http://www.job.cn/job/list/35_47-0-0-0_0_0_0_0_0_0_0-0-0-0-1.html
2.数据字段
职位名称 薪资 要求学历 职位描述 公司名称 公司规模 发布日期
二、数据爬取
1.设置预设
headers = { 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Host': 'www.job.cn',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
'Cache - Control': 'no-cache'
}
2.观察网页源码(部分)
岗位列表页
<div class="yunjoblist_newname"><a href="http://www.job.cn/job/1514227.html" class="yunjoblist_newname_a" target="_blank" title="系统需求工程师">系统需求工程师</a> </div>
岗位详情页
<div class="job_details_topleft">
<h1 class="job_details_name">软件工程师</h1>
<div class="job_details_salary">
<span class="job_details_salary_n">¥4500-6000</span> /月
</div>
<div class="job_details_info">
上海-上海
<span class="job_details_line">|</span>
不限经验
<span class="job_details_line">|</span>
大专学历
</div>
<div class="job_details_welfare ">
<span class="job_details_welfare_n ">91</span>
</div>
</div>
<div class="job_details_topright">
<div class="job_details_topright_data">
<span class="job_details_topright_data_time">2019-03-29 更新</span>
被浏览:<script src="http://www.job.cn/job/c_comapply-a_GetHits-id_1513941.html"></script>5 次
</div>
<div class="job_details_top_operation">
<a href="javascript:void(0);" onclick="showlogin('1');" class="job_details_top_operation_sc" rel="nofollow">收藏</a>
<a onclick="applyjobuid();" href="javascript:;" class="job_details_top_operation_sq">申请职位</a>
</div>
<div class="job_details_top_extension">
<div class="job_details_top_extension_zl">
<a href="javascript:void(0);" onclick="showlogin('1');" class="job_details_top_extension_tj" rel="nofollow">推荐</a>
</div>
<div class="job_details_top_extension_zl ">
<a href="javascript:void(0)" class="job_details_top_extension_jb" onclick="showlogin('1');" rel="nofollow">举报</a>
</div>
<div class="job_details_top_extension_zl">
<a href="javascript:void(0);" onmousemove="$('#getwapurl').show();" onmouseout="$('#getwapurl').hide();" class="job_details_top_extension_fx" rel="nofollow">分享</a>
<div class="comapply_sq_r_cy none" id="getwapurl">
<div class="comapply_sq_r_cont">
<div class="comapply_sq_r_tipa">微信扫一扫:分享</div>
<img src="http://www.job.cn/index.php?m=ajax&c=pubqrcode&toc=job&toa=view&toid=1513941" width="130" height="130">
<div class="comapply_sq_r_tipsm">
↑微信扫上方二维码↑<br>便可将本文分享至朋友圈
</div>
</div>
</div>
</div>
</div>
</div>
3.制定正则规则
# 职位名称
reg = '(?<=<h1 class="job_details_name">)\w*(?=</h1>)'
joblist = re.findall(reg,page_code)
# 薪资
req = '(?<=<span class="job_details_salary_n">¥)[\d\w]*[\|\-\/\~][\d\w]*(?=</span>)'
salarylist = re.findall(req,page_code)
# 要求学历
reg = '\w*学历(?=[[\r\n])'
degreelist = re.findall(reg,page_code)
# 职位描述
reg = '(<span class="job_details_describe_yq">婚况要求:不限</span>[\r\n\t]*)(.*)(?=[\r\n]*)'
detail = re.search(reg,page_code).group(2)
detaillist = list()
detaillist.append(detail)
# 公司名称
req = '(<a href="http://www.job.cn/company/c_show-id_\d*.html" target="_blank" class="">)(.*有限公司)</a>'
try:
company = re.search(req,page_code).group(2)
companylist = list()
companylist.append(company)
except:
companylist = []
# 公司规模
req = '(<span class="Compply_right_span_c"><i class="Compply_right_icon Compply_right_icon_rs"></i>)(\d*-\d*)人(?=</span>)'
try:
size = re.search(req,page_code).group(2)
sizelist = list()
sizelist.append(size)
except:
sizelist = []
# 发布日期
req = '(<span class="job_details_topright_data_time">)(\d*-\d*-\d*)(?= 更新</span>)'
publish = re.search(req,page_code).group(2)
publishlist = list()
publishlist.append(publish)
三、数据存取
存为JSON文件
将多个字段列表进行打包,并放入一个list中
return list(zip(joblist,salarylist,degreelist,detaillist,companylist,sizelist,publishlist))
调用json.dumps()方法,将传入数据转化为json格式数据
datalist = []
for data in datas:
for d in data:
datalist.append(
{'岗位名称': d[0], '薪资': d[1], '学历': d[2], '岗位要求': d[3], '公司名称': d[4], '公司规模': d[5],
'发布日期': d[6]})
使用with…as语句,将转换后的数据写入json文件中
with open(filename,'w',encoding='utf-8') as file_object:
file_object.write(json.dumps(datalist,ensure_ascii=False))
最终的数据呈现如下(部分)
[
{
"岗位名称": "软件工程师",
"薪资": "5000-8000",
"学历": "不限学历",
"岗位要求": "岗位职责:<br>1、完成软件系统代码的实现,编写代码注释和开发文档;<br>2、辅助进行系统的功能定义,程序设计;<br>3、根据设计文档或需求说明完成代码编写,调试,测试和维护;<br>4、分析并解决软件开发过程中的问题;<br>5、配合项目经理完成相关任务目标。<br>任职资格:<br>1.熟悉PHP基本框架<br>2.对服务器维护有一定了解,能够熟练操作Linux<br>3.了解服务器基础配置,如OSS,CDN,申请SSL证书,管理维护域名,解析等<br>3、计算机或相关专业大专学历以上,18周岁以上(条件优秀者可放宽要求);<br>4、熟悉面向对象思想,精通编程,调试和相关技术;<br>5、熟悉应用服务器的安装、调试、配置及使用;<br>6、具备需求分析和系统设计能力,以及较强的逻辑分析和独立解决问题能力;<br>7、能熟练阅读中文、英文技术文档;富有团队精神,责任感和沟通能力.",
"公司名称": "贵州星乐互动科技有限公司",
"公司规模": "10-50",
"发布日期": "2019-05-23"
}
]
四、完整代码
# -*- coding:utf-8 -*-
# Created by ZhaoWen on 2020/10/15
import requests
import json
import re
import time
#测试代理ip是否可用
def ip_test(headers, proxies_ip):
    """Return True if the given proxy can reach httpbin within 5 seconds.

    headers: HTTP headers to send with the probe request (dict).
    proxies_ip: requests-style proxies mapping, e.g. {'http': '1.2.3.4:8080'}.
    """
    url = 'http://httpbin.org/get'
    try:
        # BUG FIX: the original clobbered the `headers` parameter with {}
        # and then passed headers/proxies POSITIONALLY.  requests.get's
        # signature is get(url, params=None, **kwargs), so three positional
        # arguments raise TypeError and the proxy was never exercised.
        rep = requests.get(url, headers=headers, proxies=proxies_ip, timeout=5)
    except requests.RequestException:
        # An unreachable or slow proxy raises (Timeout/ConnectionError)
        # instead of returning a status code -- report it as unusable.
        return False
    return rep.status_code == 200
def ip_get():
    """Load proxy candidates from 89ip.json and return the usable ones.

    Each JSON record carries the fields 'IP地址' (address) and '端口' (port);
    they are joined into "ip:port" strings and probed with ip_test().

    Raises Exception when no candidate passes the connectivity test.
    """
    with open('89ip.json', 'r', encoding='utf-8') as file_object:
        json_data = json.load(file_object)
    candidates = [data['IP地址'] + ':' + data['端口'] for data in json_data]
    # BUG FIX: the original called ip_get(i) here -- infinite recursion with
    # a bogus argument.  It clearly meant to probe each proxy via ip_test.
    usable = [addr for addr in candidates if ip_test({}, {'http': addr})]
    # BUG FIX: a list comprehension is never None, so the original
    # `if ip != None` was always true and the error branch unreachable;
    # test for emptiness instead.
    if usable:
        return usable
    raise Exception('没有可以使用的代理ip')
# 将页面写入文件
# 将页面写入文件
def save_file(style, filename, datas):
    """Write scraped job data to disk.

    style: 'txt' dumps str(datas) to <filename>.txt; 'json' flattens the
           page/record nesting into a list of dicts and writes UTF-8 JSON
           to <filename>.json.  Any other style is a silent no-op, matching
           the original behaviour.
    filename: target path without extension.
    datas: for 'json', a list of pages, each a list of 7-tuples in the
           field order produced by china_job.get_info().
    """
    if style == 'txt':
        with open(filename + '.txt', 'w', encoding='utf-8') as file_object:
            file_object.write(str(datas))
    if style == 'json':
        # Flatten two levels of nesting: one dict per job record.
        datalist = [
            {'岗位名称': d[0], '薪资': d[1], '学历': d[2], '岗位要求': d[3],
             '公司名称': d[4], '公司规模': d[5], '发布日期': d[6]}
            for data in datas for d in data
        ]
        # BUG FIX: removed the leftover debug print that dumped the entire
        # dataset to stdout on every save.
        with open(filename + '.' + style, 'w', encoding='utf-8') as file_object:
            # ensure_ascii=False keeps the Chinese text human-readable.
            file_object.write(json.dumps(datalist, ensure_ascii=False))
class china_job():
    """Scraper for job postings on www.job.cn.

    china_job(listing_url, headers).run() downloads the listing page,
    follows every job-detail link found on it and returns one entry per
    detail page.  Each entry is the list of
    (职位名称, 薪资, 学历, 职位描述, 公司名称, 公司规模, 发布日期)
    tuples produced by get_info().
    """

    def __init__(self, url, headers):
        # Listing-page URL and the HTTP headers to send with every request.
        self.url = url
        self.headers = headers

    def run(self):
        """Crawl the listing page and scrape every linked detail page."""
        session = requests.session()
        # BUG FIX: the original read the module-level globals `url` and
        # `headers`, silently ignoring the values passed to __init__.
        rep = session.get(url=self.url, headers=self.headers)
        rep.encoding = rep.apparent_encoding
        linklist = re.findall(r'http://www\.job\.cn/job/\d*\.html', rep.text)
        infolist = list()
        for link in linklist:
            time.sleep(2)  # throttle: at most one detail page every 2 s
            print('开始爬取新页面~' + link)
            infolist.append(self.get_info(link))
        return infolist

    def get_info(self, link):
        """Extract the seven data fields from one job-detail page.

        Returns a list of 7-tuples (normally one).  A field that cannot be
        found yields an empty list, and the final zip() then drops the
        whole record rather than emitting a partial one.
        """
        # BUG FIX: use self.headers instead of the module-level global.
        page_code = self.get_page(link, self.headers)
        # 职位名称
        reg = r'(?<=<h1 class="job_details_name">)\w*(?=</h1>)'
        joblist = re.findall(reg, page_code)
        # 薪资 -- the two numbers may be separated by |, -, / or ~
        reg = r'(?<=<span class="job_details_salary_n">¥)[\d\w]*[\|\-\/\~][\d\w]*(?=</span>)'
        salarylist = re.findall(reg, page_code)
        # 要求学历  (BUG FIX: the original lookahead had a stray '[' --
        # "[[\r\n]" also matched a literal '[', which was never intended)
        reg = r'\w*学历(?=[\r\n])'
        degreelist = re.findall(reg, page_code)
        # 职位描述 -- the description text follows this fixed marker span
        reg = r'(<span class="job_details_describe_yq">婚况要求:不限</span>[\r\n\t]*)(.*)(?=[\r\n]*)'
        try:
            detaillist = [re.search(reg, page_code).group(2)]
        except AttributeError:
            # BUG FIX: the original crashed when the marker was absent;
            # treat it like the other optional fields below.
            detaillist = []
        # 公司名称
        reg = r'(<a href="http://www.job.cn/company/c_show-id_\d*.html" target="_blank" class="">)(.*有限公司)</a>'
        try:
            # re.search returns None on a miss, so .group raises AttributeError.
            companylist = [re.search(reg, page_code).group(2)]
        except AttributeError:
            companylist = []
        # 公司规模
        reg = r'(<span class="Compply_right_span_c"><i class="Compply_right_icon Compply_right_icon_rs"></i>)(\d*-\d*)人(?=</span>)'
        try:
            sizelist = [re.search(reg, page_code).group(2)]
        except AttributeError:
            sizelist = []
        # 发布日期
        reg = r'(<span class="job_details_topright_data_time">)(\d*-\d*-\d*)(?= 更新</span>)'
        try:
            publishlist = [re.search(reg, page_code).group(2)]
        except AttributeError:
            # BUG FIX: was unguarded in the original and crashed on a miss.
            publishlist = []
        # zip truncates to the shortest list, so a record is only emitted
        # when every field was found.
        return list(zip(joblist, salarylist, degreelist, detaillist,
                        companylist, sizelist, publishlist))

    def get_content(self):
        """Read back the locally cached page dump (china_job.txt)."""
        with open('china_job.txt', 'r', encoding='utf-8') as file_object:
            return file_object.read()

    # 获取页面信息
    def get_page(self, url, headers):
        """Download one page; return its text, or an error string on failure."""
        session = requests.session()
        rep = session.get(url=url, headers=headers)
        rep.encoding = rep.apparent_encoding
        if rep.status_code == 200:
            return rep.text
        return '没有抓到内容哦!' + str(rep.url) + str(rep.content)
if __name__ == '__main__':
    # Listing page for the target job category; individual postings look
    # like http://www.job.cn/job/1513320.html
    url = 'http://www.job.cn/job/list/35_47-0-0-0_0_0_0_0_0_0_0-0-0-0-1.html'
    # BUG FIX: removed the malformed duplicate entry 'Cache - Control':
    # HTTP header names may not contain spaces (recent requests versions
    # raise InvalidHeader for it), and 'Cache-Control' is already set above.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.job.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
    }
    result = china_job(url, headers).run()
    save_file('json', 'china_job', result)
五、问题解决
中文写入json文件,输出格式为Unicode编码格式
如
{
"title": "\u3010Python\u3011\u8bf7\u95ee\u53bb\u54ea\u91cc\u4e0b\u8f7drequests\u7684\u5b89\u88c5\u5e93\uff1f",
"time": "2020-10-15 14:15:02",
"author": "\u541b_GV14Do",
"url": "https://www.lmonkey.com/ask/23002"
},
解决办法为
json.dumps(datalist,ensure_ascii=False)
在调用dumps()方法时,给参数ensure_ascii传入False参数
六、参考文档
使用Requests+正则表达式爬取89免费代理网站代理ip和端口信息,并保存为JSON格式