网址(首页):滑动验证页面
如图1所示:
查看所需数据是否存在于网页源代码中,发现需要的数据存在用javascript所写的脚本中,不可直接用原始URL获取数据,如下图所示:
如何寻找所需数据?
步骤一:如图所示,打开浏览器开发者工具的Network面板,先清除已有的请求记录,再重新加载网页
步骤二:如下图所示,逐个查看请求返回的内容是否为json格式(可复制到在线json解析工具中,查看其格式是否为“集合”),发现其中一个响应里包含了json,即我们所需要的数据
步骤三:如下图所示,用复制的链接打开网页,即为目标URL
获取含有所需数据的源代码
import json
import urllib.request,urllib.error #制定URL,获取网页数据
import re
def main():
    """Print ``job_name:company_name`` for each posting found in result.txt.

    Reads the saved search-result page from ``result.txt`` (UTF-8) in the
    current directory, extracts the JSON array that sits between the
    ``"engine_jds":`` and ``,"jobid_count"`` markers, prints that raw JSON
    text, then prints one ``job_name:company_name`` line per entry.

    If the markers are not present in the file, a message is printed and the
    function returns without raising.
    """
    # Search page whose HTML is expected to have been saved to result.txt
    # beforehand (e.g. from the output of askURL(url)). The query parameter is
    # "&degreefrom"; the original literal had it mangled into "°reefrom".
    url = "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="

    # Read the raw text. Matching against str(list_of_lines) would run the
    # regex over a repr in which every backslash is doubled, corrupting JSON
    # escapes such as "\/".
    with open('result.txt', 'r', encoding='utf-8') as result:
        page = result.read()

    # DOTALL lets the capture span line breaks in the saved file.
    data = re.findall(r"\"engine_jds\":(.+?),\"jobid_count\"", page, re.DOTALL)
    if not data:
        print('engine_jds data not found in result.txt')
        return

    print(data[0])
    jsonObj = json.loads(data[0])
    for item in jsonObj:
        print(f"{item['job_name']}:{item['company_name']}")
# TODO: crawl at timed intervals (pause between requests)
# TODO: send requests through a proxy
def askURL(url):
    """Fetch *url* with a browser User-Agent and return the page text.

    The response body is decoded as GBK and printed before being returned.

    Args:
        url: Address to request.

    Returns:
        The decoded page text, or an empty string when the request fails
        with ``urllib.error.URLError`` (which includes ``HTTPError``).
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("gbk")
        print(html)
    except urllib.error.URLError as e:
        # The exception lives at urllib.error.URLError; the former
        # "urllib.error.error.URLError" raised AttributeError on any failure.
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Run the scraper only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
print(data[0])的部分运行结果(中间的省略了)
[{"type":"engine_jds","jt":"0_0","tags":[],"ad_track":"","jobid":"138284904","coid":"6831150","effect":"1","is_special_job":"","job_href":"https:\\/\\/jobs.51job.com\\/beijing-cyq\\/138284904.html?s=sou_sou_soulb&t=0_0","job_name":"python工程师","job_title":"python工程师","company_href":"https:\\/\\/jobs.51job.com\\/all\\/coAmJQPARkBToDZwZgAWQ.html","company_name":"深圳市览众科技股份有限公司","providesalary_text":"0.6-1万\\/月","workarea":"010500","workarea_text":"北京-朝阳区","updatedate":"03-15","iscommunicate":"","companytype_text":"民营公司","degreefrom":"5","workyear":"3","issuedate":"2022-03-15 09:15:41","isFromXyz":"","isIntern":"","jobwelf":"五险一金 年终奖金 定期体检 员工旅游 交通补贴 餐饮补贴 专业培训 绩效奖金 股票期权 弹性工作","jobwelf_list":["五险一金","年终奖金","定期体检","员工旅游","交通补贴","餐饮补贴","专业培训","绩效奖金","股票期权","弹性工作"],"isdiffcity":"","attribute_text":["北京-朝阳区","1年经验","大专","招2人"],"companysize_text":"50-150人","companyind_text":"计算机软件","adid":""},{"type":"engine_jds","jt":"0_0","tags":[],"ad_track":"","jobid":"131628693","coid":"5866100","effect":"1","is_special_job":"","job_href":"https:\\/\\/jobs.51job.com\\/dongguan\\/131628693.html?s=sou_sou_soulb&t=0_0","job_name":"Python开发工程师(初级)","job_title":"Python开发工程师(初级)","company_href":"https:\\/\\/jobs.51job.com\\/all\\/co5866100.html","company_name":"广州市正成信息科技有限公司","providesalary_text":"5-8千\\/月","workarea":"030800","workarea_text":"东莞","updatedate":"03-15","iscommunicate":"","companytype_text":"民营公司","degreefrom":"6","workyear":"3","issuedate":"2022-03-15 09:11:19","isFromXyz":"","isIntern":"","jobwelf":"五险一金 补充医疗保险 员工旅游 专业培训 绩效奖金 
弹性工作","jobwelf_list":["五险一金","补充医疗保险","员工旅游","专业培训","绩效奖金","弹性工作"],"isdiffcity":"","attribute_text":["东莞","1年经验","本科","招若干人"],"companysize_text":"50-150人","companyind_text":"电子技术\\/半导体\\/集成电路","adid":""},{"type":"engine_jds","jt":"0_0","tags":{"type":"engine_jds","jt":"0_0","tags":[],"ad_track":"","jobid":"138822037","coid":"4970051","effect":"1","is_special_job":"","job_href":"https:\\/\\/jobs.51job.com\\/hangzhou-yhq\\/138822037.html?s=sou_sou_soulb&t=0_0","job_name":"后端Python开发工程师","job_title":"后端Python开发工程师","company_href":"https:\\/\\/jobs.51job.com\\/all\\/co4970051.html","company_name":"浙江禾川科技股份有限公司","providesalary_text":"1.4-2.2万\\/月","workarea":"080207","workarea_text":"杭州-余杭区","updatedate":"03-15","iscommunicate":"","companytype_text":"民营公司","degreefrom":"6","workyear":"4","issuedate":"2022-03-15 04:00:38","isFromXyz":"","isIntern":"","jobwelf":"","jobwelf_list":[""],"isdiffcity":"","attribute_text":["杭州-余杭区","2年经验","本科","招2人"],"companysize_text":"500-1000人","companyind_text":"仪器仪表\\/工业自动化","adid":""},{"type":"engine_jds","jt":"0_0","tags":[],"ad_track":"","jobid":"138817451","coid":"2486667","effect":"1","is_special_job":"","job_href":"https:\\/\\/jobs.51job.com\\/tianjin-xqq\\/138817451.html?s=sou_sou_soulb&t=0_0","job_name":"Python开发工程师","job_title":"Python开发工程师","company_href":"https:\\/\\/jobs.51job.com\\/all\\/co2486667.html","company_name":"北京国遥新天地信息技术股份有限公司","providesalary_text":"1-1.5万\\/月","workarea":"050800","workarea_text":"天津-西青区","updatedate":"03-15","iscommunicate":"","companytype_text":"民营公司","degreefrom":"5","workyear":"4","issuedate":"2022-03-15 04:00:38","isFromXyz":"","isIntern":"","jobwelf":"五险一金 年终奖金 员工旅游 定期体检","jobwelf_list":["五险一金","年终奖金","员工旅游","定期体检"],"isdiffcity":"","attribute_text":["天津-西青区","2年经验","大专","招1人"],"companysize_text":"500-1000人","companyind_text":"计算机软件","adid":""},(Python\\/GUI)","job_title":"软件工程师desal\/co2793941.html","company_na资","degreefrom":"6","workyear":"4","issuedate":"2022-03-15 
17:37:42","isFromXyz":"","isIntern":"","jobwelf":"五险一金 交通补贴 餐饮补贴 绩效奖金 年终奖金 定期体检 商业保险 生日礼金 过节费","jobwelf_list":["五险一金","交通补贴","餐饮补贴","绩效奖金","年终奖金","定期体检","商业保险","生日礼金","过节费"],"isdiffcity":"","attribute_text":["成都-高新区","2年经验","本科","招1人"],"companysize_text":"1000-5000人","companyind_text":"通信\\/电信\\/网络设备","adid":""}]
由于这样看json文件不方便,故使用网页json解析(网址为:JSON在线 | JSON解析格式化—SO JSON在线工具)
方式一:
方式二:点击json解析右边的json在线解析,更加清晰明了
在复制json时运用小技巧:
ctrl+A 全选
ctrl+C 复制
ctrl+V 粘贴
代码最终运行结果