1.1 Get the maximum page number from the first page
first_tree = etree.HTML(response)
max_page = int(re.findall(r'\d+', first_tree.xpath('//a[text()="末页"]/@title')[0])[0])
print(max_page)  # maximum page number
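The title attribute of the 「末页」 (last page) link contains the total page count as digits, so one regex pull is enough. A minimal sketch of just the extraction step, using a hypothetical title string ('共18页'); the site's actual wording may differ:

import re

# Hypothetical attribute value for illustration only.
title = '共18页'
max_page = int(re.findall(r'\d+', title)[0])
print(max_page)  # 18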
1.2 Get the form data needed for pagination from the first page
Pagination is submitted via POST, so we use XPath to collect the value of every input that has a name attribute inside the page's form, in particular the two hidden inputs that carry the validation tokens (on ASP.NET WebForms pages these are typically __VIEWSTATE and __EVENTVALIDATION).
# Collect the form fields needed for pagination from the first page
input_dict = dict()
input_name_lists = first_tree.xpath('//form[@name="aspnetForm"]//input[@name]')
for input_name in input_name_lists:
    name = input_name.xpath('./@name')[0]
    value = input_name.xpath('./@value')[0] if len(input_name.xpath('./@value')) > 0 else ''
    input_dict[name] = value
print(input_dict)
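For orientation, the collected dict looks roughly like the sketch below. The token values shown are placeholders; the real ones are long opaque strings that ASP.NET regenerates for each page load:

# Illustrative shape only; actual keys and values come from the live page.
# {'__EVENTTARGET': '',
#  '__EVENTARGUMENT': '',
#  '__VIEWSTATE': '/wEPDwUK...(long opaque string)...',
#  '__EVENTVALIDATION': '/wEWBAK...(long opaque string)...',
#  'ctl00$Content$pager_input': '1'}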
1.3 Extract the detail-page links from the current list page and follow them
def handle_listpage(self, listpage_response):
    # Get the <tr> element for each posting on this page
    tr_lists = listpage_response.xpath('//th[text()="职位名称"]/../../tr')[1:]
    # Walk each row, extract the detail-page URL, and fetch it
    for tr in tr_lists:
        detail_url = tr.xpath('.//a/@href')[0]
        detail_url = '{}/{}'.format(self.base_url, detail_url)
        # Fetch the detail page
        detail_response = self.request_s.get(detail_url, headers=self.headers, timeout=60).content.decode()
        item = self.get_detail_info(detail_response=detail_response)  # hand the response to the parsing function
        self.all_info_lists.append(item)
    print('get_page_ok!')
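Joining base_url and the href with string formatting works here because the hrefs are simple relative paths; if the site ever returned absolute URLs or paths with a leading slash, urllib.parse.urljoin would be the safer choice. A minimal sketch, with a hypothetical href for illustration:

from urllib.parse import urljoin

base_url = 'http://hr.foxconn.com/R_Society'
# urljoin handles relative hrefs, leading slashes, and absolute URLs uniformly.
print(urljoin(base_url + '/', 'Job_Detail.aspx?id=1'))
# -> http://hr.foxconn.com/R_Society/Job_Detail.aspx?id=1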
1.4 Parse the detail page and store the fields in a dict
def get_detail_info(self, detail_response):
    item = dict()
    detail_tree = etree.HTML(detail_response)
    # Extract each field of the job posting
    item['job_name'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_name"]/text()')[0]
    item['job_type'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_type_name"]/text()')[0]
    item['publish_date'] = detail_tree.xpath('//span[@id="ctl00_Content_lblupdate_date"]/text()')[0]
    item['organ_name'] = detail_tree.xpath('//span[@id="ctl00_Content_lbldept_name"]/text()')[0]
    item['request_education'] = detail_tree.xpath('//span[@id="ctl00_Content_education_name"]/text()')[0]
    item['request_major'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_major"]/text()')[0]
    item['request_job_years'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_years"]/text()')[0]
    item['request_count'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_count"]/text()')[0]
    item['request_detail'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_promulgator"]/text()')[0]
    print('get_detail_page_info_ok!')
    return item
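Each xpath(...)[0] raises IndexError if a span happens to be missing on some posting. A small hedged helper, assuming a missing field should become an empty string rather than abort the crawl (the name first_or_empty is ours, not part of the original code):

def first_or_empty(tree, xpath_expr):
    # Return the first XPath match, or '' when the element is absent.
    results = tree.xpath(xpath_expr)
    return results[0] if results else ''

# Usage inside get_detail_info, e.g.:
# item['job_name'] = first_or_empty(detail_tree, '//span[@id="ctl00_Content_lbljob_name"]/text()')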
1.5 Save the data to a JSON file
with open('file_json\\fox_rec_info.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(self.all_info_lists, ensure_ascii=False, indent=2))
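Note that open() will not create the file_json directory on its own; creating it up front avoids a FileNotFoundError. A minimal sketch:

import os

# Create the output directory if it does not exist yet.
os.makedirs('file_json', exist_ok=True)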
1.6 Full code
import requests
import json
from lxml import etree
import re


class FoxRec(object):
    def __init__(self, base_url):
        self.base_url = base_url
        self.start_url = '{}/{}'.format(base_url, 'Job_listInfo.aspx')  # URL of the list page
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
        self.request_s = requests.session()
        self.all_info_lists = list()

    def get_detail_info(self, detail_response):
        item = dict()
        detail_tree = etree.HTML(detail_response)
        # Extract each field of the job posting
        item['job_name'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_name"]/text()')[0]
        item['job_type'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_type_name"]/text()')[0]
        item['publish_date'] = detail_tree.xpath('//span[@id="ctl00_Content_lblupdate_date"]/text()')[0]
        item['organ_name'] = detail_tree.xpath('//span[@id="ctl00_Content_lbldept_name"]/text()')[0]
        item['request_education'] = detail_tree.xpath('//span[@id="ctl00_Content_education_name"]/text()')[0]
        item['request_major'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_major"]/text()')[0]
        item['request_job_years'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_years"]/text()')[0]
        item['request_count'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_count"]/text()')[0]
        item['request_detail'] = detail_tree.xpath('//span[@id="ctl00_Content_lbljob_promulgator"]/text()')[0]
        print('get_detail_page_info_ok!')
        return item

    def handle_listpage(self, listpage_response):
        # Get the <tr> element for each posting on this page
        tr_lists = listpage_response.xpath('//th[text()="职位名称"]/../../tr')[1:]
        # Walk each row, extract the detail-page URL, and fetch it
        for tr in tr_lists:
            detail_url = tr.xpath('.//a/@href')[0]
            detail_url = '{}/{}'.format(self.base_url, detail_url)
            # Fetch the detail page
            detail_response = self.request_s.get(detail_url, headers=self.headers, timeout=60).content.decode()
            item = self.get_detail_info(detail_response=detail_response)  # hand the response to the parsing function
            self.all_info_lists.append(item)
        print('get_page_ok!')

    def handle_info(self, response):
        first_tree = etree.HTML(response)
        max_page = int(re.findall(r'\d+', first_tree.xpath('//a[text()="末页"]/@title')[0])[0])
        print(max_page)  # maximum page number
        # Scrape the detail pages reachable from page 1
        self.handle_listpage(listpage_response=first_tree)
        # Collect the form fields needed for pagination from the first page
        input_dict = dict()
        input_name_lists = first_tree.xpath('//form[@name="aspnetForm"]//input[@name]')
        for input_name in input_name_lists:
            name = input_name.xpath('./@name')[0]
            value = input_name.xpath('./@value')[0] if len(input_name.xpath('./@value')) > 0 else ''
            input_dict[name] = value
        print(input_dict)
        # Page through the rest (max_page + 1 so the last page is included)
        for i in range(2, max_page + 1):
            # input_dict['__EVENTARGUMENT'] = 2
            input_dict['ctl00$Content$pager_input'] = i
            other_page_response = self.request_s.post(self.start_url, data=input_dict, headers=self.headers, timeout=60).content.decode()
            other_page_tree = etree.HTML(other_page_response)
            self.handle_listpage(listpage_response=other_page_tree)

    def main(self):
        response = self.request_s.get(self.start_url, headers=self.headers, timeout=60).content.decode()
        self.handle_info(response=response)
        with open('file_json\\fox_rec_info.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.all_info_lists, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    fox = FoxRec('http://hr.foxconn.com/R_Society')
    fox.main()
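To sanity-check a run, the saved JSON can be loaded back and counted. A minimal sketch:

import json

# Load the scraped records and report how many postings were captured.
with open('file_json\\fox_rec_info.json', encoding='utf-8') as f:
    records = json.load(f)
print(len(records), 'postings saved')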