# 数据分析岗位爬取 (添加 Excel 文档) — data-analysis job scraper (with Excel export)
import requests
from re import findall
from json import loads
import time
import os
import openpyxl
# Request headers passed to requests.get in get_one_page; the only entry is a
# desktop Chrome User-Agent string.
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
}
def get_one_page(page):
    """Fetch one page of 51job search results for the keyword '数据分析'.

    The result data is embedded in the HTML as a JavaScript assignment
    (``window.__SEARCH_RESULT__ = {...}``); it is extracted with a regular
    expression and decoded as JSON.

    Args:
        page: Page number substituted into the search URL (the caller in
            this file starts at 1).

    Returns:
        The value stored under ``engine_search_result`` in the embedded JSON
        (presumably a list of job dicts -- verify against a live response),
        an empty list if that key is absent, or ``None`` when the request
        fails or the embedded JSON cannot be located.
    """
    # NOTE: the query string previously contained '°reefrom=99', which is
    # '&degreefrom=99' with '&deg' mis-decoded as an HTML entity.
    url = f'https://search.51job.com/list/000000,000000,0000,00,9,99,数据分析,2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    try:
        # A timeout keeps the crawl from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        print('请求失败!')
        return None

    if response.status_code != 200:
        print('请求失败!')
        return None

    matches = findall(
        r'window\.__SEARCH_RESULT__\s*=\s*(\{.+?\})</script>', response.text
    )
    if not matches:
        # The page came back without the embedded data (e.g. a verification
        # page); treat it like a failed request instead of raising IndexError.
        print('未找到搜索结果数据!')
        return None

    return loads(matches[0]).get('engine_search_result', [])
def get_all_data():
    """Crawl result pages one by one and save each page as it arrives.

    Starts at page 1 and keeps requesting the next page until
    ``get_one_page`` returns a falsy value (``None`` or an empty result),
    pausing one second between pages. Each non-empty page is handed to
    ``save_page_data``.
    """
    page = 1
    while True:
        result = get_one_page(page)
        if not result:
            print('没有更多数据')
            break
        save_page_data(result)
        # Report the page that was actually fetched, then advance; bumping
        # the counter first made the message name the *next* page.
        print(f'获取第{page}页数据成功!')
        page += 1
        time.sleep(1)
def get_work_book():
    """Open (or create) the output workbook and its '数据分析' worksheet.

    If 'files/招聘信息.xlsx' exists it is loaded, otherwise a new workbook is
    created in memory. When the workbook has no '数据分析' sheet, one is
    created and its first row is filled with the column titles.

    Returns:
        A ``(workbook, worksheet)`` tuple.
    """
    workbook_path = 'files/招聘信息.xlsx'
    sheet_name = '数据分析'

    if not os.path.exists(workbook_path):
        wb = openpyxl.Workbook()
    else:
        wb = openpyxl.load_workbook(workbook_path)

    # Existing sheet: reuse it as-is, the header row is already in place.
    if sheet_name in wb.sheetnames:
        return wb, wb[sheet_name]

    # Fresh sheet: write the header row before returning it.
    sheet = wb.create_sheet(sheet_name)
    header = ['岗位名称', '薪资', '公司名称', '公司性质', '公司地址', '要求', '福利']
    for column, title in enumerate(header, start=1):
        sheet.cell(1, column).value = title
    return wb, sheet
def save_page_data(data: list):
    """Append one page of job records to the worksheet and save the workbook.

    Uses the module-level ``sheet`` and ``wb`` objects, which are assigned in
    the ``__main__`` block from ``get_work_book()``. Rows are written
    starting directly below the sheet's current last row, one job per row,
    and each job dict is echoed to stdout.

    Args:
        data: Job dicts for one result page. Missing keys fall back to an
            empty string, or to five '-' placeholders for 'attribute_text'.
    """
    output_path = 'files/招聘信息.xlsx'
    first_row = sheet.max_row + 1
    for row, job in enumerate(data, start=first_row):
        job_info = [
            job.get('job_name', ''),
            job.get('providesalary_text', ''),
            job.get('company_name', ''),
            job.get('companytype_text', ''),
            job.get('workarea_text', ''),
            '/'.join(job.get('attribute_text', ['-', '-', '-', '-', '-'])),
            job.get('jobwelf', ''),
        ]
        for col, value in enumerate(job_info, start=1):
            sheet.cell(row, col).value = value
        print(job)

    # On a first run the 'files' directory may not exist yet; saving into a
    # missing directory fails, so create it before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    wb.save(output_path)
if __name__ == '__main__':
    # wb and sheet are bound at module level on purpose: save_page_data reads
    # them as globals rather than taking them as parameters.
    wb, sheet = get_work_book()
    get_all_data()