# python100day - day 28 - 招聘网站数据的爬取 (scraping job-posting data from 51job)
import csv
import json
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from re import findall
from threading import Thread

import requests
import threadpool
def get_total_page(job):
    """
    Get the total number of result pages for a job-title search on 51job.

    :param job: job-title keyword inserted into the search URL
    :return: total page count as an int, or None when the request fails
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    # BUGFIX: the original URL contained '°reefrom=' — a mojibake where the
    # '&deg' prefix of '&degreefrom=' was decoded as the HTML entity for '°'.
    url = f'https://search.51job.com/list/090200,000000,0000,00,9,99,{job},2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    response = requests.get(url, headers=header)
    if response.status_code == 200:
        json_data = get_json_data(response.text)
        return int(json_data['total_page'])
    else:
        # Falls through returning None on failure, matching the original contract.
        print('请求失败!', response)
def get_json_data(data):
    """
    Extract the inline search-result JSON embedded in a 51job results page.

    The page stores its data as a JavaScript assignment of the form
    ``window.__SEARCH_RESULT__ = {...}`` inside a <script> tag.

    :param data: raw HTML source of the results page
    :return: the payload parsed into a Python dict
    """
    matches = findall(r'(?s)window.__SEARCH_RESULT__ = (.+?)</script>', data)
    return json.loads(matches[0])
def get_one_page_data(info: str):
    """
    Fetch one page of search results and push its postings onto the queue.

    :param info: combined task string of the form '<job>-<page>'
    :return: None (data is forwarded via analysis_data on success)
    """
    # rsplit keeps this working even when the job keyword itself contains '-'
    # (plain split('-') would raise ValueError on e.g. 'front-end-3').
    job, page = info.rsplit('-', 1)
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    # BUGFIX: restored '&degreefrom=' — the original had the mojibake '°reefrom='
    # (the '&deg' prefix had been decoded as the HTML entity for '°').
    url = f'https://search.51job.com/list/090200,000000,0000,00,9,99,{job},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    response = requests.get(url, headers=header)
    if response.status_code == 200:
        json_data = get_json_data(response.text)
        analysis_data(json_data)
    else:
        print('请求失败!', response)
def analysis_data(data):
    """
    Pick the fields of interest out of one page's parsed JSON and enqueue
    one dict per job posting onto the module-level ``queue``.

    :param data: dict parsed from the page's embedded JSON; must contain
                 the 'engine_search_result' list of postings
    :return: None (results are communicated through ``queue``)
    """
    # NOTE: removed the debug `print(data)` that dumped the full JSON payload
    # for every page fetched.
    for item in data['engine_search_result']:
        post = {
            'details_url': item.get('job_href'),
            'job_name': item.get('job_name'),
            'company_href': item.get('company_href'),
            'company_name': item.get('company_name'),
            'providesalary': item.get('providesalary_text'),
            'workarea_text': item.get('workarea_text'),
            'updatedate': item.get('updatedate'),
            'companytype_text': item.get('companytype_text'),
            'workyear': item.get('workyear'),
            'jobwelf': item.get('jobwelf_list'),
        }
        queue.put(post)
def get_all_data(job):
    """
    Fetch every result page for *job* concurrently, then signal completion.

    Runs up to 20 worker threads, each fetching one result page via
    get_one_page_data; once every page is done, puts the 'end' sentinel on
    ``queue`` so the saver thread knows to stop.

    :param job: job-title keyword
    :return: None
    """
    print('开始获取数据!')
    total_page = get_total_page(job)
    # Replaced the abandoned third-party `threadpool` package with the
    # stdlib ThreadPoolExecutor; the context manager waits for all tasks
    # (equivalent to the old pool.wait()).
    with ThreadPoolExecutor(max_workers=20) as pool:
        list(pool.map(get_one_page_data, [f'{job}-{page}' for page in range(1, total_page + 1)]))
    print('数据获取完成')
    queue.put('end')
def save_data(job):
    """
    Consumer thread body: drain ``queue`` and write rows to files/<job>.csv.

    The first dequeued post's keys become the CSV header; the loop stops
    when the 'end' sentinel (queued by get_all_data) is received.

    :param job: job-title keyword, used as the output file name
    :return: None
    """
    print('==============开始保存数据==================')
    first_post = queue.get()
    # Guard against an empty run where the sentinel arrives before any data;
    # the original would have crashed on 'end'.keys().
    if first_post == 'end':
        print('保存成功!')
        return
    # Explicit encoding: the data is Chinese text, and the platform-default
    # codec (e.g. GBK on Chinese Windows) is not guaranteed to handle it.
    with open(f'files/{job}.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, list(first_post.keys()))
        writer.writeheader()
        writer.writerow(first_post)
        while True:
            post = queue.get()
            if post == 'end':
                break  # BUGFIX: was `return`, which made the print below unreachable
            writer.writerow(post)
    print('保存成功!')
if __name__ == '__main__':
    # Shared hand-off channel: scraper threads put post dicts here,
    # the saver thread consumes them until the 'end' sentinel arrives.
    queue = Queue()
    job_name = '前端'
    saver = Thread(target=save_data, args=(job_name,))
    saver.start()
    # Fetching runs on the main thread while the saver drains the queue.
    get_all_data(job_name)