# -*- coding: utf-8 -*-
"""Scrape job-position listings for given companies from Zhilian (zhaopin.com)
and 51job, then persist the matched rows into a local MySQL table."""
import json
import re
from datetime import datetime

from mf_utils.core import BaseInitCore
from mf_utils.decorates import cls_catch_exception
from mf_utils.logger import Logger
from mf_utils.sql.mysql import MysqlHandle


class CompanyPosition(BaseInitCore):
    """Crawler that pages through job-site search results per company.

    Relies on ``html_downloader`` / ``html_parser`` provided by
    ``BaseInitCore`` (not visible here — presumably a requests wrapper and a
    BeautifulSoup wrapper; confirm against mf_utils).
    """

    def __init__(self):
        super(CompanyPosition, self).__init__()
        self.logger = Logger.file_logger()
        # NOTE(review): DB credentials are hard-coded; consider moving to config.
        self.mysql_handle = MysqlHandle(host='127.0.0.1', user="root",
                                        passwd='mysql', db='ligang',
                                        port=3306, charset='utf8')

    # Zhilian (zhaopin.com)
    def get_zhi_lian_position_list(self, company_name, start=0, res_lst=None):
        """Recursively collect all Zhilian search hits for *company_name*.

        :param company_name: exact company name used as the search keyword
        :param start: result offset (multiples of the 60-item page size)
        :param res_lst: accumulator list; created fresh when omitted
        :return: list of dicts with keys 'site', 'city', 'jobName'
        """
        # Guard: the original appended to the default None, which raised an
        # AttributeError (swallowed by the except) and returned None.
        if res_lst is None:
            res_lst = []
        try:
            url = 'https://fe-api.zhaopin.com/c/i/sou?' \
                  'start={start}&pageSize=60&cityId=489' \
                  '&kw={company_name}' \
                  '&kt=2'.format(start=start, company_name=company_name)
            res = self.html_downloader.download(url)
            # Parse the JSON payload once (the original decoded it twice).
            payload = json.loads(res.text).get('data')
            data_list = payload.get('results')
            total = int(payload.get('numFound'))
            # Derive the page we actually received from the echoed start offset;
            # floor division keeps Python 2 semantics under Python 3.
            current_page = int(re.findall(r'(?<=start=).*?(?=&)', res.url)[0]) // 60 + 1
            self.logger.debug('current_page-%s' % current_page)
            for data in data_list:
                position_info = dict()
                position_info['site'] = 'ZHI_LIAN'
                position_info['city'] = data.get('city').get('display')
                position_info['jobName'] = data.get('jobName')
                res_lst.append(position_info)
            start = current_page * 60
            # Recurse only while results remain. The original tested
            # (current_page - 1) * 60 < total, which always fetched one
            # extra, empty page past the end.
            if current_page * 60 < total:
                self.get_zhi_lian_position_list(company_name, start=start,
                                                res_lst=res_lst)
            return res_lst
        except Exception as e:
            # Best-effort: log and return whatever was gathered so far.
            self.logger.exception(e)
            return res_lst

    # 51job
    def get_five_one_position_list(self, company_name, page=1, res_lst=None):
        """Recursively collect all 51job search hits for *company_name*.

        :param company_name: exact company name used as the search keyword
        :param page: 1-based result page to fetch
        :param res_lst: accumulator list; created fresh when omitted
        :return: list of dicts with keys 'site', 'jobName', 'jobId', 'city'
        """
        if res_lst is None:  # same None-accumulator guard as above
            res_lst = []
        try:
            url = 'https://search.51job.com/list/000000,' \
                  '000000,0000,00,9,99,{company_name}' \
                  ',1,{page}.html'.format(company_name=company_name, page=page)
            print(url)
            res = self.html_downloader.download(url)
            soups = self.html_parser.parser(res.content)
            current_page = int(soups.find('div', class_='p_in')
                               .find('li', class_='on').text)
            total_page = int(re.findall(r'\d+',
                                        soups.find('div', class_='p_in')
                                        .find('span', class_='td').text)[0])
            self.logger.debug('current_page-%s' % current_page)
            # First div.el row is the table header — skip it.
            data_lst = soups.find('div', id='resultList').find_all('div', class_='el')[1:]
            for data in data_lst:
                position_info = dict()
                position_info['site'] = 'FIVE_ONE'
                position_info['jobName'] = data.find('a').get('title')
                position_info['jobId'] = data.find('input').get('value')
                try:
                    # Only the city is used; exp/degree/desc are discarded here.
                    city, exp, degree, desc = self.get_five_one_position_detail(
                        job_id=position_info['jobId'])
                except Exception as e:
                    # A failing detail page skips just this row.
                    self.logger.exception(e)
                    continue
                position_info['city'] = city
                res_lst.append(position_info)
            if current_page < total_page:
                page += 1
                self.get_five_one_position_list(company_name, page=page,
                                                res_lst=res_lst)
            return res_lst
        except Exception as e:
            self.logger.exception(e)
            return res_lst

    # Zhilian detail page
    @cls_catch_exception
    def get_zhi_lian_position_detail(self, job_id):
        """Fetch and return the plain-text job description for *job_id*."""
        url = 'https://jobs.zhaopin.com/{}.htm'.format(job_id)
        headers = {
            # Forces the legacy page layout that contains div.pos-ul.
            'Cookie': 'ZP_OLD_FLAG=false;'
        }
        res = self.html_downloader.download(url, headers=headers)
        self.logger.debug('get detail {}'.format(job_id))
        soups = self.html_parser.parser(res.content)
        position_desc = soups.find('div', class_='pos-ul').text.strip()
        return position_desc

    # 51job detail page
    @cls_catch_exception
    def get_five_one_position_detail(self, job_id):
        """Fetch a 51job detail page and return (city, exp, degree, description)."""
        url = 'https://jobs.51job.com/all/{}.html'.format(job_id)
        res = self.html_downloader.download(url)
        self.logger.debug('get detail {}'.format(job_id))
        # 51job serves GBK-encoded pages.
        soups = self.html_parser.gbk_parser(res.content)
        city, exp, degree = soups.find('p', class_='msg ltype').text.strip() \
            .replace(u' ', '').split('|')[:3]
        # Third field is sometimes the hiring headcount ("招N人"), not a degree.
        if u'招' in degree:
            degree = ''
        position_desc_lst = soups.find('div', class_='bmsg job_msg inbox') \
            .find_all('p', recursive=False)
        position_desc = ''.join(map(lambda x: x.text.strip(),
                                    position_desc_lst)).replace('\n', ' ')
        return city, exp, degree, position_desc


def main():
    """Crawl every configured 'SITE|company' task and insert rows into MySQL."""
    cp = CompanyPosition()
    cp.logger.info('start')
    positions = [
        'ZHI_LIAN|北京金钢兔网络科技有限公司',
        'ZHI_LIAN|北京鑫融嘉业信息科技有限公司',
    ]
    for task in positions:
        site, company_name = task.split("|")
        cp.logger.info('start_task:{}|{}'.format(site, company_name))
        if site == "ZHI_LIAN":
            res_lst = cp.get_zhi_lian_position_list(company_name, res_lst=[])
        elif site == "FIVE_ONE":
            res_lst = cp.get_five_one_position_list(company_name, res_lst=[])
        else:
            res_lst = []
        # Parameterized insert — values are bound by the driver, not formatted.
        sql = 'insert into ligang.lg_position_1(company_name,city, position, source,publis_time) values(%s,%s,%s,%s,%s)'
        for res in res_lst:
            print(res)
            data = (company_name, res.get('city'), res.get('jobName'), site,
                    datetime.now())
            cp.mysql_handle.save(sql=sql, data=data)
        print(json.dumps(res_lst, ensure_ascii=False, indent=4))
        cp.logger.info(
            'match position : {} ,TOTAL'
            '{}'.format(site, len(res_lst))
        )


if __name__ == "__main__":
    main()