#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: ligang
@contact: a289237642@163.com
@software: PyCharm IDEA
@file: company_position_count.py
@create at: 2018-09-04 10:40

Worker that pops 'SITE|company_name' tasks from a Redis list, crawls the
company's job postings from zhaopin.com (ZHI_LIAN) or 51job.com (FIVE_ONE),
and inserts the matched positions into MySQL.
"""
import json
import re

import gevent
from datetime import datetime

from mf_utils.decorates import cls_catch_exception
from mf_utils.logger import Logger
from mf_utils.core import BaseInitCore
from mf_utils.sql.redis_m import get_redis_client
from mf_utils.sql.mysql import MysqlHandle
from conf import settings
from gevent import monkey

monkey.patch_all()

redis_client = get_redis_client(
    host=settings.REDIS_HOST,
    port=settings.REDIS_PORT,
    db=settings.REDIS_DB,
    password=settings.REDIS_PASSWORD
)


class CompanyPosition(BaseInitCore):
    """Spider collecting one company's job postings from ZhiLian / 51job."""

    def __init__(self):
        super(CompanyPosition, self).__init__()
        self.logger = Logger.file_logger()
        # NOTE(review): MySQL credentials are hard-coded here while the Redis
        # ones come from conf.settings — consider moving these there too.
        self.mysql_handle = MysqlHandle(host='127.0.0.1', user="root",
                                        passwd='mysql', db='ligang',
                                        port=3306, charset='utf8')

    # ZhiLian (zhaopin.com)
    def get_zhi_lian_position_list(self, company_name, start=0, res_lst=None):
        """Recursively fetch every ZhiLian posting matching ``company_name``.

        :param company_name: company keyword (searched with kt=2, company mode)
        :param start: result offset, a multiple of the 60-item page size
        :param res_lst: accumulator list carried through the recursion
        :return: list of dicts with keys site/city/jobName; may be partial
                 when an exception interrupts the crawl (it is logged).
        """
        # Fixed: the old code did res_lst.append(...) with the default None,
        # which raised AttributeError unless the caller always passed a list.
        if res_lst is None:
            res_lst = []
        try:
            url = 'https://fe-api.zhaopin.com/c/i/sou?' \
                  'start={start}&pageSize=60&cityId=489' \
                  '&kw={company_name}' \
                  '&kt=2'.format(start=start, company_name=company_name)
            self.logger.debug(url)  # was a py2-only `print` statement
            res = self.html_downloader.download(url)
            payload = json.loads(res.text).get('data')  # parse once, not twice
            data_lst = payload.get('results')
            total = int(payload.get('numFound'))
            # Recover the page index from the "start" parameter echoed in the
            # final URL; `//` keeps floor division under py2 and py3 alike.
            current_page = int(re.findall('(?<=start=).*?(?=&)', res.url)[0]) // 60 + 1
            self.logger.debug('current_page-%s' % current_page)
            for item in data_lst:
                position_info = dict()
                position_info['site'] = 'ZHI_LIAN'
                position_info['city'] = item.get('city').get('display')
                position_info['jobName'] = item.get('jobName')
                res_lst.append(position_info)
            start = current_page * 60
            # Fixed off-by-one: `(current_page - 1) * 60 < total` always
            # requested one extra, empty page past the last result.
            if current_page * 60 < total:
                self.get_zhi_lian_position_list(company_name, start=start,
                                                res_lst=res_lst)
            return res_lst
        except Exception as e:
            # best-effort: log and return whatever was collected so far
            self.logger.exception(e)
            return res_lst

    # 51job
    def get_five_one_position_list(self, company_name, page=1, res_lst=None):
        """Recursively fetch every 51job posting matching ``company_name``.

        :param company_name: company keyword
        :param page: 1-based search-result page number
        :param res_lst: accumulator list carried through the recursion.
                        Fixed: was a mutable default ``[]`` shared across
                        calls, so results leaked between tasks.
        :return: list of dicts with keys site/jobName/jobId/city
        """
        if res_lst is None:
            res_lst = []
        try:
            url = 'https://search.51job.com/list/000000,' \
                  '000000,0000,00,9,99,{company_name}' \
                  ',1,{page}.html'.format(company_name=company_name, page=page)
            self.logger.debug(url)  # was a py2-only `print` statement
            res = self.html_downloader.download(url)
            soups = self.html_parser.gbk_parser(res.content)
            pager = soups.find('div', class_='p_in')
            current_page = int(pager.find('li', class_='on').text)
            total_page = int(re.findall(r'\d+', pager.find('span', class_='td').text)[0])
            self.logger.debug('current_page-%s' % current_page)
            # the first .el row is the table header — skip it
            data_lst = soups.find('div', id='resultList').find_all('div', class_='el')[1:]
            for row in data_lst:
                position_info = dict()
                position_info['site'] = 'FIVE_ONE'
                position_info['jobName'] = row.find('a').get('title')
                position_info['jobId'] = row.find('input').get('value')
                try:
                    city, exp, degree, desc = self.get_five_one_position_detail(
                        job_id=position_info['jobId'])
                except Exception as e:
                    # detail page failed: skip this posting, keep crawling
                    self.logger.exception(e)
                    continue
                position_info['city'] = city
                res_lst.append(position_info)
            if current_page < total_page:
                self.get_five_one_position_list(company_name, page=page + 1,
                                                res_lst=res_lst)
            return res_lst
        except Exception as e:
            # best-effort: log and return whatever was collected so far
            self.logger.exception(e)
            return res_lst

    @cls_catch_exception
    def get_zhi_lian_position_detail(self, job_id):
        """Return the stripped description text of one ZhiLian posting."""
        url = 'https://jobs.zhaopin.com/{}.htm'.format(job_id)
        headers = {
            # presumably forces the legacy page layout this parser expects —
            # TODO confirm against the live site
            'Cookie': 'ZP_OLD_FLAG=false;'
        }
        res = self.html_downloader.download(url, headers)
        self.logger.debug('get detail {}'.format(job_id))
        soups = self.html_parser.parser(res.content)
        position_desc = soups.find('div', class_='pos-ul').text.strip()
        return position_desc

    @cls_catch_exception
    def get_five_one_position_detail(self, job_id):
        """Return (city, experience, degree, description) for one 51job posting."""
        url = 'https://jobs.51job.com/all/{}.html'.format(job_id)
        res = self.html_downloader.download(url)
        self.logger.debug('get detail {}'.format(job_id))
        soups = self.html_parser.gbk_parser(res.content)
        # header line looks like "city | experience | degree | ..." — keep
        # only the first three fields
        city, exp, degree = soups.find('p', class_='msg ltype').text.strip().replace(u' ', '').split('|')[:3]
        if u'招' in degree:
            # third field is a headcount ("招N人") when no degree is listed
            degree = ''
        position_desc_lst = soups.find('div', class_='bmsg job_msg inbox').find_all('p', recursive=False)
        position_desc = ''.join(map(lambda x: x.text.strip(), position_desc_lst)).replace('\n', ' ')
        return city, exp, degree, position_desc


def main():
    """Worker loop: pop 'SITE|company_name' tasks from Redis forever."""
    key = 'lg'
    cp = CompanyPosition()
    cp.logger.info('start company position search. redis_queue: {}'.format(key))
    while True:
        try:
            # blpop blocks until a task arrives; [1] drops the key name
            task = redis_client.blpop(key)[1]
            site, company_name = task.split('|')
            cp.logger.info('start_task:{}|{}'.format(site, company_name))
            if site == "ZHI_LIAN":
                res_lst = cp.get_zhi_lian_position_list(company_name, res_lst=[])
            elif site == "FIVE_ONE":
                res_lst = cp.get_five_one_position_list(company_name, res_lst=[])
            else:
                res_lst = []
            # parameterized insert — values are bound separately from the SQL
            sql = 'insert into ligang.lg_position_2(company_name,city, position, source,publis_time) values(%s,%s,%s,%s,%s)'
            for res in res_lst:
                data = (company_name, res.get('city'), res.get('jobName'),
                        site, datetime.now())
                cp.mysql_handle.save(sql, data=data)
            cp.logger.info('match position : {},TOTAL'
                           '{}'.format(site, len(res_lst)))
        except Exception as e:
            # never let one bad task kill the worker loop
            cp.logger.exception(e)


if __name__ == '__main__':
    gevent.joinall([gevent.spawn(main)
                    for i in range(settings.COROUTINE_NUM)])
Redis queue seeding script (pushes the tasks consumed by company_position_count.py):
# encoding=utf-8
"""Seed the 'lg' Redis list with 'SITE|company_name' crawl tasks."""
from company_position_count import redis_client

positions = [
    'FIVE_ONE|天津津天连达贸易有限公司',
    'FIVE_ONE|上海丽享贸易有限公司',
    'FIVE_ONE|上海傲升国际贸易有限公司'
]

for task in positions:
    redis_client.lpush('lg', task)

# Report the resulting queue length. Fixed: the bare `print x` statement was
# Python-2-only syntax; the parenthesized form runs on both Python 2 and 3.
print(redis_client.llen('lg'))
Configuration file (conf/settings.py):
# Redis connection parameters consumed by get_redis_client() in
# company_position_count.py.
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PASSWORD = ''  # empty string: the local instance has no auth
REDIS_DB = 0

# Number of gevent greenlets spawned over main() by the worker entry point.
COROUTINE_NUM = 5