Initial code
# -*- encoding: utf-8 -*-
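# crawl.WebRequest and crawl.mysqldb are the author's own helper modules
# (request headers / proxies / URL params, and a thin MySQL wrapper); their
# source is not part of this snippet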
from crawl.WebRequest import *
from crawl.mysqldb import SQL
import time, json, random, math, requests, logging, hashlib
# Configure log formatting
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(filename='zhilian.log', level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)
# Fetch the request headers
logging.info('begin to get web request header')
# Job keywords to crawl
positions = ['大数据']  # '大数据' = "big data"
# City IDs to crawl: Beijing, Shanghai, Shenzhen, Guangzhou, Chengdu, Hangzhou, Wuhan
# city_ids = ['530', '538', '765', '763', '801', '653', '736']
city_ids = ['801']  # Chengdu only for now
# Work-experience codes: none, under 1 year, 1-3, 3-5, 5-10, over 10 years
work_exps = ['0000', '0001', '0103', '0305', '0510', '1099']
# Request headers (note: this rebinds the name 'header', shadowing the helper
# imported from crawl.WebRequest, so the helper can only be called this once)
header = header()
# Fetch a pool of proxy IPs
proxy_list = get_home_proxy()
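# getParam() is imported above from crawl.WebRequest and is not shown in this
# snippet. A minimal sketch of what it plausibly returns -- an assumption based
# on the hashlib/random/time imports, not the author's actual implementation:
#
#   def getParam():
#       # _v: a short random decimal string such as '0.12345678'
#       _v = str(round(random.uniform(0, 1), 8))
#       # x-zp-page-request-id: an md5 digest plus a millisecond timestamp
#       req_id = hashlib.md5(str(random.random()).encode('utf-8')).hexdigest() \
#                + '-' + str(int(time.time() * 1000))
#       return _v, req_id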
def main():
    logging.info('begin to send requests')
    sql = SQL()
    latest_jobNums = sql.get_latest_jobNum('zhilian_update')
    for city_id in city_ids:
        for position in positions:
            for work_exp in work_exps:
                # Call getParam() once so _v and the page-request id come from
                # the same generated pair
                param = getParam()
                base_url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId={cityId}&salary=0,0' \
                           '&workExperience={workExp}&education=-1&companyType=-1&employmentType=-1' \
                           '&jobWelfareTag=-1&sortType=publish' \
                           '&kw={position}&kt=3&=0&_v=' + param[0] + '&x-zp-page-request-id=' + param[1]
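                # {cityId}, {workExp} and {position} above are str.format
                # placeholders; presumably the code that follows fills them in,
                # e.g. base_url.format(cityId=city_id, workExp=work_exp,
                # position=position), before paging through the results.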