# Preliminary code (初步代码)
# -*- encoding: utf-8 -*-
from com.lagou.crawl.WebRequest import *
from com.lagou.crawl.mysqldb import SQL
import time, json, random, math, requests, logging
# Logging configuration: timestamped, level-tagged messages written to lagou.log.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(filename='lagou.log', level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
crawl_positions = ['大数据'] # job keywords to crawl (here: "big data")
crawl_citys = ['成都'] # cities to crawl (here: Chengdu)
work_exps = ['3年及以下', '3-5年', '5-10年', '10年以上'] # work-experience filters ("3 yrs and under", "3-5", "5-10", "10+")
proxy_list = get_home_proxy() # fetch the proxy-IP pool once at import time (helper from the WebRequest star import — TODO confirm its return shape is a list)
def index_page():
    """Request the Lagou job-listing Ajax endpoint for every combination of
    position keyword, city, and work-experience filter.

    NOTE(review): this definition continues beyond the visible excerpt — the
    ``ses.post(...)`` call at the end is cut off, so only the visible portion
    is documented here.
    """
    logging.info('begin to sending request')
    for position in crawl_positions:
        for city in crawl_citys:
            for work_exp in work_exps:
                # Ajax endpoint returning the listing JSON, filtered by work
                # experience (gj) and city, newest first (px=new).
                # NOTE(review): "needAddtionalResult" is misspelled but kept
                # as-is — it may be what the remote API actually expects.
                crawl_url = 'https://www.lagou.com/jobs/positionAjax.json?gj={gj}&px=new&city={city}&needAddtionalResult=false'.format(
                    gj=work_exp, city=city)
                # Matching HTML list page — presumably used as the Referer
                # the Ajax endpoint checks; verify against header_lagou().
                referer_url = 'https://www.lagou.com/jobs/list_{position}?px=new&gj={gj}&city={city}'.format(
                    position=position, gj=work_exp, city=city)
                ses = requests.session()  # fresh session per combination
                # Build the request headers for this position/city/experience
                # (header_lagou comes from the star import — TODO confirm)
                header = header_lagou(position, city, work_exp)
                # Apply the headers to the session
                ses.headers.update(header)
                # Pick a random proxy from the pool fetched at module load.
                # NOTE(review): `proxy` is never used in the visible lines —
                # confirm it is passed to the request further down.
                proxy = random.choice(proxy_list)
                try:
                    # GET the list page first — presumably so the session
                    # acquires the cookies the Ajax POST needs; TODO confirm.
                    ses.get(referer_url)
                    response = ses.post(url=crawl_url