python爬虫，获取拉勾网职位信息，修改网上旧版不能用的问题-CSDN博客

网上提供的例子（https://segmentfault.com/a/1190000005778518）运行失败，所以重新修改了一下。

python脚本文件

# encoding=utf-8

import requests
from openpyxl import Workbook

def generateHeaders():
    """Turn a raw browser "Request Headers" dump into a dict.

    The dump is embedded below as a multi-line string, one ``Name: value``
    pair per line. Each line is split on its first colon only, so values
    that themselves contain colons (e.g. the User-Agent) stay intact.
    Header names and values are whitespace-stripped; blank lines and lines
    without a colon are ignored.

    Returns:
        dict mapping header name to header value.
    """
    # Pasted verbatim from the browser; swap in a fresh dump here if the
    # site starts rejecting requests.
    headersBrower = '''
Host: www.lagou.com
User-Agent: Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate
Cookie: JSESSIONID=1B300941D70EA7394D108F3D520C9D30; _ga=GA1.2.1060979661.1492406212; _gat=1; user_trace_token=20170417131652-14fddec2c32d4928a65e6aea50608be6; LGSID=20170417131652-10e21050-232d-11e7-8711-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGRID=20170417131657-13e05b5c-232d-11e7-a516-5254005c3644; LGUID=20170417131652-10e211c1-232d-11e7-8711-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1492406215; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1492406217; index_location_city=%E5%B9%BF%E5%B7%9E
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Cache-Control: max-age=0
    '''

    parsed = {}
    for raw_line in headersBrower.splitlines():
        name, colon, content = raw_line.strip().partition(":")
        if not colon:
            # Blank line or a line with no "Name: value" shape.
            continue
        parsed[name.strip()] = content.strip()
    return parsed

# Built once at import time; get_json() passes this dict as the request headers.
headers = generateHeaders()

def get_json(url, page, lang_name, timeout=10):
    """Fetch one page of job postings and flatten it into spreadsheet rows.

    Args:
        url: Endpoint of the position-search JSON API.
        page: Page number, sent as the ``pn`` query parameter.
        lang_name: Search keyword, sent as the ``kd`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of rows, one per posting, each of the form
        ``[companyShortName, companyFullName, salary, city, education]``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the payload lacks the expected
            ``content.positionResult.result`` structure or a posting is
            missing one of the extracted fields.
    """
    query = {'first': 'true', 'pn': page, 'kd': lang_name}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=query, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    payload = response.json()
    postings = payload['content']['positionResult']['result']
    fields = ('companyShortName', 'companyFullName', 'salary', 'city', 'education')
    return [[posting[field] for field in fields] for posting in postings]


def main():
    """Scrape pages 1-30 of "java" postings and save them to an .xlsx file.

    Every row returned by get_json() is appended to the active worksheet,
    which is titled with the search keyword, and the workbook is written
    to /tmp.
    """
    lang_name = "java"
    url = 'http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    # Gather the rows of every page before touching the workbook.
    info_result = []
    for page in range(1, 31):
        info_result.extend(get_json(url, page, lang_name))

    wb = Workbook()
    sheet = wb.active
    sheet.title = lang_name
    for row in info_result:
        sheet.append(row)
    wb.save('/tmp/职位信息.xlsx')

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

api调用失败处理

如果出现 API 调用失败的问题，先用浏览器（例如 Firefox）访问 http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false ，确认该接口能否正常访问。

如果浏览器能访问成功，则复制浏览器的 Request Headers，替换脚本文件中 headersBrower 变量的内容。

转载于:https://my.oschina.net/u/1263909/blog/880871