网上提供的例子(https://segmentfault.com/a/1190000005778518 )运行失败,所以重新修改了一下。
python脚本文件
# encoding=utf-8
import requests
from openpyxl import Workbook
def generateHeaders():
    """Build the HTTP request headers as a dict from a raw browser header dump.

    The text assigned to ``headersBrower`` is a block of ``Name: value``
    lines.  Each line is split on its FIRST colon only, so values that
    contain colons themselves (for example ``rv:52.0`` inside the
    User-Agent) are kept intact.  Blank lines and lines without a colon are
    ignored.

    Returns:
        dict: header name -> header value, both stripped of surrounding
        whitespace.
    """
    # NOTE(review): the Cookie line is a hard-coded session captured from a
    # browser; it presumably expires.  If requests start failing, replace this
    # whole block with the Request Headers copied from a working browser
    # session.
    headersBrower = '''
Host: www.lagou.com
User-Agent: Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate
Cookie: JSESSIONID=1B300941D70EA7394D108F3D520C9D30; _ga=GA1.2.1060979661.1492406212; _gat=1; user_trace_token=20170417131652-14fddec2c32d4928a65e6aea50608be6; LGSID=20170417131652-10e21050-232d-11e7-8711-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGRID=20170417131657-13e05b5c-232d-11e7-a516-5254005c3644; LGUID=20170417131652-10e211c1-232d-11e7-8711-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1492406215; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1492406217; index_location_city=%E5%B9%BF%E5%B7%9E
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Cache-Control: max-age=0
'''
    headersMap = {}
    for line in headersBrower.splitlines():
        line = line.strip()
        # Skip the blank first/last lines and anything that is not "Name: value".
        if ":" not in line:
            continue
        # maxsplit=1: only the first colon separates name from value.
        key, value = line.split(":", 1)
        headersMap[key.strip()] = value.strip()
    return headersMap
# Parse the browser headers once at import time; get_json() sends this dict
# with every request.
headers = generateHeaders()
def get_json(url, page, lang_name, timeout=10):
    """Fetch one page of job postings and flatten it into spreadsheet rows.

    Args:
        url: Address of the position-search JSON endpoint.
        page: Page number, sent as the ``pn`` query parameter.
        lang_name: Search keyword, sent as the ``kd`` query parameter.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.  Without it a stalled connection
            would block forever.

    Returns:
        list: one ``[companyShortName, companyFullName, salary, city,
        education]`` list per posting found in the response.

    Raises:
        requests.RequestException: on network errors, timeouts, or a
            non-success HTTP status.
        ValueError: if the response body is not valid JSON.
        KeyError: if the JSON lacks ``content -> positionResult -> result``
            or a posting lacks one of the expected fields.
    """
    query = {'first': 'true', 'pn': page, 'kd': lang_name}
    # The dict is sent as URL query parameters on a GET request; uses the
    # module-level `headers` so the request looks like it came from a browser.
    response = requests.get(url, params=query, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    payload = response.json()
    postings = payload['content']['positionResult']['result']

    # Column order of every row that is returned.
    fields = ('companyShortName', 'companyFullName', 'salary', 'city', 'education')
    return [[posting[name] for name in fields] for posting in postings]
def main(lang_name="java", max_pages=30, output_path='/tmp/职位信息.xlsx'):
    """Download job postings page by page and save them to an Excel workbook.

    Args:
        lang_name: Search keyword; also used as the worksheet title.
        max_pages: Number of result pages to fetch, starting at page 1.
        output_path: Where the ``.xlsx`` file is written.

    Called without arguments it fetches pages 1..30 for "java" and writes
    ``/tmp/职位信息.xlsx``.  Any exception raised by ``get_json`` propagates,
    in which case no file is written.
    """
    url = 'http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    info_result = []
    for page in range(1, max_pages + 1):
        # extend() appends in place instead of re-copying the list every page.
        info_result.extend(get_json(url, page, lang_name))

    wb = Workbook()
    ws1 = wb.active
    ws1.title = lang_name
    for row in info_result:
        ws1.append(row)
    wb.save(output_path)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
API 调用失败处理
如果出现 API 调用失败的问题,先用浏览器(例如 Firefox)访问 http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false ,测试能否正常访问。
如果浏览器能访问成功,则复制浏览器的 Request Headers,替换脚本文件中 headersBrower 变量的内容。