# A Python script that scrapes job postings from Lagou (拉勾网).
import random
import time
import requests
from openpyxl import Workbook
import json
import logging;logging.basicConfig(level=logging.INFO)
import re
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
def get_json(url, page, lang_name):
    """Fetch one page of Lagou search results and flatten it into rows.

    A fresh session first requests the HTML listing page so the server can
    set its cookies, then POSTs the search form to ``url``; the session
    resends those cookies automatically.

    Args:
        url: Address of the JSON search endpoint to POST to.
        page: Page number, sent as the ``pn`` form field.
        lang_name: Search keyword, sent as the ``kd`` form field.

    Returns:
        A list with one 12-item list per position, in this order: company
        short name, company full name, industry field, company size, salary,
        city, education, district, position name, work year, position
        advantage, ``linestaion``. Fields that are missing or ``null`` in
        the payload are replaced by ``'无'``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the payload does not contain
            ``content -> positionResult -> result``.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    # NOTE(review): the landing URL and the Referer always point at the
    # ``list_Python`` listing, whatever ``lang_name`` is -- confirm whether
    # they are meant to follow the search keyword.
    url_start = 'https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput='
    data = {'first': 'false', 'pn': page, 'kd': lang_name}

    # Keys read from every position record, in output column order.
    # 'linestaion' is spelled exactly as the original lookup spelled it.
    fields = (
        'companyShortName',
        'companyFullName',
        'industryField',
        'companySize',
        'salary',
        'city',
        'education',
        'district',
        'positionName',
        'workYear',
        'positionAdvantage',
        'linestaion',
    )
    missing = '无'

    # The context manager closes the session (and its pooled connections)
    # even when one of the requests raises.
    with requests.Session() as session:
        # Visit the listing page first so the session picks up cookies.
        session.get(url_start, headers=headers, timeout=3)
        response = session.post(url, data=data, headers=headers, timeout=3)
        response.raise_for_status()
        # Let requests decode the JSON body itself instead of overriding the
        # declared charset with a guessed one.
        payload = response.json()

    try:
        positions = payload["content"]["positionResult"]["result"]
    except (KeyError, TypeError) as err:
        # The reply does not have the expected structure (for instance an
        # error reply); include it so the failure can be diagnosed.
        raise KeyError('unexpected response payload: %r' % (payload,)) from err

    info_list = []
    for position in positions or []:
        values = (position.get(key) for key in fields)
        # A JSON null arrives as None; treat it like an absent field so the
        # spreadsheet does not end up with the literal text "None".
        info_list.append([missing if value is None else value for value in values])
    return info_list
def main(lang_name='高级销售运营', cities=('北京',), max_pages=30):
    """Scrape Lagou job postings and write them to an Excel workbook.

    For every city, result pages are requested one after another through
    ``get_json`` and each returned row is appended to a single worksheet
    titled ``lang_name``. The workbook is written to
    ``'<lang_name>职位信息.xlsx'`` in the current working directory.

    Args:
        lang_name: Search keyword; also used as the sheet title and as the
            prefix of the output file name.
        cities: Iterable of city names to query.
        max_pages: Maximum number of result pages requested per city.

    Raises:
        Whatever ``get_json`` or the workbook save raises. Rows collected
        before a failure are still saved before the error propagates.
    """
    wb = Workbook()
    # All cities share the active sheet, so set it up once.
    ws1 = wb.active
    ws1.title = lang_name
    try:
        for city in cities:
            url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(city)
            for page in range(1, max_pages + 1):
                logging.info('start page: %s', page)
                info = get_json(url, page, lang_name)
                logging.info('end page: %s', page)
                if not info:
                    # An empty page means there are no further results for
                    # this city; stop instead of requesting the rest.
                    break
                for row in info:
                    # Strip characters that openpyxl refuses to write into
                    # a cell, after converting every value to text.
                    ws1.append([ILLEGAL_CHARACTERS_RE.sub(r'', str(msg)) for msg in row])
                # Random pause between requests.
                time.sleep(random.randint(1, 5))
    finally:
        # Save even when a request fails midway so collected rows are kept.
        wb.save('{}职位信息.xlsx'.format(lang_name))
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()