1、初学爬虫,在写爬取拉勾网职位信息程序时,遇到报错如下:
2、查找资料后发现,在使用requests.post发送请求时,如果请求头(例如这里的referer)中带有中文,就会出现如题所示错误。这是编码问题:HTTP请求头的值会按latin-1编码,中文无法直接编码,解决办法是先把字符串按UTF-8编码、再按latin-1解码:
'referer':referer.encode("utf-8").decode("latin1")
附上这个爬虫程序(初学,我也是照着别人的代码敲的):
import requests
class Config:
    """Static crawler settings: the search keyword and the request headers."""

    # Default search keyword.
    kd = '数据分析'
    # Search-result page URL for the keyword above.
    referer = 'https://www.lagou.com/jobs/list_数据分析?labelWords=&fromSearch=true&suginput='
    headers = {
        'Accept': 'application/json,text/javascript,*/*;q=0.01',
        # The URL contains Chinese characters. Re-decoding its UTF-8 bytes as
        # latin-1 yields a string made only of latin-1 code points, which
        # avoids the encoding error raised when the header is sent.
        'referer': referer.encode("utf-8").decode("latin1"),
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4315.4 Safari/537.36',
    }
class Spider:
    """Crawl job postings for one keyword from the Lagou position API."""

    def __init__(self, kd=Config.kd, timeout=10):
        """Create a session and request the referer page once.

        Args:
            kd: Search keyword posted to the API as the ``kd`` form field.
            timeout: Seconds passed as ``timeout`` to every HTTP request so
                a stalled connection raises instead of hanging forever.
        """
        self.kd = kd
        self.timeout = timeout
        self.url = Config.referer
        self.api = 'https://www.lagou.com/jobs/positionAjax.json'
        # The referer URL must be requested first, on the same session that
        # is later used for the API calls.
        # NOTE(review): the referer page is always the one for Config.kd,
        # even when a different ``kd`` is passed in - confirm this is intended.
        self.sess = requests.Session()
        self.sess.get(self.url, headers=Config.headers, timeout=self.timeout)

    def get_position(self, pn):
        """Return the position records found on result page ``pn``.

        Args:
            pn: Page number; it is sent to the API as a string.

        Returns:
            The value stored under ``content -> positionResult -> result``
            in the JSON response.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            KeyError: If the JSON body lacks the expected keys.
        """
        data = {
            'first': 'true',
            'pn': str(pn),
            'kd': self.kd,
        }
        # Send a POST request to the API.
        r = self.sess.post(
            self.api,
            headers=Config.headers,
            data=data,
            timeout=self.timeout,
        )
        # Fail early on HTTP errors instead of trying to parse an error page.
        r.raise_for_status()
        # Parse the body directly with .json().
        return r.json()['content']['positionResult']['result']

    def engine(self, total_pn):
        """Fetch pages 1..``total_pn`` and print one line per position.

        Args:
            total_pn: Number of result pages to fetch, starting at page 1.
        """
        for pn in range(1, total_pn + 1):
            results = self.get_position(pn)
            for pos in results:
                print(pos['positionName'], pos['companyShortName'], pos['workYear'], pos['salary'])
if __name__ == '__main__':
    # Crawl the first two result pages with the default keyword.
    lagou = Spider()
    lagou.engine(2)
爬取到的结果示例: