前言
爬取步骤
1.引入库
代码如下(示例):
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import time
2.代码如下
# Endpoint of Baidu Zhaopin's paginated job-search API; query-string
# parameters are appended to this prefix by get_page().
base_url = 'https://zhaopin.baidu.com/api/qzasync?'

# Static request headers: present as a desktop Chrome browser so the
# API responds with real listing data.
headers = {
    'Host': 'zhaopin.baidu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
}
def get_page(page, query='销售代表', city='重庆'):
    """Fetch one page of job listings from the Baidu Zhaopin API.

    Args:
        page: Record offset for the ``pn`` parameter (0, 20, 40, ...).
        query: Search keyword; default preserves the original hard-coded
            value for backward compatibility.
        city: City filter; default preserves the original value.

    Returns:
        The decoded JSON payload (dict) on a 200 response, otherwise
        ``None`` (non-200 status or connection error).
    """
    params = {
        'query': query,
        'city': city,
        'is_adq': '1',
        'pcmod': '1',
        'token': '==gxgG7pa6K1Ut1ZlZGmypZlWu5ZVZobVapldKJZm5ma',
        'pn': str(page),
        'rn': '20',  # records per page
    }
    url = base_url + urlencode(params)
    try:
        # Keep the try body minimal: only the request itself can raise
        # ConnectionError.
        response = requests.get(url, headers=headers)
    except requests.ConnectionError as e:
        print('Error', e.args)
        return None
    # Throttle between requests so we don't hammer the API.
    time.sleep(2)
    if response.status_code == 200:
        return response.json()
    # Explicit None on unexpected status (original fell through implicitly).
    return None
def parse_page(json):
    """Yield one flat job dict per listing in an API response.

    Args:
        json: Decoded payload returned by ``get_page`` (may be ``None``
            when the fetch failed).

    Yields:
        dict with the fields of interest; missing fields come back as
        ``None`` via ``dict.get``.
    """
    # NOTE(review): the parameter name shadows the stdlib `json` module
    # name; kept unchanged so keyword callers are not broken.
    if not json:
        return
    # Guard against a missing/None 'data' or 'disp_data' section so a
    # malformed response yields nothing instead of raising.
    items = (json.get('data') or {}).get('disp_data') or []
    fields = ('companydescription', 'city', 'requirements', 'company', 'ori_salary')
    for item in items:
        yield {field: item.get(field) for field in fields}
if __name__ == '__main__':
    # Crawl the first 5 pages: pn is an offset (0, 20, 40, ...) with 20
    # records per page. Removed the unused counter `k`, renamed the local
    # `json` (which shadowed the stdlib module name), and marked the
    # unused loop variable with a leading underscore.
    for offset in range(0, 100, 20):
        payload = get_page(offset)
        for _job in parse_page(payload):
            print('success')