Step 1: Crawl the Lagou homepage to get every job category name and its URL
import requests
from pyquery import PyQuery
import json

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}


def get_html(url, header=None):
    '''
    :param url: the URL to request
    :param header: request headers
    :return: the response text, or None on failure
    '''
    response = requests.get(url, headers=header, timeout=3)
    if response.status_code == 200:
        response.encoding = response.apparent_encoding
        return response.text
    else:
        print('Request to {} failed with status code {}'.format(url, response.status_code))
        return None


def parser_html(html):
    '''
    :param html: the HTML to parse
    :return: the job name -> URL mapping of the page, as a JSON string
    '''
    doc = PyQuery(html)
    title = doc('title')
    print(title)
    # Every <a> inside the homepage's main navigation is one job category.
    all_a = doc('div.mainNavs a').items()
    data = {}
    for a in all_a:
        data[a.text()] = a.attr('href')
    return json.dumps(data, ensure_ascii=False)


def save_data(data, path=''):
    '''
    :param data: the data to save
    :param path: the file path to write to
    :return:
    '''
    # Note: append mode, so delete the file before re-running;
    # step 2 cannot parse two concatenated JSON objects.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)


def main(url):
    html = get_html(url, header=header)
    data = parser_html(html)
    save_data(data, 'lagou_links.txt')


if __name__ == '__main__':
    main('https://www.lagou.com/')
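As a quick sanity check, the saved file can be loaded straight back with json.load. This is only a verification sketch; the mapping in the comment illustrates the expected shape, not real scraped values:

import json

with open('lagou_links.txt', encoding='utf-8') as f:
    links = json.load(f)  # expected shape: {"Java": "https://www.lagou.com/zhaopin/Java/", ...}
print(len(links), 'job categories collected')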
Step 2: Using the job names and URLs from the homepage, get the total number of result pages for each job category (the search page renders the page count in span.totalNum)
import requests
from pyquery import PyQuery
import json

header = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    'referer': 'https://www.lagou.com',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin'
}


def get_html(url, header=None):
    '''
    :param url: the URL to request
    :param header: request headers
    :return: the response text, or None on failure
    '''
    response = requests.get(url, headers=header, timeout=3)
    if response.status_code == 200:
        response.encoding = response.apparent_encoding
        return response.text
    else:
        print('Request to {} failed with status code {}'.format(url, response.status_code))
        return None


data = {}
with open('lagou_links.txt', 'r', encoding='utf-8') as f:
    all_url = f.read()
urls = json.loads(all_url)
for name, url in urls.items():
    # Pass the headers along; Lagou rejects requests without a
    # browser-like User-Agent.
    html = get_html(url, header=header)
    doc = PyQuery(html)
    # The pagination widget renders the page total as <span class="span totalNum">.
    num = doc('span.span.totalNum').text()
    data[name] = num
    print(name, num)

data = json.dumps(data)
with open('lagou-data.txt', 'w', encoding='utf-8') as f:
    f.write(data)
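Lagou is known to throttle clients that request pages in quick succession, and this loop hits every category back to back. A minimal way to space the requests out, sketched as a variant of the loop above (it assumes get_html, urls, and data from this script are in scope; the 2-second delay is an arbitrary assumption, not a documented limit):

import time

for name, url in urls.items():
    html = get_html(url, header=header)
    if html is None:
        continue  # skip categories whose page failed to load
    doc = PyQuery(html)
    data[name] = doc('span.span.totalNum').text()
    time.sleep(2)  # pause between categories to stay under the rate limit (assumed value)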
Step 3: Using the job names and page counts, fetch all of the job posting data
import requests
import csv, json, time
from multiprocessing import Pool

header = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    'origin': 'https://www.lagou.com',
    'referer': 'https://www.lagou.com/jobs/list_python%20?labelWords=&fromSearch=true&suginput=',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin'
}

with open('lagou-data.txt', 'r') as f:
    links = f.read()
links = json.loads(links)
print(type(links), links)


def main(name, num, city='北京'):
    start_url = f'https://www.lagou.com/jobs/list_{name}%20?labelWords=&fromSearch=true&suginput='
    url = f'https://www.lagou.com/jobs/positionAjax.json?city={city}&needAddtionalResult=false'
    for page in range(1, int(num) + 1):
        # Hit the search page first so the session picks up the cookies
        # that the positionAjax.json endpoint checks; a fresh session per
        # page keeps those cookies from going stale.
        session = requests.Session()
        session.get(start_url, headers=header)
        data = {
            'first': 'true',
            'pn': page,
            'kd': name,
        }
        resp = session.post(url, headers=header, cookies=session.cookies, data=data)
        result = resp.json()['content']['positionResult']['result']
        with open('拉钩data.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for i in result:
                info = []
                info.append(i['positionName'])
                info.append(i['companyFullName'])
                info.append(i['companyShortName'])
                info.append(i['companySize'])
                info.append(i['industryField'])
                info.append(i['financeStage'])
                info.append(','.join(i['companyLabelList']))
                info.append(i['firstType'])
                # 'positionLables' and 'linestaion' are misspelled in
                # Lagou's own API response, so the keys stay as-is.
                info.append(','.join(i['positionLables']))
                info.append('-' + i['createTime'])
                info.append(i['city'])
                info.append(i['district'])
                info.append(i['salary'])
                info.append(i['workYear'])
                if i['linestaion']:
                    info.append(i['linestaion'])
                else:
                    info.append('空')  # no subway line/station info
                writer.writerow(info)


if __name__ == '__main__':
    t1 = time.time()
    pool = Pool(3)
    for name, num in links.items():
        pool.apply_async(main, args=(name, num))
    pool.close()
    pool.join()
    t2 = time.time()
    print(t2 - t1)
    print('All done')
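The workers only append data rows, so 拉钩data.csv ends up with no column labels. One way to add them, sketched under the assumption that the file is created fresh once before the pool starts (the column names simply mirror the fields appended above):

import csv

columns = ['positionName', 'companyFullName', 'companyShortName', 'companySize',
           'industryField', 'financeStage', 'companyLabelList', 'firstType',
           'positionLables', 'createTime', 'city', 'district', 'salary',
           'workYear', 'linestaion']

with open('拉钩data.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerow(columns)  # write the header once, before pool.apply_async runs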