本代码主要实现从招聘网站爬取所有招聘某个特定岗位的公司信息,如公司名,岗位,公司规模,招聘详情页信息,并写入csv文件。
难点是获取请求的url地址和参数。
在智联主页搜索销售岗位,用Chrome浏览器打开网页,进入检查模式(开发者工具),可以看到如下请求。从Preview可以看出,它应该是岗位详情页的请求地址:
https://fe-api.zhaopin.com/c/i/jobs/position-detail-new?at=740576d47fab4c5a80ec842d6a6f1e09&rt=5ddff5a625b84b8d88517073ed291bb0&number=CC281784280J40109517108&_v=0.79169777
除去参数,该请求可以简化为
https://fe-api.zhaopin.com/c/i/jobs/position-detail-new?number=CC281784280J40109517108
它的格式为'https://fe-api.zhaopin.com/c/i/jobs/position-detail-new?number=' 后面加上岗位的代码,如'CC281784280J40109517108'。 这样,我们从搜索结果中获取岗位代码后,就可以构造获取详情页的请求地址,注意,不要直接使用这个格式的地址:https://jobs.zhaopin.com/CCL1282463400J40058899815.htm
以下为完整代码:
其中爬取搜索结果页用了selenium,因为需要翻页,爬取详情页用requests库。
import time
import csv
import requests
from selenium import webdriver
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
if __name__=='__main__':
positions = []
while (True):
position = input('请输入要搜索的岗位名称,如python,数据分析等,按回车完成。可以输入多次,"n"或"N结束输入:')
if position == 'n' or position == 'N':
break
elif position=='':
pass
else:
positions.append(position)
print('您要搜索的是 %s 岗位' % positions)
print('开始登录!请稍候! 可能需要手动滑块验证或者微信扫码')
time1=time.time()
browser1 = webdriver.Firefox()
url='https://www.zhaopin.com/hangzhou/'
browser1.get(url)
time.sleep(2)
# try:
# browser1.find_element_by_xpath('/html/body/div[6]/div/div/button').click()
# except:
# browser1.find_element_by_xpath('/html/body/div[5]/div/div/button').click()
# print(browser1.current_window_handle)
time.sleep(8) #微信扫码登录
browser1.find_element_by_xpath('/html/body/div/div[1]/div/div[2]/div/div/div[2]/button').click() # 搜索后会弹出新窗口
first_handle = browser1.current_window_handle
# # print(first_handle)
handles = browser1.window_handles
time.sleep(2)
# browser1.switch_to.window(handles[-1]) #跳到新窗口,或者用下面一段代码
for i in handles:
if i == first_handle:
browser1.close() # 关闭当前窗口
else:
browser1.switch_to.window(i)
time.sleep(2)
browser1.refresh()
try:
browser1.find_element_by_xpath('/html/body/div[2]/div/div/div[2]/button[2]').click()
except:
pass
browser1.find_element_by_xpath('/html/body/div/div[2]/div/div[1]/div/div[1]/input').clear()
browser1.find_element_by_xpath('/html/body/div/div[2]/div/div[1]/div/div[1]/input').send_keys(positions[0])
browser1.find_element_by_xpath('/html/body/div/div[2]/div/div[1]/div/button').click()
time.sleep(3)
with open('zhilianlist.csv','a+',newline='',encoding='utf-8-sig') as f: #utf-8 还是会有乱码
writer=csv.writer(f)
companyqty=0
print('this is a break这里是页面分隔,开始打印列表')
company=browser1.find_elements_by_xpath('/html/body/div/div[3]/div[2]/div[2]/div/div/a/div[1]/div[2]/span') #公司名称列表
job_name=browser1.find_elements_by_xpath('/html/body/div[1]/div[3]/div[2]/div[2]/div/div/a/div[1]/div[1]/span[1]') #岗位名称
job_detail_page=browser1.find_elements_by_xpath('/html/body/div[1]/div[3]/div[2]/div[2]/div/div/a') #岗位详情页链接
company_size=browser1.find_elements_by_xpath('/html/body/div[1]/div[3]/div[2]/div[2]/div/div/a/div[2]/div[2]/span[2]') #公司规模
for i,j,k,l in zip(company,job_name,job_detail_page,company_size):
print(i.text,end=' ') # 或者 print(i.get_attribute('title'))
print(j.text,end=' ')
# print(type(i))
# print(type(i.text))
# print(k.get_attribute('href')).split('/')[0]
print(l.text,end=' ')
print(k.get_attribute('href').split('?')[0],end=' ')
#以下获取招聘岗位详情,如人数,发布时间,发布人
job_detail='https://fe-api.zhaopin.com/c/i/jobs/position-detail-new?number=' + k.get_attribute('href').split('?')[0].split('/')[-1].split('.')[0]
# 构造详情页面访问函数 类似https://fe-api.zhaopin.com/c/i/jobs/position-detail-new?number=CC297305210J40157193509
json=requests.get(job_detail).json() #可能需要加入header等信息
dict=json['data']
# print(dict['detailedCompany']['companySize']) #公司规模
print('发布时间:',dict['detailedPosition']['positionPublishTime'],end=' ')
print('招聘人数:',dict['detailedPosition']['recruitNumber'],end=' ')
print('发布人:',dict['detailedPosition']['staff']['staffName'],end=' ')
print('发布人岗位:',dict['detailedPosition']['staff']['hrJob'])
companyqty+=1
#将信息写入CSV文件
writer.writerows([[i.get_attribute('title'),j.text,l.text,k.get_attribute('href').split('?')[0],\
dict['detailedPosition']['recruitNumber'],dict['detailedPosition']['positionPublishTime'],dict['detailedPosition']['staff']['staffName'],dict['detailedPosition']['staff']['hrJob']]])
for clicktime in range(1,34): #循环翻页,这里最好用代码自动判断页数:
js='window.scrollTo(0,document.body.scrollHeight);'
browser1.execute_script(js)
time.sleep(2)
browser1.find_element_by_xpath('/html/body/div/div[3]/div[2]/div[2]/div/div[31]/div[2]/div/button[2]').click()
time.sleep(3)
print('this is a break这里是页面分隔,开始打印列表')
company=browser1.find_elements_by_xpath('/html/body/div/div[3]/div[2]/div[2]/div/div/a/div[1]/div[2]/span')
job_name=browser1.find_elements_by_xpath('/html/body/div[1]/div[3]/div[2]/div[2]/div/div/a/div[1]/div[1]/span[1]')
job_detail_page=browser1.find_elements_by_xpath('/html/body/div[1]/div[3]/div[2]/div[2]/div/div/a')
company_size=browser1.find_elements_by_xpath('/html/body/div[1]/div[3]/div[2]/div[2]/div/div/a/div[2]/div[2]/span[2]')
for i,j,k,l in zip(company,job_name,job_detail_page,company_size):
companyqty+=1
print(i.text,end=' ') # 或者 print(i.get_attribute('title'))
print(j.text,end=' ')
print(l.text,end=' ')
print(k.get_attribute('href').split('?')[0])
#以下获取招聘岗位详情,如人数,发布时间,发布人
job_detail='https://fe-api.zhaopin.com/c/i/jobs/position-detail-new?number=' + k.get_attribute('href').split('?')[0].split('/')[-1].split('.')[0]
# 构造详情页面访问函数 类似https://fe-api.zhaopin.com/c/i/jobs/position-detail-new?number=CC297305210J40157193509
json=requests.get(job_detail).json() #可能需要加入header等信息
dict=json['data']
print('发布时间:',dict['detailedPosition']['positionPublishTime'],end=' ')
print('招聘人数:',dict['detailedPosition']['recruitNumber'],end=' ')
print('发布人:',dict['detailedPosition']['staff']['staffName'],end=' ')
print('发布人岗位:',dict['detailedPosition']['staff']['hrJob'])
companyqty+=1
writer.writerows([[i.get_attribute('title'),j.text,l.text,k.get_attribute('href').split('?')[0],\
dict['detailedPosition']['recruitNumber'],dict['detailedPosition']['positionPublishTime'],dict['detailedPosition']['staff']['staffName'],dict['detailedPosition']['staff']['hrJob']]])
print("qty of jobs is:",companyqty) #打印岗位数目
print('time used:',time.time()-time1) #爬虫时间