import csv
import random
import time
from lxml import etree
from selenium import webdriver
# Options class used to evade automation detection
from selenium.webdriver import ChromeOptions
# Options class used to run the browser headless (no visible window)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Build ONE Options object with every Chrome flag.
# FIX: the original created two option objects and passed both
# `chrome_options=` and `options=` to webdriver.Chrome; the deprecated
# `chrome_options` keyword replaces `options` in Selenium 3 (and raises a
# TypeError in Selenium 4), silently discarding the headless, GPU and
# user-agent settings.  Merging them keeps all flags active.
options = Options()
# Hide the "Chrome is being controlled by automated test software" infobar
# and the Blink automation fingerprint used by anti-bot checks.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('detach', True)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument('--headless')     # run without a visible window
options.add_argument('--disable-gpu')  # GPU not needed in headless mode
# Spoof a regular desktop-Chrome user agent.
options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
# Launch the browser (driver binary resolved from PATH).
web = webdriver.Chrome(options=options)
# Inject stealth.min.js before each document loads to mask webdriver traces.
with open('./stealth.min.js', encoding='utf-8') as fp:
    js = fp.read()
web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
def modify_detail(job_details):
    """Split a job posting's detail lines into duties and qualifications.

    The first line (the page header) is discarded, blank lines are stripped,
    and the remainder is split at the line containing one of the known
    "requirements" section headers.  When several headers appear, the last
    one wins (mirrors the original elif chain, which kept overwriting).

    Args:
        job_details: raw text fragments extracted from the detail page
            (output of an lxml ``text()`` xpath).

    Returns:
        ``(duties, qualifications)`` — two lists of stripped lines.
        FIX: when no section header is found, all lines are returned as
        duties instead of ``([], [])``, so the posting text is not lost
        (this was the intent of the commented-out ``else`` in the original).
    """
    # Section headers that mark the start of the qualifications part.
    headers = ('任职资格', '职位要求', 'Note', '任职要求',
               '岗位任职条件', '岗位要求', '招聘条件', '应聘要求')
    # Drop the page header line, then strip and discard blank fragments.
    lines = [s.strip() for s in job_details[1:] if s.strip()]
    # Fallback: no header found -> everything counts as duties.
    duties, quals = lines, []
    for k, line in enumerate(lines):
        if any(h in line for h in headers):
            # Last matching header wins, as in the original elif chain.
            duties, quals = lines[:k], lines[k + 1:]
    print(duties)
    print(quals)
    return duties, quals
#利用xpath和css选择器提取数据
# Output CSV, opened in append mode so repeated runs keep earlier rows.
# NOTE(review): a header row is written on EVERY run, so an existing file
# accumulates duplicate header lines — consider checking file existence.
fieldnames = [
    '岗位名称',   # job title
    '薪资',       # salary
    '公司名称',   # company name
    '公司规模',   # company size
    '所属行业',   # industry
    '工作地点',   # work location
    '工作经验',   # required experience
    '学历要求',   # required education
    '关键词',     # keyword / benefit tags
    '发布日期',   # publish date
    '招聘详情',   # detail-page URL
    '岗位职责',   # responsibilities
    '任职资格',   # qualifications
    # '上班地址',
    # '联系方式'
]
f = open('金融招聘.csv', mode='a', encoding='utf-8', newline='')
csv_write = csv.DictWriter(f, fieldnames=fieldnames)
csv_write.writeheader()  # write the header row
#找到职位框,输入搜索的内容
# One full search per keyword; each search crawls up to 49 result pages.
job_names = ['金融科技','区块链金融','量化投资','金融大数据分析','金融风险分析','金融产品用户运营','金融用户体验']
for j in job_names:
    # Load the 51job home page and run a fresh search for this keyword.
    web.get("https://www.51job.com/")
    time.sleep(5)  # wait for the page to render
    web.find_element(By.XPATH, '//*[@id="kwdselectid"]').click()
    web.find_element(By.XPATH, '//*[@id="kwdselectid"]').clear()
    # Type the job title into the keyword box.
    web.find_element(By.XPATH, '//*[@id="kwdselectid"]').send_keys(j)
    web.find_element(By.XPATH, '/html/body/div[3]/div/div[1]/div/button').click()  # click the search button
    time.sleep(10)
    for page in range(1,50):
        print(f'==============正在爬取{page}页信息==================')
        time.sleep(10)
        # Jump straight to the target page via the page-jump input; the
        # short randomized pauses (1.0-4.0 s) mimic human typing cadence.
        web.find_element(By.XPATH, '//*[@id="jump_page"]').click()
        time.sleep(random.randint(10, 30) * 0.1)
        web.find_element(By.XPATH, '//*[@id="jump_page"]').clear()
        time.sleep(random.randint(10, 40) * 0.1)
        web.find_element(By.XPATH, '//*[@id="jump_page"]').send_keys(page)
        time.sleep(random.randint(10, 30) * 0.1)
        web.find_element(By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[3]/div/div/span[3]').click()
        # Collect every job card on the current result page.
        time.sleep(5)
        jobData = web.find_elements(By.XPATH, '//*[@id="app"]/div/div[2]/div/div/div[2]/div/div[2]/div/div[2]/div[1]/div')
        #print(jobData)
        # Walk each job card: scrape the list fields, then open the detail
        # page and split responsibilities from qualifications.
        for job in jobData:
            # NOTE(review): By.CLASS_NAME with a dotted value like 'jname.at'
            # relies on the driver treating it as a compound class selector —
            # confirm this works on the Selenium version in use.
            jobName = job.find_element(By.CLASS_NAME, 'jname.at').text  # job title
            time.sleep(random.randint(5, 15) * 0.1)
            jobSalary = job.find_element(By.CLASS_NAME, 'sal').text  # salary
            time.sleep(random.randint(5, 15) * 0.1)
            jobCompany = job.find_element(By.CLASS_NAME, 'cname.at').text  # company name
            time.sleep(random.randint(5, 15) * 0.1)
            company_type_size = job.find_element(By.CLASS_NAME, 'dc.at').text  # company size
            time.sleep(random.randint(5, 15) * 0.1)
            company_status = job.find_element(By.CLASS_NAME, 'int.at').text  # industry
            time.sleep(random.randint(5, 15) * 0.1)
            # Combined "location | experience | education" field.
            address_experience_education = job.find_element(By.CLASS_NAME, 'd.at').text
            print(address_experience_education)
            length = len(address_experience_education.split('|'))
            if length == 3:
                address = address_experience_education.split('|')[0]  # work location
                experience = address_experience_education.split('|')[1]  # required experience
                edu = address_experience_education.split('|')[2]  # required education
            else:
                # Fewer than three segments: only the location is present,
                # so fall back to default experience/education labels.
                address = address_experience_education.split('|')[0]  # work location
                experience = '无需经验'
                edu = '学历不限'
            time.sleep(random.randint(5, 15) * 0.1)
            try:
                job_welf = job.find_element(By.CLASS_NAME, 'tags').get_attribute('title')  # keyword/benefit tags
            except:
                # Some cards carry no tag element; record a placeholder.
                job_welf = '无数据'
            time.sleep(random.randint(5, 15) * 0.1)
            update_date = job.find_element(By.CLASS_NAME, 'time').text  # publish date
            time.sleep(random.randint(5, 15) * 0.1)
            job_href = job.find_element(By.CLASS_NAME, 'el').get_attribute('href')  # detail-page URL
            #print(job_href)
            # Remember the current (result-list) window handle A.
            handle = web.current_window_handle
            try:
                job.click()  # open the job detail page (new window/tab)
            except:
                continue  # detail page would not open — skip this job card
            # All window handles now open (list window A + detail window B).
            handles = web.window_handles
            time.sleep(5)  # wait for the detail window to load
            for newhandle in handles:
                # Pick out the newly opened detail window B.
                if newhandle != handle:
                    web.switch_to.window(newhandle)  # switch to detail window B
                    d_html = web.page_source
                    html = etree.HTML(d_html)
                    # dt = html.xpath('//div[@class=bmsg.job_msg.inbox]')
                    job_details = html.xpath('/html/body/div[2]/div[2]/div[3]/div[1]/div/text()')
                    # Split raw detail text into duties / qualifications.
                    job_res, job_zige = modify_detail(job_details)
                    web.close()
                    # Row for the CSV; keys must match the DictWriter fieldnames.
                    dit = {
                        "岗位名称": jobName,
                        "公司名称": jobCompany,
                        "薪资": jobSalary,
                        "公司规模": company_type_size,
                        "所属行业": company_status,
                        "工作地点": address,
                        "工作经验":experience,
                        "学历要求":edu,
                        "关键词": job_welf,
                        "发布日期": update_date,
                        "招聘详情": job_href,
                        "岗位职责": job_res,
                        "任职资格": job_zige
                    }
                    print(f'正在爬取{jobCompany}公司')
                    print(jobName, jobSalary, jobCompany, company_type_size, company_status, address, experience, edu,
                          job_welf,update_date,job_href,job_res,job_zige)
                    csv_write.writerow(dit)
            # Return to the result-list window before the next card.
            # NOTE(review): assumes handles[0] is the list window — usually
            # true, but `handle` would be the safer target; confirm.
            web.switch_to.window(handles[0])
            time.sleep(10)
# --- Trailing copy/paste residue from the source blog post (commented out;
# --- the bare text was not valid Python and raised a SyntaxError):
# Selenium爬取前程无忧51job招聘信息
# 最新推荐文章于 2023-06-02 06:39:05 发布