今天刚开始学 Selenium,就写了一个爬虫脚本:在拉勾网搜索你想找的职位,爬取每条搜索结果的公司名、职位名、薪酬、招聘要求和招聘链接,
并且把结果存储到同一个 Excel 工作簿的不同工作表中(每搜索一次,就在同一个工作簿里新建一个工作表)。
打开拉勾网首页的第一个浏览器窗口是可见的;之后打开各个职位详情链接时用的是无界面(headless)模式,不会弹出窗口(怕我老板发现我在浏览别的公司的信息)。
刚学selenium,好多find方法还不熟练~~~
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options #隐藏页面用的
import openpyxl
import os
def _collect_job_urls(company):
    """Search Lagou for *company* in a visible browser and return the result links.

    Returns the ``href`` of every element with class ``position_link`` found
    on the search result page.
    """
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.lagou.com/')
        time.sleep(2)
        # The landing page first asks which city site to enter.
        driver.find_element_by_partial_link_text("深圳站").click()
        # Wait a moment, otherwise the search box may not be ready yet.
        time.sleep(2)
        driver.find_element_by_id('search_input').send_keys(company)
        driver.find_element_by_id('search_button').click()
        # Give the result list time to load so the links can be found.
        time.sleep(5)
        links = driver.find_elements_by_class_name('position_link')
        return [link.get_attribute('href') for link in links]
    finally:
        # quit() (not close()) also shuts down the chromedriver process,
        # and the finally clause guarantees it even if a lookup fails.
        driver.quit()


def _scrape_job_details(job_urls):
    """Visit each URL in a headless browser and return one row per job.

    Each row is ``[company name, job title, salary, job detail text, url]``.
    """
    rows = []
    if not job_urls:
        return rows
    # Headless mode: the detail pages are loaded without opening a window.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # A single headless browser is reused for all detail pages instead of
    # launching (and leaking) one Chrome instance per job.
    detail_driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        for job_url in job_urls:
            print(job_url)
            detail_driver.get(job_url)
            # Shorter waits were reported by the author to leave the page
            # partially loaded, which makes the lookups below fail.
            time.sleep(5)
            company_name = (
                detail_driver.find_element_by_class_name('job_company_content')
                .find_element_by_tag_name('em')
                .text
            )
            title = detail_driver.find_element_by_class_name('job-name').get_attribute('title')
            salary = (
                detail_driver.find_element_by_class_name('job_request')
                .find_element_by_tag_name('h3')
                .text
            )
            detail = detail_driver.find_element_by_class_name('job-detail').text
            rows.append([company_name, title, salary, detail, job_url])
    finally:
        detail_driver.quit()
    return rows


def _open_sheet(path, title):
    """Return ``(workbook, sheet)`` with a fresh sheet named after *title*.

    If *path* exists, the workbook is loaded and a new sheet is added to it;
    otherwise a new workbook is created and its initial sheet is used.
    """
    # openpyxl rejects these characters in sheet titles (e.g. "C/C++" would
    # raise ValueError), and Excel limits titles to 31 characters.
    safe_title = title.translate(str.maketrans(dict.fromkeys('\\*?:/[]', '_')))[:31]
    if os.path.exists(path):
        wb = openpyxl.load_workbook(path)
        sheet = wb.create_sheet()
    else:
        # Create the target directory now so a missing folder is not only
        # discovered at save time, after all the scraping has been done.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        wb = openpyxl.Workbook()
        sheet = wb.active
    if safe_title:
        # An empty keyword keeps openpyxl's default sheet name.
        sheet.title = safe_title
    return wb, sheet


def save_job(company, path='C:/Users/Xpeng/Desktop/python_resource/爬取到的表格/岗位表格.xlsx'):
    """Scrape Lagou search results for a job keyword into an Excel workbook.

    Opens lagou.com in a visible Chrome window, enters the Shenzhen site,
    searches for *company*, then loads every result link in a headless
    Chrome to read the company name, job title, salary, job details and URL.
    The rows are written, below a header row, to a new sheet of the workbook
    at *path*; the workbook is created if it does not exist yet.

    Args:
        company: The job keyword typed into the search box. It is also used
            (with characters invalid in sheet titles replaced by ``_``) as
            the title of the new sheet.
        path: Location of the ``.xlsx`` workbook to create or extend.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If an expected
            element is missing from a page (for example, it has not finished
            loading). Both browsers are still shut down in that case.
    """
    job_urls = _collect_job_urls(company)
    # Open the workbook before the slow scraping step so that an unreadable
    # file fails fast.
    wb, sheet = _open_sheet(path, company)
    works = _scrape_job_details(job_urls)

    # The sheet is new and empty, so this header lands in row 1.
    sheet.append(['公司名', '职位名', '工资', '招聘细节', '招聘链接'])
    for row in works:
        print(row)
        sheet.append(row)
    wb.save(path)
if __name__ == '__main__':
    # Prompt for the job keyword only when run as a script, so that importing
    # this module does not block on input or launch a browser.
    company = input("你想搜索的职位:")
    save_job(company)
执行结果: