思路:通过使用Selenium
模块来模拟浏览器操作,然后解析页面,获取岗位信息。
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
if __name__ == '__main__':
    option = webdriver.EdgeOptions()
    # Headless mode: scrape without opening a visible browser window.
    option.add_argument('headless')
    # Uses the Edge profile I previously logged in with (cookies avoid the
    # login wall -- see the note below the code).
    browser = webdriver.Edge(options=option)
    try:
        browser.get('https://www.nowcoder.com/jobs/intern/center?recruitType=2&city=%E5%8C%97%E4%BA%AC&city=%E4%B8%8A%E6'
                    '%B5%B7&city=%E5%B9%BF%E5%B7%9E&city=%E6%B7%B1%E5%9C%B3&city=%E6%9D%AD%E5%B7%9E&city=%E5%8D%97%E4%BA'
                    '%AC&city=%E6%AD%A6%E6%B1%89&careerJob=11002')
        while True:
            # Wait (up to 10s) until at least one job card has rendered.
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="job-card-item"]'))
            )
            jobs = parse_per_page(browser)  # parse the current page
            write_to_excel(jobs)            # persist this page's jobs
            btn_next = browser.find_element(By.XPATH, '//button[@class="btn-next"]')
            # BUGFIX: Selenium's get_attribute() returns 'true' for a present
            # boolean attribute and None when it is absent -- it never returns
            # the literal string 'disabled'. Testing for presence (not None)
            # works in both cases; comparing to 'disabled' looped forever.
            if btn_next.get_attribute('disabled') is not None:
                break  # no more pages
            btn_next.click()
    finally:
        # Always release the browser, even if scraping raises mid-way.
        browser.quit()
在爬取之前,建议在浏览器中先登录自己的牛客网账号,这样会在站点留下Cookie,从而在爬取的时候避免出现需要登录的情况。
可以在浏览器中先选择好自己要查找的岗位筛选条件,然后复制url,替换上面browser.get()
中的内容。
`Job` 类的定义:封装了一个岗位的各项信息(名称、链接、薪资、地点、公司),方便统一存储和输出岗位数据。
class Job:
    """A single job posting scraped from the listing page.

    Holds the display name, detail-page URL, salary text, work location,
    and company name, all as plain strings.
    """

    def __init__(self, job_name, job_url, salary, addr, company):
        # Removed the redundant class-level string defaults: every field is
        # unconditionally assigned here, so the class attributes were only
        # ever shadowed and served no purpose.
        self.job_name = job_name
        self.job_url = job_url
        self.salary = salary
        self.addr = addr
        self.company = company

    def __str__(self):
        # Tab-separated single line, convenient for quick console dumps.
        return '\t'.join(
            (self.job_name, self.salary, self.addr, self.company, self.job_url)
        )

    def __repr__(self):
        return (f'Job(job_name={self.job_name!r}, job_url={self.job_url!r}, '
                f'salary={self.salary!r}, addr={self.addr!r}, '
                f'company={self.company!r})')
解析页面部分的代码:
def parse_per_page(browser):
    """Collect a Job record for every job card on the current page."""
    results = []
    for card in browser.find_elements(By.XPATH, '//div[@class="job-card-item"]'):
        # Small closure to keep the five lookups below readable.
        def grab(xpath):
            return card.find_element(By.XPATH, xpath)

        link = grab('.//a[@class="job-message-boxs"]')
        results.append(
            Job(
                job_name=grab('.//span[@class="job-name"]').text,
                job_url=link.get_attribute('href'),
                salary=grab('.//span[contains(@class,"job-salary")]').text,
                addr=grab('.//div[contains(@class,"job-info-item")]').text,
                company=grab('.//div[@class="company-name"]').text,
            )
        )
    return results
写入Excel部分的代码:
def write_to_excel(job_list):
    """Append the given Job records to ./jobs.xlsx (Sheet1).

    Creates the workbook on first call; on later calls it reads the
    existing rows and rewrites the sheet with the new rows appended.

    Fixes three bugs in the previous version:
    - ``pd.ExcelWriter(..., mode="a")`` raised FileNotFoundError when
      jobs.xlsx did not yet exist (the very first run).
    - ``writer.sheets["Sheet1"]`` raised KeyError before the sheet existed.
    - ``if_sheet_exists="replace"`` overwrote Sheet1 wholesale, so earlier
      pages were lost instead of appended.
    """
    # Flatten the Job instances into a list of plain dicts for pandas.
    data = [
        {
            "Job Name": job.job_name,
            "Salary": job.salary,
            "Address": job.addr,
            "Company": job.company,
            "Job URL": job.job_url,
        }
        for job in job_list
    ]
    df = pd.DataFrame(data)

    path = Path("./jobs.xlsx")
    if path.exists():
        # Genuinely append: read what is already there and concatenate,
        # then rewrite the sheet in one shot.
        existing = pd.read_excel(path, sheet_name="Sheet1")
        df = pd.concat([existing, df], ignore_index=True)
    df.to_excel(path, index=False, sheet_name="Sheet1")
结果展示: