Code:

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import csv
import os
import time
import json


class spider(object):
    def __init__(self, type, page):
        self.type = type  # job keyword to search for
        self.page = page  # current page number
        self.spiderUrl = 'https://www.zhipin.com/web/geek/job?query=%s&city=101281100&page=%s'
    def startBrower(self):
        service = Service('C:/Users/23653/PycharmProjects/chromedriver.exe')
        options = webdriver.ChromeOptions()
        # Reuse an existing browser session to evade anti-bot checks;
        # before running this, no other Chrome instance may be running.
        options.add_experimental_option('debuggerAddress', 'localhost:9222')
        # options.add_experimental_option('excludeSwitches', ['enable-automation'])
        brower = webdriver.Chrome(service=service, options=options)
        return brower
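    # A minimal sketch of the browser-reuse setup assumed above: Chrome must
    # already be running with remote debugging enabled before this script
    # attaches to it, e.g. launched manually first (paths are illustrative):
    #   chrome.exe --remote-debugging-port=9222 --user-data-dir="C:/chrome-debug-profile"
    # Otherwise 'debuggerAddress: localhost:9222' has nothing to connect to.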
    def main(self, page):  # page is the last page number to crawl
        if self.page > page:
            return
        brower = self.startBrower()
        print("Crawling page: " + self.spiderUrl % (self.type, self.page))
        brower.get(self.spiderUrl % (self.type, self.page))
        time.sleep(15)  # wait 15 seconds for the page to load
        job_list = brower.find_elements(by=By.XPATH, value='//ul[@class="job-list-box"]/li')
        for index, job in enumerate(job_list):
            try:
                print("Crawling item %d" % (index + 1))
                # job title
                title = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-title')]/span[@class='job-name']").text
                # city (text like '广州·天河区' splits on '·')
                addresses = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-title')]/span[@class='job-area-wrapper']/span").text.split('·')
                address = addresses[0]
                # district, when present
                if len(addresses) != 1:
                    dist = addresses[1]
                else:
                    dist = ''
                # job keyword (note: shadows the built-in type(), harmless here)
                type = self.type
                tag_list = job.find_elements(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]/ul[@class='tag-list']/li")
                if len(tag_list) == 2:
                    # education requirement
                    educational = tag_list[1].text
                    # work experience
                    workExperience = tag_list[0].text
                else:
                    # education requirement
                    educational = tag_list[2].text
                    # work experience
                    workExperience = tag_list[1].text
                # recruiter name
                hrName = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]/div[@class='info-public']").text
                # recruiter title
                hrWork = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]/div[@class='info-public']/em").text
                # job tags
                workTag = job.find_elements(by=By.XPATH, value="./div[contains(@class,'job-card-footer')]/ul[@class='tag-list']/li")
                workTag = json.dumps(list(map(lambda x: x.text, workTag)))
                # internship flag (1 = internship posting)
                pratice = 0
                salaries = job.find_element(by=By.XPATH, value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]/span[@class='salary']").text
                if salaries.find('K') != -1:
                    salaries = salaries.split('·')
                    # monthly salary range in yuan
                    salary = list(map(lambda x: int(x) * 1000, salaries[0].replace('K', '').split('-')))
                    # year-end bonus months (e.g. '13薪'), '0薪' when absent
                    salaryMonth = salaries[1] if len(salaries) > 1 else '0薪'
                else:
                    # daily wage in yuan ('元/天' = yuan per day): an internship posting
                    salary = list(map(int, salaries.replace('元/天', '').split('-')))
                    salaryMonth = '0薪'
                    pratice = 1
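                # Illustrative examples of the salary formats handled above:
                #   '15-25K·13薪'  -> salary [15000, 25000], salaryMonth '13薪'
                #   '10-20K'       -> salary [10000, 20000], salaryMonth '0薪'
                #   '200-300元/天' -> salary [200, 300], pratice = 1 (internship)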
                # company name
                companyTitle = job.find_element(by=By.XPATH, value=".//div[@class='job-card-right']/div[contains(@class,'company-info')]/h3/a").text
                # company logo
                companyAvatar = job.find_element(by=By.XPATH, value=".//div[@class='job-card-right']/div[contains(@class,'company-logo')]/a/img").get_attribute("src")
                companyInfos = job.find_elements(by=By.XPATH, value=".//div[@class='job-card-right']/div[contains(@class,'company-info')]/ul[@class='company-tag-list']/li")
                if len(companyInfos) == 3:
                    # company type
                    companyNature = companyInfos[0].text
                    # financing status
                    companyStatus = companyInfos[1].text
                    # headcount
                    companyPeoples = companyInfos[2].text
                else:
                    # company type
                    companyNature = companyInfos[0].text
                    # financing status missing from the card ('未融资' = unfunded)
                    companyStatus = '未融资'
                    # headcount
                    companyPeoples = companyInfos[1].text
                if companyPeoples != '1000人以上':  # '1000人以上' = over 1000 employees
                    companyPeople = list(map(int, companyPeoples.replace('人', '').split('-')))
                else:
                    companyPeople = [0, 10000]
                # company benefits
                companyTags = job.find_element(by=By.XPATH, value='./div[contains(@class,"job-card-footer")]/div[@class="info-desc"]').text
                if not companyTags:
                    companyTags = '无'
                else:
                    companyTags = json.dumps(companyTags.split(','))
                # job detail page URL
                detailUrl = job.find_element(by=By.XPATH, value='.//a[@class="job-card-left"]').get_attribute('href')
                # company detail page URL
                companyUrl = job.find_element(by=By.XPATH, value='.//div[@class="job-card-right"]/div[@class="company-info"]/h3/a').get_attribute('href')
                # column order must match the header row written in init()
                jobData = [title, address, type, educational, workExperience, workTag,
                           salary, salaryMonth, companyTags, hrWork, hrName, pratice,
                           companyTitle, companyAvatar, companyNature, companyStatus,
                           companyPeople, detailUrl, companyUrl, dist]
                self.save_to_csv(jobData)
            except Exception as e:
                # skip cards whose markup does not match the XPaths above
                print("Skipping item %d: %s" % (index + 1, e))
        self.page += 1
        self.main(page)
    # data cleaning
    def clear_csv(self):
        df = pd.read_csv('./temp.csv')
        df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
        df['salaryMonth'] = df['salaryMonth'].map(lambda x: x.replace('薪', ''))
        print("Total rows: %d" % df.shape[0])
        return df.values

    def save_to_csv(self, rowData):
        with open('./temp.csv', 'a', newline='', encoding='utf-8') as wf:
            writer = csv.writer(wf)
            writer.writerow(rowData)
    def init(self):
        if not os.path.exists('./temp.csv'):
            with open('./temp.csv', 'a', newline='', encoding='utf-8') as wf:
                writer = csv.writer(wf)
                writer.writerow(["title", "address", "type", "educational", "workExperience", "workTag",
                                 "salary", "salaryMonth", "companyTags", "hrWork", "hrName", "pratice",
                                 "companyTitle", "companyAvatar", "companyNature", "companyStatus",
                                 "companyPeople", "detailUrl", "companyUrl", "dist"])
if __name__ == "__main__":
spiderObj = spider('java', 1)# 职业 与 初始页面
spiderObj.init()
spiderObj.main(10)#爬取11页,初始页1+爬取页10

Output:

\a3\python.exe D:\数据可视化\main.py
Traceback (most recent call last):
  File "D:\a3\lib\site-packages\selenium\webdriver\common\driver_finder.py", line 64, in _binary_paths
    raise ValueError(f"The path is not a valid file: {path}")
ValueError: The path is not a valid file: C:/Users/23653/PycharmProjects/chromedriver.exe

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "D:\数据可视化\main.py", line 210, in <module>
    spiderObj.main(10)#爬取11页,初始页1+爬取页10
  File "D:\数据可视化\main.py", line 28, in main
    brower = self.startBrower()
  File "D:\数据可视化\main.py", line 22, in startBrower
    brower = webdriver.Chrome(service=service, options=options)
  File "D:\a3\lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 47, in __init__
    super().__init__(
  File "D:\a3\lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 53, in __init__
    if finder.get_browser_path():
  File "D:\a3\lib\site-packages\selenium\webdriver\common\driver_finder.py", line 47, in get_browser_path
    return self._binary_paths()["browser_path"]
  File "D:\a3\lib\site-packages\selenium\webdriver\common\driver_finder.py", line 78, in _binary_paths
    raise NoSuchDriverException(msg) from err
selenium.common.exceptions.NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location
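
The traceback says chromedriver.exe does not exist at the hard-coded Service path. A minimal sketch of the usual fix, assuming Selenium 4.6+ (which bundles Selenium Manager): either point Service at a chromedriver.exe that actually exists, or drop the explicit path entirely and let Selenium resolve a matching driver, keeping the debugger-reuse option:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_experimental_option('debuggerAddress', 'localhost:9222')
driver = webdriver.Chrome(options=options)  # Selenium Manager locates a matching chromedriver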