import random
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
def Chrome():
    chrome_driver = './chromedriver.exe'
    option = webdriver.ChromeOptions()  # configure the Chrome driver
    # option.add_argument("--headless")  # run without a visible browser window
    option.add_experimental_option('useAutomationExtension', False)  # disable Chrome's automation extension
    option.add_experimental_option('excludeSwitches', ['enable-automation'])  # drop the "enable-automation" switch so automation mode causes no spurious errors
    option.add_argument("--disable-blink-features=AutomationControlled")  # disable the Blink feature that marks the browser as automation-controlled
    # Selenium 4 passes the driver path via a Service object; the old
    # chrome_options/executable_path keyword arguments have been removed.
    bs = webdriver.Chrome(service=Service(chrome_driver), options=option)
    # Hide the webdriver flag from the page; without this the site detects
    # automation and the slider verification fails.
    bs.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false
            })
        """
    })
    return bs
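
# Not part of the original script: a minimal sketch of an explicit wait that
# could replace the fixed time.sleep() pauses used below. WebDriverWait polls
# until the condition holds (or times out), so a slow page load does not break
# the scrape. The XPath is an assumption reusing the list container targeted
# in PQ and may differ on the live page.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_listings(bs, timeout=30):
    # Return the first <li> of the result list once it appears in the DOM,
    # raising selenium.common.exceptions.TimeoutException after `timeout` seconds.
    return WebDriverWait(bs, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@id="wrap"]/div[2]/div[2]/div/div[1]//ul/li')))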
qwer = []  # accumulated rows; one six-field list per job posting
def PQ(bs):
    for i in range(1, 11):
        bs.get(url=hhh + "page=%d" % i)
        num = random.randint(10, 15)
        print("Pausing for %d seconds" % num)
        time.sleep(num)
        print("Collecting data from page %d" % i)
        # On the first results page the job list sits under div[2]; on every
        # later page it sits under div[1], hence the switched container index.
        box = 2 if i == 1 else 1
        li = '//*[@id="wrap"]/div[2]/div[2]/div/div[1]/div[%d]/ul/li[%%d]' % box
        for j in range(1, 31):
            row = li % j
            # name/dz/xz/gs/fl/jy = job title / location / salary / company / benefits / experience
            name = bs.find_element(by=By.XPATH, value=row + '/div[1]/a/div[1]/span[1]')
            dz = bs.find_element(by=By.XPATH, value=row + '/div[1]/a/div[1]/span[2]/span')
            xz = bs.find_element(by=By.XPATH, value=row + '/div[1]/a/div[2]/span[1]')
            gs = bs.find_element(by=By.XPATH, value=row + '/div[1]/div/div[2]/h3/a')
            fl = bs.find_element(by=By.XPATH, value=row + '/div[2]/div')
            jy = bs.find_element(by=By.XPATH, value=row + '/div[1]/a/div[2]/ul/li[1]')
            time.sleep(1)
            print(name.text, dz.text, xz.text, gs.text, fl.text, jy.text)
            qwer.append([name.text, dz.text, xz.text, gs.text, fl.text, jy.text])
        # Save once per page; the original called down_load after every row,
        # which re-wrote the whole accumulated list and duplicated data.
        down_load(qwer)
        time.sleep(2)
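
# Not in the original: a hedged alternative to the fixed range(1, 31) above.
# find_elements returns an empty list instead of raising, so enumerating the
# actual <li> rows tolerates pages with fewer than 30 results. The container
# XPath mirrors the one built in PQ and is an assumption about the page layout.
def iter_rows(bs, box):
    rows = bs.find_elements(
        by=By.XPATH,
        value='//*[@id="wrap"]/div[2]/div[2]/div/div[1]/div[%d]/ul/li' % box)
    for idx in range(1, len(rows) + 1):
        yield idx  # 1-based index, usable with the li XPath template in PQ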
def down_load(data):
    # CSV column names: job title, company location, salary, company name,
    # benefits, required experience
    columns = ['Job Title', 'Company Location', 'Salary', 'Company Name', 'Benefits', 'Required Experience']
    # Pack the collected rows and column names into a DataFrame
    df = pd.DataFrame(data, columns=columns)
    # Overwrite the CSV on every call, since `data` already holds all rows
    # collected so far; the original appended with mode='a', which repeated
    # both header and rows. utf-8-sig keeps the Chinese text readable in Excel.
    df.to_csv('Boss直聘.csv', index=False, encoding='utf-8-sig')
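
# Optional sanity check (not in the original): read the CSV back and confirm
# the saved row count; pd.read_csv pairs with the utf-8-sig encoding above.
def verify_csv(path='Boss直聘.csv'):
    df = pd.read_csv(path)
    print("Saved %d rows with columns: %s" % (len(df), list(df.columns)))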
if __name__ == "__main__":
    bs = None
    try:
        inputname = input("Enter the job title to scrape: ")
        hhh = f'https://www.zhipin.com/web/geek/job?query={inputname}&city=101280100&'
        bs = Chrome()
        PQ(bs)
    except Exception as e:
        print("The program raised an exception:", e)
    finally:
        # Guard against Chrome() failing before bs was assigned
        if bs is not None:
            bs.quit()