本案例使用selenium控制本地Chrome浏览器(而不是selenium新打开的无cookie浏览器),
抓取BOSS直聘相关职位信息并写入数据库;配置好环境后可直接运行。
代码如下:
"""Readme:
create time: 2021-08-01
1. 请使用pip安装好相应扩展库:lxml、pywin32(提供win32api模块)、selenium、pymysql,并配置好ChromeDriver等相关环境
2. 本次使用selenium控制的浏览器为本地Chrome,使用类时请将Chrome安装路径传入参数中
3. 为了更好地抓取相关信息,请提前在Chrome浏览器登录BOSS直聘,否则会受到反爬限制
4. 执行写入数据库操作时记得提前将 BOSS库和jobs_info表创建好
"""
import os
import time
import random
import pymysql
import win32api
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
class BOSS(object):
def __init__(self, ChromePath, Keyword, QueryQuantity=30):
"""
:param ChromePath: chrome安装路径
:param Keyword: 搜索职位关键字
:param QueryQuantity: 获取职位的数量,默认获取30条
"""
self.Url = "https://www.zhipin.com/"
self.ChromePath = ChromePath
self.Keyword = Keyword
self.jobs_quantity = QueryQuantity
self.Browser = None
self.xpath = {}
self.job_info = {}
self.jobs_list = []
def start_browser(self):
    """Start the local Chrome in remote-debugging mode and attach Selenium to it.

    Any running Chrome is terminated first, then Chrome is launched with
    ``--remote-debugging-port=9222`` on ``self.Url`` and a WebDriver session
    is attached through ``127.0.0.1:9222``. The method then waits up to
    10 seconds for an element with id ``main``; if that fails it prints a
    hint, closes the browser and exits the process with status -1.

    :raises AssertionError: if ``self.ChromePath`` is None
    :raises SystemExit: if the ``main`` element does not show up in time
    """
    debug_port = 9222

    chrome_path = self.ChromePath
    # Validate before any running Chrome is killed, and with an explicit raise
    # so the check survives ``python -O`` (the exception type is kept for
    # callers that relied on the former ``assert``).
    if chrome_path is None:
        raise AssertionError("获请传入chrome.exe 绝对路径")

    def is_exe_running(exe="chrome.exe"):
        """Return True if ``exe`` shows up in the ``tasklist`` output."""
        # The context manager closes the pipe instead of leaking it.
        with os.popen(f'''tasklist | findstr "{exe}" ''') as pipe:
            return exe in pipe.read()

    def close_exe_program(exe="chrome.exe"):
        """Force-kill ``exe`` if it is running; return whether a kill was issued."""
        if not is_exe_running(exe):
            return False
        # Draining and closing the pipe waits for taskkill to finish.
        with os.popen(f"""taskkill -F /im {exe}""") as pipe:
            pipe.read()
        return True

    def start_program(path, params=""):
        """Open ``path`` through the Windows shell with its folder as working dir."""
        win32api.ShellExecute(0, 'open', path, params, os.path.split(path)[0], 1)

    def start_debugging_chrome(url=""):
        """Restart Chrome with the remote-debugging port open on ``url``."""
        # NOTE(review): presumably an already-running Chrome would not open
        # the debugging port for a second launch, hence the kill -- confirm.
        if close_exe_program():
            time.sleep(1)
        path = chrome_path
        # Accept either the install directory or the full path to the
        # executable; the comparison ignores case ("Chrome.exe").
        if not path.lower().endswith('chrome.exe'):
            path = os.path.join(path, 'chrome.exe')
        start_program(path, f"--remote-debugging-port={debug_port} {url}")

    start_debugging_chrome(url=self.Url)

    option = Options()
    option.add_experimental_option("debuggerAddress", f"127.0.0.1:{debug_port}")
    self.Browser = webdriver.Chrome(options=option)
    self.Browser.maximize_window()
    self.Browser.switch_to.window(self.Browser.window_handles[0])
    try:
        # find_element("id", ...) works on Selenium 3 and 4; the
        # find_element_by_id shortcut was removed in Selenium 4.3 and its
        # AttributeError used to be misreported below as a network failure.
        WebDriverWait(driver=self.Browser, timeout=10).until(
            lambda d: d.find_element("id", "main")
        )
    except Exception:
        print("打开页面失败,请检查网络是否正常后重新运行此程序(如果网络正常尝试改变此方法中的element元素或将try语句注释)")
        self.close_browser()
        # Same effect as exit(-1) without depending on the ``site`` helper.
        raise SystemExit(-1)
def close_browser(self):
    """Quit the attached WebDriver session, if there is one.

    Does nothing when ``self.Browser`` is None (browser never started or
    already closed). After quitting, ``self.Browser`` is reset to None so a
    repeated call is a no-op.
    """
    browser = self.Browser
    if browser is None:
        return
    try:
        browser.quit()
    finally:
        # Drop the handle even if quit() raised; the session is unusable anyway.
        self.Browser = None
def wait_element_loaded(self, xpath: str, timeout=10, close_brows