# Import the required libraries.
import time
import json
import warnings

from selenium import webdriver
from sqlalchemy import create_engine
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Many warnings show up in the terminal at run time; silence them.
warnings.filterwarnings('ignore')


class Download():
    """Log in to a web site with headless Chrome and download report files.

    Element locators (XPath strings) are read from ``elements`` and looked up
    through ``keys``.

    NOTE(review): ``keys`` is not defined in this section of the file; it is
    presumably a module-level sequence holding the keys of ``elements`` in
    order -- confirm against the rest of the file.
    """

    def __init__(self, url, year, path, chrome, username, password, elements):
        """Store the run configuration.

        Args:
            url: Address of the login page.
            year: Stored on the instance; not used by the methods shown here.
            path: Default download directory handed to Chrome.
            chrome: Path to the chromedriver executable.
            username: Text typed into the user-name field.
            password: Text typed into the password field.
            elements: Mapping of locator names to XPath strings.
        """
        self.year = year
        self.url = url
        self.path = path
        self.chrome = chrome
        self.username = username
        self.password = password
        self.elements = elements

    # Browser settings
    def web_sets(self):
        """Build the Chrome options and open the headless browser session.

        Sets ``self.options``, ``self.c_service``, ``self.prefs`` and
        ``self.br`` (the WebDriver instance).
        """
        self.options = webdriver.ChromeOptions()  # Google Chrome is used
        # The driver service is handed to webdriver.Chrome below, which starts
        # it itself. Starting it manually as well would spawn a second
        # chromedriver process.
        self.c_service = Service(str(self.chrome))
        # Provide the default download directory.
        self.prefs = {'download.default_directory': str(self.path)}
        self.options.add_experimental_option('prefs', self.prefs)
        # Ignore errors caused by security certificates.
        self.options.add_argument('ignore-certificate-errors')
        # A few minor settings.
        self.options.add_experimental_option(
            'excludeSwitches', ["enable-automation"])
        # Was misspelled '--np-sanbox', which Chrome silently ignores.
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        # Extra switch added to avoid a bug.
        self.options.add_argument('disable-gpu')
        # Enable headless mode.
        self.options.add_argument('headless')
        # Selenium 4 API: the positional driver path and ``chrome_options``
        # keyword are deprecated and removed in Selenium 4.10+.
        self.br = webdriver.Chrome(
            service=self.c_service, options=self.options)
        self.br.implicitly_wait(3)

    def loginPage(self):
        """Open the login page, fill in the credentials and submit.

        All element locators are kept in a JSON file, so the find_element
        calls stay short and the code stays compact.
        Note that ``find_element(By.XPATH, ...)`` is the current Selenium
        API; the older style raises errors.
        """
        self.br.get(self.url)
        time.sleep(4)
        user_box = self.br.find_element(
            By.XPATH, str(self.elements[keys[2]]))
        user_box.send_keys(self.username)
        time.sleep(2)
        password_box = self.br.find_element(
            By.XPATH, str(self.elements[keys[3]]))
        password_box.send_keys(self.password)
        time.sleep(2)
        self.br.find_element(By.XPATH, str(self.elements[keys[4]])).click()
        time.sleep(2)

    # Jump to the final page
    def skipPage(self, url):
        """Navigate the browser to ``url`` and pause briefly."""
        self.br.get(url)
        time.sleep(2)

    # Download the files
    def download_excel(self):
        """Click through every list entry whose title contains '日报'.

        For each matching ``li`` the entry is selected, the element located
        by ``elements[keys[9]]`` is clicked, and the entry is clicked again.
        The browser and the driver service are always shut down at the end,
        even when a Selenium call fails.
        """
        try:
            # Locate the ul element and collect the li tags below it.
            ul2 = self.br.find_element(
                By.XPATH, str(self.elements[keys[6]]))
            lis2 = ul2.find_elements(By.XPATH, 'li')
            time.sleep(1)
            # li positions in XPath start at 1 while Python counts from 0,
            # so iterate over 1..len(lis2).
            for j in range(1, len(lis2) + 1):
                # Read the title of the li tag.
                name = self.br.find_element(
                    By.XPATH, str(self.elements[keys[7]][1]) % j,
                ).get_attribute('title')
                print(f'li标签name: {name}')
                # get_attribute returns None when the attribute is missing.
                if name and '日报' in name:
                    print(f'第二遍过滤name: {name}')
                    li_test = self.br.find_element(
                        By.XPATH, str(self.elements[keys[8]]) % j)
                    self.br.execute_script('arguments[0].click();', li_test)
                    time.sleep(0.5)
                    self.br.find_element(
                        By.XPATH, str(self.elements[keys[9]])).click()
                    time.sleep(0.5)
                    li_test2 = self.br.find_element(
                        By.XPATH, str(self.elements[keys[8]]) % j)
                    time.sleep(1)
                    # Click through JavaScript to avoid errors from an
                    # overlapping element: the file has to be selected and
                    # downloaded repeatedly, and a plain click can be
                    # intercepted.
                    self.br.execute_script(
                        "arguments[0].click();", li_test2)
                    time.sleep(8)
            # NOTE(review): the original indentation was lost, so the nesting
            # level of these two pauses is a reconstruction -- confirm
            # whether the 10 s pause belongs inside the loop.
            time.sleep(10)
            time.sleep(12)
        finally:
            # Quit the browser and stop the background driver service.
            # c_service.stop() is the counterpart of the service start-up.
            try:
                self.br.quit()
            finally:
                self.c_service.stop()