【目标】
下载某网站 https://www.pcac.org.cn/eportal/ui?pageId=595055
拟获取列表中文档数据。
先获取页面底部分页栏中显示的总页数;
然后逐步翻页获取每个页面链接。
有些链接指向.pdf文档,可以直接下载;有些链接指向html文件,直接保存;有些有一个附件,直接下载;有些有多个附件,建立目录后保存。
【优化细节】
若文件已经下载过,则自动跳过;
若某个文件下载出错,则自动跳过,继续下一个下载。
【下载附件】推荐用requests库,好处是有错误状态码,且不用判断文件下载时间,且可以重命名文件,故不用driver.get(url)下载文件。
【示范代码】
"""Crawl https://www.pcac.org.cn/eportal/ui?pageId=595055 and download every
listed document: direct .pdf links, plain HTML articles, single attachments
(into the base directory) and multiple attachments (into a per-article
directory).  Already-downloaded files are skipped; failed downloads are
reported and skipped so the crawl always continues.
"""
import os
import time
from urllib import parse

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Browser-like UA so the server does not reject the plain `requests` downloads.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}


def mk_dir(dir1):
    """Create directory *dir1* if it does not already exist."""
    # makedirs(exist_ok=True) avoids the exists-then-mkdir race and also
    # handles nested paths, unlike the original os.mkdir.
    os.makedirs(dir1, exist_ok=True)


def file_exist(base_dir, pure_filename):
    """Return True if *pure_filename* already exists inside *base_dir*."""
    return os.path.exists(os.path.join(base_dir, pure_filename))


def download_file(url, full_name):
    """Download *url* to *full_name* with requests; return True on success.

    Any network/HTTP error is printed and swallowed so the caller simply
    moves on to the next item (requirement: skip failed downloads).  A
    timeout prevents one stalled transfer from hanging the whole crawl.
    """
    try:
        r = requests.get(url, headers=headers, timeout=30)
        print(r.status_code)
        r.raise_for_status()  # do not save an error page as the document
        with open(full_name, 'wb') as f:
            f.write(r.content)
        return True
    except requests.RequestException:
        print(f'>>>except 下载发生错误,跳过{url}')
        return False


def write_html_page(browser, filename):
    """Save the page source of the newest browser window to *filename*."""
    browser.switch_to.window(browser.window_handles[-1])
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(browser.page_source)
    print('-' * 70)


base_dir = os.path.join(os.getcwd(), 'download')
mk_dir(base_dir)

chrome_options = Options()
chrome_options.add_argument("--headless")  # run in background
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": base_dir,
})
driver = webdriver.Chrome(options=chrome_options,
                          service=Service('./chromedriver.exe'))

base_url = 'https://www.pcac.org.cn/eportal/ui?pageId=595055'
driver.get(base_url)
time.sleep(1)

# The pager reads like "85篇文章 当前页:1/6 ..."; the second <b> holds the
# total page count.
total_page = driver.find_element(
    By.XPATH, '//div[@class="easysite-total-page fy-div"]/span[1]/b[2]').text
print(f'最大页数为:{total_page}')

for i in range(1, int(total_page) + 1):
    # BUGFIX: the original URL contained '¤tPage=' — an HTML-entity
    # mangling of '&currentPage=' ('&curren' rendered as ¤).  Restored
    # the real query parameter so paging actually works.
    url2 = ('https://www.pcac.org.cn/eportal/ui?pageId=595055'
            f'&currentPage={i}'
            '&moduleId=b18e5af5da4f400fac1fd7ebf4a8bdd1&staticRequest=yes')
    driver.get(url2)
    li_list = driver.find_elements(
        By.XPATH, '//*[@id="b18e5af5da4f400fac1fd7ebf4a8bdd1"]/div[2]/ul/li')
    # Collect (title, link) pairs first so later driver.get() calls cannot
    # invalidate the list-page elements.
    item_list = []
    for li in li_list:
        tag_a = li.find_element(By.TAG_NAME, 'a')
        item_list.append((tag_a.text, tag_a.get_attribute('href')))
    print('=' * 20 + f'第{i}页,共有{len(item_list)}条记录' + '=' * 15)

    k = 1
    for title, link in item_list:
        print('>>' + '-' * 60)
        print(f'[第{i}页-{k}]-正在获取{link}')
        if '.pdf' in link:
            # Direct PDF link: fetch with requests and rename to the title.
            print(f'【直接获取】pdf文件:{link}')
            old_filename = parse.unquote(link.split('/')[-1])
            new_filename = title + '.pdf'
            full_name = os.path.join(base_dir, new_filename)
            print(old_filename, new_filename)
            if file_exist(base_dir, new_filename):
                print(f'------跳过-->文件<{new_filename}>已下载...')
                k += 1
                continue
            download_file(link, full_name)
            k += 1
        else:
            # Detail page: may be plain HTML, one attachment, or several.
            driver.get(link)
            content = driver.find_element(By.XPATH, '//div[@class="xl-main"]')
            attach_list = content.find_elements(By.TAG_NAME, 'a')
            if not attach_list:
                # Pure HTML article, no attachments: save the page source.
                print('【单个html】网页内容,直接下载')
                fname = os.path.join(base_dir, driver.title + '.html')
                if os.path.exists(fname):
                    print(f'------跳过-->文件<{driver.title + ".html"}>已下载...')
                    k += 1
                    continue
                write_html_page(driver, fname)
                k += 1
                continue
            if len(attach_list) == 1:
                # Single attachment: store it directly in the base directory.
                a_tag = attach_list[0]
                link3 = a_tag.get_attribute('href')
                old_filename = parse.unquote(link3.split('/')[-1])
                new_filename = a_tag.text
                new_full_filename = os.path.join(base_dir, new_filename)
                print('【单个文件】下载:', link3, old_filename,
                      new_filename, new_full_filename)
                if file_exist(base_dir, new_filename):
                    print(f'------跳过-->文件<{new_filename}>已下载...')
                    k += 1
                    continue
                download_file(link3, new_full_filename)
                k += 1
            else:
                # Several attachments: put them into a per-article directory.
                item_dir = os.path.join(base_dir, title)
                mk_dir(item_dir)
                print(f'【多附件下载】,附件个数:{len(attach_list)}')
                print('item_dir', item_dir)
                files = []
                for a_tag in attach_list:
                    link3 = a_tag.get_attribute('href')
                    old_filename = parse.unquote(link3.split('/')[-1])
                    new_filename = a_tag.text
                    full_filename = os.path.join(item_dir, new_filename)
                    print(link3, old_filename, new_filename, full_filename)
                    if os.path.exists(full_filename):
                        print(f'------跳过-->文件<{full_filename}>已下载...')
                        continue
                    files.append((old_filename, new_filename,
                                  full_filename, link3))
                # Download after collecting: the requests calls do not touch
                # the DOM, so the attachment elements cannot go stale.
                for _, _, full_filename, link3 in files:
                    download_file(link3, full_filename)
                k += 1

input('请输入......')
driver.quit()  # BUGFIX: release the browser instead of leaking it
【发文章不易,请多多点赞、关注、支持,谢谢!】