利用Python自制批量下载文献程序_xmol文献不能批量下载吗-CSDN博客

本文链接：https://blog.csdn.net/weixin_45288557/article/details/111057810

批量下载文献

1、导入库
2、打开网站并设置网页初始选项
3、关键词搜索
4、选择下载格式及批量下载到几页
5、开始批量下载
打包后的 PyCNKi.exe 程序已同步发布在“佐佑思维”公众号，公众号二维码如下：

1、导入库

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
import openpyxl
import re
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.select import Select
import urllib.error

2、打开网站并设置网页初始选项

def wu_visual():
    """Return Chrome ``Options`` carrying the headless and no-GPU flags."""
    headless_opts = Options()
    # Apply each command-line flag in the same order as before.
    for flag in ('--headless', '--disable-gpu'):
        headless_opts.add_argument(flag)
    return headless_opts

def fan_jiance():
    """Return ``ChromeOptions`` that exclude the ``enable-automation`` switch."""
    stealth_opts = ChromeOptions()
    excluded_switches = ['enable-automation']
    stealth_opts.add_experimental_option('excludeSwitches', excluded_switches)
    # Optional kiosk flag, left disabled by the author:
    #stealth_opts.add_argument('-kiosk')
    return stealth_opts
def url_error_test(url, bro):
    """Load ``url`` through ``bro`` and report any urllib error raised.

    Args:
        url: Address handed to ``bro.get``.
        bro: Object exposing a ``get(url)`` method (the script passes a
            Selenium driver).

    Returns:
        ``None`` when ``bro.get`` completed without a urllib error,
        otherwise the ``reason`` attribute of the caught error.
    """
    # NOTE(review): a Selenium driver normally raises its own
    # WebDriverException family rather than urllib errors -- confirm that
    # these handlers can actually fire for the driver in use.
    #
    # The reason is stored in a local because the name bound by
    # ``except ... as e`` is deleted when the handler ends; returning
    # ``e.reason`` after the try statement always failed.
    reason = None
    try:
        bro.get(url)
    except urllib.error.HTTPError as e:
        # HTTPError is the more specific class, so it must be caught first.
        print(e.code)
        print(e.reason)
        reason = e.reason
    except urllib.error.URLError as e:
        print(e.reason)
        reason = e.reason
    else:
        print("OK")
    return reason

# Build both option sets, then copy the anti-detection settings into the
# headless set so a single object carries everything.
# In Selenium 3.x the deprecated ``chrome_options`` keyword replaces
# ``options`` when both are supplied, so passing them side by side silently
# dropped the settings produced by fan_jiance().
chrome_options = wu_visual()
option = fan_jiance()
for opt_name, opt_value in option.experimental_options.items():
    chrome_options.add_experimental_option(opt_name, opt_value)

chrome_path = r'./chromedriver.exe'
bro = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)

# Firefox users can uncomment the next line instead.
#bro = webdriver.Firefox()

bro.maximize_window()  # maximize the browser window
url = r'http://kns.cnki.net'  # site root; also reused to build detail links
bro.get(url)

3、关键词搜索

# Simulate typing a keyword query into the search box.
# Only title search is provided by this script.
input_title = bro.find_element_by_id("txt_SearchText")
input_title.click()
time.sleep(2)
key_value = input("请输入你要下载的论文标题：")

input_title.send_keys(key_value)
# Click the search button.
div_search = bro.find_element_by_xpath('/html/body/div[1]/div[2]/div/div[1]/input[2]')
div_search.click()
time.sleep(1)

# Click the journal-papers tab and read its result counter.
default_1 = 20  # presumably the number of results per page -- TODO confirm
bro.find_element_by_xpath("/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/span").click()
time.sleep(10)
total_num = bro.find_element_by_xpath("/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/em")

print("一共搜索到" + total_num.text + "条结果")
if int(total_num.text) <= default_1:
    print("共一页")
    # Everything fits on one page. ``num`` must still be defined because the
    # page-number validation later in the script compares against it.
    num = 1
else:
    total_page = bro.find_element_by_xpath('//*[@id="gridTable"]/div[2]/span[1]')
    print(total_page.text)
    # Drop the first and last characters of the label, keeping the number.
    num = int(total_page.text[1:-1])

4、选择下载格式及批量下载到几页

# Ask for the download format until the answer is 1 (PDF) or 2 (CAJ).
# Any other value would make the download step skip every paper, and
# non-numeric text would crash int(), so both are rejected here.
print("1:PDF格式\n2:CAJ格式\n请输入下载文件的格式对应数字：")
while True:
    try:
        load_num = int(input("请输入1 or 2："))
    except ValueError:
        load_num = 0  # sentinel that fails the check below
    if load_num in (1, 2):
        break
    print("输入格式错误，请重新输入：")

# Prompt for the last page to download; the answer is read by the next step.
print("请输入您要下载到第几页码：")

5、开始批量下载

# XPath of the paper title on a detail page.
_TITLE_XPATH = "/html/body/div[2]/div[1]/div[3]/div/div[1]/div[3]/div[1]/h1"
# XPath of the element whose text is compared with the error message.
_ERROR_XPATH = '/html/body/div[2]/div'
# XPath of the result links on a search-result page.
_RESULT_LINK_XPATH = '//*[@id="gridTable"]/table/tbody/tr/td[2]/a'
# Maps the chosen format number to the download button's id/name.
_DOWNLOAD_BUTTONS = {1: 'pdfDown', 2: 'cajDown'}


def _read_page_number(prompt=""):
    """Read an integer from stdin; return 0 (never valid) for other input."""
    try:
        return int(input(prompt))
    except ValueError:
        return 0


def _first_text(driver, xpath):
    """Return the text of the first element matching ``xpath``, or ""."""
    matches = driver.find_elements_by_xpath(xpath)
    return matches[0].text if matches else ""


def _download_paper(link, count):
    """Open ``link`` in a fresh browser, click the download button, report.

    The browser is always closed, whichever path is taken.
    """
    # Same constructor arguments as the main browser of the script.
    driver = webdriver.Chrome(executable_path=chrome_path, chrome_options=chrome_options, options=option)
    try:
        driver.get(link)
        driver.maximize_window()
        time.sleep(10)
        label = "编号为" + str(count) + "的论文："
        if _first_text(driver, _ERROR_XPATH) == "URL参数错误":
            # Read the title while the browser is still open.
            print(label + _first_text(driver, _TITLE_XPATH) + "————论文下载失败")
            return
        button_key = _DOWNLOAD_BUTTONS.get(load_num)
        buttons = []
        if button_key is not None:
            # The original looked the button up by id in one branch and by
            # name in the other; try id first and fall back to name.
            buttons = (driver.find_elements_by_id(button_key)
                       or driver.find_elements_by_name(button_key))
        if not buttons:
            print(label + _first_text(driver, _TITLE_XPATH) + "————论文下载失败")
            return
        buttons[0].click()
        time.sleep(10)  # leave time for the download to start
        print(label + _first_text(driver, _TITLE_XPATH) + "————下载成功")
    finally:
        driver.quit()


# ``num`` only exists when the search produced more than one page, so it is
# read lazily; a single-page result allows page 1 only.
max_page = 1 if int(total_num.text) <= default_1 else num

load_page = _read_page_number()
while load_page > max_page or load_page <= 0:
    print("输入页码错误，请重新输入：")
    load_page = _read_page_number("请输入您要下载到第几页码：")

for page_index in range(load_page):
    result_links = bro.find_elements_by_xpath(_RESULT_LINK_XPATH)
    # Numbering restarts at 1 on every result page, as in the original
    # multi-page branch.
    for count, result in enumerate(result_links, start=1):
        # NOTE(review): the slice at 20 assumes a fixed-length href prefix
        # -- confirm against the current page markup.
        link = url + r'/kcms/detail/detail.aspx?' + result.get_attribute("href")[20:]
        _download_paper(link, count)
    # Advance only while more requested pages remain, so the script does not
    # click "next" after the last requested page.
    if page_index + 1 < load_page:
        bro.find_element_by_xpath('//*[@id="PageNext"]').click()
        time.sleep(10)