利用Python自制批量下载文献程序

19 篇文章 9 订阅
4 篇文章 0 订阅

1、导入库

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
import openpyxl
import re
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.select import Select
import urllib.error

2、打开网站并设置网页初始选项

def wu_visual():
    """Return Chrome options configured to run without a visible window.

    The returned ``Options`` object carries the ``--headless`` and
    ``--disable-gpu`` command-line switches.
    """
    headless_options = Options()
    for switch in ('--headless', '--disable-gpu'):
        headless_options.add_argument(switch)
    return headless_options

def fan_jiance():
    """Return Chrome options with the ``enable-automation`` switch excluded.

    The switch is removed through the ``excludeSwitches`` experimental
    option so the browser is not started with it.
    """
    stealth_options = ChromeOptions()
    excluded_switches = ['enable-automation']
    stealth_options.add_experimental_option('excludeSwitches', excluded_switches)
    # Kiosk mode is left disabled; enable with add_argument('-kiosk') if wanted.
    return stealth_options
def url_error_test(url, bro):
    """Open *url* with *bro* and report whether a urllib error occurred.

    Prints ``OK`` when the page loads. On failure the error details are
    printed instead (status code and reason for an HTTP error, reason only
    for other URL errors).

    Args:
        url: Address to open.
        bro: Object exposing ``get(url)``; the script passes a WebDriver.

    Returns:
        ``None`` when no error was caught, otherwise the ``reason`` of the
        caught ``urllib.error`` exception.
    """
    # NOTE(review): Selenium drivers presumably raise their own
    # WebDriverException rather than urllib errors -- confirm that these
    # handlers can actually fire for a real browser.
    try:
        bro.get(url)
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError, so it must be handled first.
        print(e.code)
        print(e.reason)
        return e.reason
    except urllib.error.URLError as e:
        print(e.reason)
        return e.reason
    else:
        print("OK")
        # The exception variable only exists inside its handler, so the
        # success path has nothing to report.
        return None

chrome_options = wu_visual()
option = fan_jiance()
# Chrome() uses a single options object: when the deprecated
# ``chrome_options=`` keyword is supplied it replaces ``options=``, so
# passing both silently dropped the settings from fan_jiance(). Copy the
# headless switches into ``option`` and pass only that object.
for headless_switch in chrome_options.arguments:
    option.add_argument(headless_switch)
chrome_path = r'./chromedriver.exe'
bro = webdriver.Chrome(executable_path=chrome_path, options=option)

# Firefox users can uncomment the next line instead.
#bro = webdriver.Firefox()

bro.maximize_window()  # maximise the browser window
url = r'http://kns.cnki.net'  # site root, also used to build detail links
bro.get(url)

3、关键词搜索

# Simulate typing the query into the search box.
# Only searching by title is provided here; pick another query mode on the
# site yourself if you need one.
input_title = bro.find_element_by_id("txt_SearchText")
input_title.click()
time.sleep(2)
key_value = input("请输入你要下载的论文标题:")

input_title.send_keys(key_value)
# Click the search button.
div_search = bro.find_element_by_xpath('/html/body/div[1]/div[2]/div/div[1]/input[2]')
div_search.click()
time.sleep(1)
# Click the journal-article tab.
# Threshold below which the results are treated as a single page
# (presumably the site's page size -- verify against the live page).
default_1 = 20
bro.find_element_by_xpath("/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/span").click()
time.sleep(10)
total_num = bro.find_element_by_xpath("/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/em")
print("一共搜索到" + total_num.text + "条结果")
if int(total_num.text) <= default_1:
    print("共一页")
    # The page count is read by the later page-number check, so it must be
    # defined for single-page results as well.
    num = 1
else:
    total_page = bro.find_element_by_xpath('//*[@id="gridTable"]/div[2]/span[1]')
    print(total_page.text)
    # Drop the first and last character of the label and parse the rest as
    # the page count (assumes a one-character prefix and suffix around the
    # number -- TODO confirm).
    num = int(total_page.text[1:-1])

4、选择下载格式及批量下载到几页

print("1:PDF格式\n2:CAJ格式\n请输入下载文件的格式对应数字:")
# Ask again until the answer is one of the two supported choices. Any other
# number would make the download step skip every paper, and non-numeric
# input would abort the script with ValueError.
while True:
    try:
        load_num = int(input("请输入1 or 2:"))
    except ValueError:
        continue
    if load_num in (1, 2):
        break

print("请输入您要下载到第几页码:")

5、开始批量下载

# XPath of the paper title on a detail page.
_TITLE_XPATH = "/html/body/div[2]/div[1]/div[3]/div/div[1]/div[3]/div[1]/h1"
# Element id of the download button for each format choice (1 = PDF, 2 = CAJ).
# NOTE(review): the original looked these up by id in one branch and by name
# in the other; id is used everywhere now -- confirm against the live page.
_DOWNLOAD_BUTTON_IDS = {1: 'pdfDown', 2: 'cajDown'}


def _open_detail_browser():
    """Start a fresh Chrome instance for one detail page."""
    # Chrome() honours only one options object (``chrome_options=`` replaces
    # ``options=``), so merge the headless switches into the anti-detection
    # options instead of passing both.
    merged_options = fan_jiance()
    for switch in chrome_options.arguments:
        merged_options.add_argument(switch)
    return webdriver.Chrome(executable_path=chrome_path, options=merged_options)


def _download_paper(href, count):
    """Open the detail page behind *href* and click its download button.

    Args:
        href: ``href`` attribute of a link in the result table.
        count: Number printed in the progress message for this paper.
    """
    # Everything after the first 20 characters of the href is appended to the
    # detail URL (presumably the query string -- verify against the site).
    link = url + r'/kcms/detail/detail.aspx?' + href[20:]
    detail = _open_detail_browser()
    try:
        detail.get(link)
        detail.maximize_window()
        time.sleep(10)
        if detail.find_element_by_xpath('/html/body/div[2]/div').text == "URL参数错误":
            # Read the title while the browser is still open; it is closed by
            # the ``finally`` clause below.
            # NOTE(review): the title element may be absent on the error
            # page -- confirm this lookup can succeed.
            print("编号为" + str(count) + "的论文:" + detail.find_element_by_xpath(_TITLE_XPATH).text + "————论文下载失败")
            return
        button_id = _DOWNLOAD_BUTTON_IDS.get(load_num)
        if button_id is None:
            # Unsupported format choice: nothing to click.
            return
        detail.find_element_by_id(button_id).click()
        time.sleep(10)
        print("编号为" + str(count) + "的论文:" + detail.find_element_by_xpath(_TITLE_XPATH).text + "————下载成功")
    finally:
        # Always release the browser, including when a lookup raises.
        detail.quit()


def _download_current_page():
    """Download every paper listed on the result page currently shown in ``bro``."""
    anchors = bro.find_elements_by_xpath('//*[@id="gridTable"]/table/tbody/tr/td[2]/a')
    # Collect the hrefs before opening other browsers so the loop does not
    # keep touching elements of the result page.
    hrefs = [anchor.get_attribute("href") for anchor in anchors]
    # Numbering restarts at 1 on every result page.
    for count, href in enumerate(hrefs, start=1):
        _download_paper(href, count)


load_page = int(input())
# ``num`` (the total page count) is only assigned when the results span
# several pages; a result set at or below the threshold has exactly one page.
max_page = 1 if int(total_num.text) <= default_1 else num
while load_page > max_page or load_page <= 0:
    print("输入页码错误,请重新输入:")
    load_page = int(input())

for page_index in range(load_page):
    _download_current_page()
    # Move on only while more requested pages remain; after the last one the
    # "next page" control is not needed and may not exist.
    if page_index < load_page - 1:
        bro.find_element_by_xpath('//*[@id="PageNext"]').click()
        time.sleep(10)

# Close the search browser once all requested pages are processed.
bro.quit()

打包后的PyCNKi.exe程序已同步发布到“佐佑思维”公众号,公众号二维码如下:

佐佑思维

  • 0
    点赞
  • 42
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值