# 1、导入库 (import libraries)
from selenium import webdriver
from selenium. webdriver import ChromeOptions
from selenium. webdriver. chrome. options import Options
import openpyxl
import re
import time
from selenium. webdriver. common. by import By
from selenium. webdriver. support. ui import WebDriverWait
from selenium. webdriver. support. select import Select
import urllib. error
# 2、打开网站并设置网页初始选项 (open the site and configure the initial browser options)
def wu_visual():
    """Build Chrome options for running without a visible browser window.

    Returns:
        An ``Options`` instance carrying the ``--headless`` and
        ``--disable-gpu`` command-line arguments.
    """
    headless_opts = Options()
    for flag in ('--headless', '--disable-gpu'):
        headless_opts.add_argument(flag)
    return headless_opts
def fan_jiance():
    """Build Chrome options that exclude the ``enable-automation`` switch.

    Returns:
        A ``ChromeOptions`` instance whose ``excludeSwitches`` experimental
        option is set to ``['enable-automation']``.
    """
    excluded_switches = ['enable-automation']
    stealth_opts = ChromeOptions()
    stealth_opts.add_experimental_option('excludeSwitches', excluded_switches)
    return stealth_opts
def url_error_test(url, bro):
    """Load *url* through *bro* and report urllib-level failures.

    Args:
        url: Address to open.
        bro: Object exposing ``get(url)`` (a Selenium WebDriver in this file).

    Returns:
        ``None`` when the page loads. Otherwise the ``reason`` of the
        ``HTTPError`` or ``URLError`` that was raised, so a caller can tell
        every failure apart from success.
    """
    # NOTE(review): Selenium's ``get`` may surface its own exception types
    # rather than urllib's -- confirm these handlers can actually fire.
    try:
        bro.get(url)
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be matched first.
        print(e.code)
        print(e.reason)
        return e.reason
    except urllib.error.URLError as e:
        print(e.reason)
        return e.reason
    print("OK")
    return None
# Start the main browser and open the CNKI search site.
chrome_options = wu_visual()
option = fan_jiance()
# Selenium 3 treats ``chrome_options`` as a deprecated alias that replaces
# ``options`` when both are passed, which silently dropped the
# ``excludeSwitches`` setting.  Fold everything into one options object
# instead; both names stay defined for the code further down.
for _opt_name, _opt_value in option.experimental_options.items():
    chrome_options.add_experimental_option(_opt_name, _opt_value)
chrome_path = r'./chromedriver.exe'
bro = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
bro.maximize_window()
url = r'http://kns.cnki.net'
bro.get(url)
# 3、关键词搜索 (keyword search)
# Type the user's query into the search box and submit it.
input_title = bro.find_element(By.ID, "txt_SearchText")
input_title.click()
time.sleep(2)
key_value = input("请输入你要下载的论文标题:")
input_title.send_keys(key_value)
div_search = bro.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[1]/input[2]')
div_search.click()
time.sleep(1)

# Threshold below which the results are treated as a single page
# (presumably the site's page size -- TODO confirm).
default_1 = 20
bro.find_element(By.XPATH, "/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/span").click()
time.sleep(10)
total_num = bro.find_element(By.XPATH, "/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/em")
if int(total_num.text) <= default_1:
    print("一共搜索到" + total_num.text + "条结果")
    print("共一页")
    # The page-number validation further down reads ``num`` unconditionally,
    # so it must exist even when there is only one page of results.
    num = 1
else:
    print("一共搜索到" + total_num.text + "条结果")
    total_page = bro.find_element(By.XPATH, '//*[@id="gridTable"]/div[2]/span[1]')
    print(total_page.text)
    # Strip the first and last character of the label to keep the number.
    num = int(total_page.text[1:-1])
# 4、选择下载格式及批量下载到几页 (choose the download format and how many pages to download)
# Ask which file format to download.  Only 1 (PDF) and 2 (CAJ) are acted on
# by the download step, so keep asking until one of them is entered instead
# of crashing on non-numeric input or silently downloading nothing.
print("1:PDF格式\n2:CAJ格式\n请输入下载文件的格式对应数字:")
while True:
    try:
        load_num = int(input("请输入1 or 2:"))
    except ValueError:
        continue
    if load_num in (1, 2):
        break
print("请输入您要下载到第几页码:")
# 5、开始批量下载 (start the batch download)
# Locators used on the result list and on each paper's detail page.
_TITLE_XPATH = "/html/body/div[2]/div[1]/div[3]/div/div[1]/div[3]/div[1]/h1"
_ERROR_XPATH = '/html/body/div[2]/div'
_RESULT_LINKS_XPATH = '//*[@id="gridTable"]/table/tbody/tr/td[2]/a'
# Format choice -> id/name of the download anchor on the detail page.
_DOWNLOAD_BUTTONS = {1: 'pdfDown', 2: 'cajDown'}


def _read_page_number():
    """Read an integer from stdin; junk input yields 0, which is never a valid page."""
    try:
        return int(input())
    except ValueError:
        return 0


def _download_paper(link, count):
    """Open *link* in its own browser, click the chosen download button and report."""
    paper_bro = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
    try:
        paper_bro.get(link)
        paper_bro.maximize_window()
        time.sleep(10)
        if paper_bro.find_element(By.XPATH, _ERROR_XPATH).text == "URL参数错误":
            # The title heading lives under this very element, so when its
            # whole text is the error message there is no title to read;
            # identify the paper by its link instead.
            print("编号为" + str(count) + "的论文:" + link + "————论文下载失败")
            return
        button = _DOWNLOAD_BUTTONS.get(load_num)
        if button is None:
            return
        # The original located the button by id in one branch and by name in
        # the other; this selector accepts either.
        selector = '#{0}, [name="{0}"]'.format(button)
        paper_bro.find_element(By.CSS_SELECTOR, selector).click()
        time.sleep(10)
        title = paper_bro.find_element(By.XPATH, _TITLE_XPATH).text
        print("编号为" + str(count) + "的论文:" + title + "————下载成功")
    finally:
        # Always release the browser, including when a lookup raises.
        paper_bro.quit()


def _download_current_page():
    """Download every paper listed on the result page currently shown in ``bro``."""
    anchors = bro.find_elements(By.XPATH, _RESULT_LINKS_XPATH)
    hrefs = [anchor.get_attribute("href") for anchor in anchors]
    # Numbering restarts at 1 on every result page.
    for count, href in enumerate(hrefs, start=1):
        if not href:
            continue
        # Drops the first 20 characters of the href and keeps the rest as the
        # query string (presumably a fixed URL prefix -- TODO confirm).
        _download_paper(url + r'/kcms/detail/detail.aspx?' + href[20:], count)


# ``num`` is only assigned when the search produced more than one page, so
# it is read only in that case.
max_page = 1 if int(total_num.text) <= default_1 else num
load_page = _read_page_number()
while load_page > max_page or load_page <= 0:
    print("输入页码错误,请重新输入:")
    load_page = _read_page_number()

# Selenium 3 lets the deprecated ``chrome_options`` keyword replace
# ``options`` when both are passed, so carry the experimental options over
# and hand the driver one merged object.  Repeating the merge is harmless.
for _opt_name, _opt_value in option.experimental_options.items():
    chrome_options.add_experimental_option(_opt_name, _opt_value)

for page_index in range(load_page):
    _download_current_page()
    # Only advance while another requested page remains.
    if page_index < load_page - 1:
        bro.find_element(By.XPATH, '//*[@id="PageNext"]').click()
        time.sleep(10)
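# 打包后的PyCNKi.exe程序同步佐佑思维公众号二维码如下: (the packaged PyCNKi.exe program is also published on the 佐佑思维 WeChat official account; its QR code follows)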