import os
import re
import time

import xlwt
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# import numpy
# import pymysql
class cnki:
    """Scrape CNKI advanced-search results and download each article's PDF.

    Drives a local Chrome instance through the CNKI advanced-search UI,
    pages through the result list (20 rows per page) and clicks the
    PDF-download button on every article's detail page.
    """

    def __init__(self):
        # Initialize the browser session.
        # NOTE(review): hard-coded Selenium-3-style driver path; raw string
        # avoids the invalid '\c' escape the original relied on by accident.
        self.driver = webdriver.Chrome(executable_path=r'D:\chromweb\chromedriver.exe')
        # Generous explicit-wait budget (300 s) shared by all lookups below.
        self.wait = WebDriverWait(self.driver, 300)
        self.driver.maximize_window()

    def get_info(self, name1, name2):
        """Run the search expression ``name1`` and download every result PDF.

        name1: CNKI advanced-search expression, e.g. "TI='...'".
        name2: human-readable label used only in progress messages.

        Any unexpected failure is caught and printed rather than raised,
        so a crash mid-scrape does not lose the session.
        """
        try:
            self.driver.get('https://kns.cnki.net/kns8/AdvSearch?dbprefix=SCDB&&crossDbcodes=CJFQ%2CCDMD%2CCIPD%2CCCND%2CCISD%2CSNAD%2CBDZK%2CCJFN%2CCCJD')
            # Explicitly wait, then open the advanced-search tab
            # (4th item of the search-classify menu).
            self.wait.until(EC.element_to_be_clickable(
                (By.XPATH, '//ul[@class="search-classify-menu"]/li[4]'))).click()
            # Locate the query textarea. Renamed from ``input`` which
            # shadowed the builtin.
            search_box = self.wait.until(
                EC.presence_of_element_located((By.XPATH, '//textarea[@class="textarea-major ac_input"]'))
            )
            # Clear any previous text, then type the search expression.
            search_box.clear()
            search_box.send_keys(name1)
            # Fire the search.
            self.wait.until(
                EC.element_to_be_clickable((By.XPATH, '//input[@class="btn-search"]'))
            ).click()
            time.sleep(3)
            total = self.driver.find_element(By.XPATH, '//*[@id="countPageDiv"]/span/em').text
            print(name2 + "一共有" + total + "条数据")
            # Strip thousands separators etc. — keep digits only.
            total = re.sub(r"\D", "", total)
            # Ceiling division: 20 results per page. The previous
            # ``total // 20 + 1`` over-counted by one page whenever the
            # total was an exact multiple of 20.
            page = (int(total) + 19) // 20
            print('一共有{}'.format(page) + '页文章')
            a = 1
            for p in range(page):
                for i in range(1, 21):
                    # Check the row exists BEFORE fetching the link:
                    # the original fetched first, so a short last page
                    # raised NoSuchElementException and aborted the
                    # whole scrape instead of breaking cleanly.
                    flag1 = self.isElementExist('//*[@id="gridTable"]/table/tbody/tr[%d]/td[2]' % i)
                    if flag1:
                        link = self.driver.find_element(By.XPATH, '//*[@id="gridTable"]/table/tbody/tr[%d]/td[2]/a' % i)
                        print(link)
                        # Scroll the row to the top of the viewport.
                        self.driver.execute_script("arguments[0].scrollIntoView();", link)
                        time.sleep(3)
                        # Open the article via a real mouse click.
                        actions = ActionChains(self.driver)
                        actions.move_to_element(link)
                        actions.click(link)
                        actions.perform()
                        time.sleep(10)
                        # The article opens in a new tab — switch to it.
                        windows = self.driver.window_handles
                        self.driver.switch_to.window(windows[-1])
                        time.sleep(3)
                        # If a PDF-download button is present, click it.
                        try:
                            flag2 = self.isElementExist('//*[@id="pdfDown"]')
                            if flag2:
                                pdf = self.driver.find_element(By.XPATH, '//*[@id="pdfDown"]')
                                self.driver.execute_script("arguments[0].scrollIntoView();", pdf)
                                time.sleep(3)
                                self.wait.until(EC.presence_of_element_located(
                                    (By.XPATH, '//*[@id="pdfDown"]'))).click()
                            else:
                                print('错误')
                        except Exception as exc:
                            # Best-effort: log and keep scraping.
                            print(exc)
                        time.sleep(10)
                        # Close the article tab and return to the results list.
                        self.driver.close()
                        time.sleep(5)
                        self.driver.switch_to.window(windows[0])
                        print("-----正在爬取--" + name2 + '--药品的第' + str(int(p) + 1) + '页' + str(a) + "条数据------")
                        a = a + 1
                    else:
                        # Fewer than 20 rows on this (last) page — stop early.
                        print('cw')
                        break
                # Advance to the next result page, if any.
                flag3 = self.isElementExist('//*[@id="PageNext"]')
                if flag3:
                    time.sleep(10)
                    next_page = self.driver.find_element(By.XPATH, '//*[@id="PageNext"]')
                    self.driver.execute_script("arguments[0].scrollIntoView();", next_page)
                    self.wait.until(EC.element_to_be_clickable(
                        (By.XPATH, '//*[@id="PageNext"]'))).click()
                    time.sleep(15)
                else:
                    break
        except Exception as exc:
            print(exc)

    def isElementExist(self, element):
        """Return True if an element matching the XPath ``element`` exists.

        Narrowed from a bare ``except:`` — only a missing element means
        False; unrelated WebDriver failures now propagate to the caller.
        """
        try:
            self.driver.find_element(By.XPATH, element)
            return True
        except NoSuchElementException:
            return False
if __name__ == '__main__':
    # Advanced-search expression and the label used in progress output.
    query_expression = "TI='经济与环境保护'"
    label = '经济'
    # Launch the browser and start scraping.
    spider = cnki()
    spider.get_info(query_expression, label)
# 注释在里面都有写，大家可以自行观看，有需要交流的地方可以评论！