使用 Selenium 获取知网(CNKI)论文信息

from selenium import webdriver
import time
import xlrd
import xlwt
import os
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy
from selenium.webdriver.common.by import By
import re
def get(name1,name2):
    """Run a CNKI advanced search and export the result list to an .xls file.

    Opens Chrome on the CNKI advanced-search page, submits ``name1`` as the
    query text, then walks the result pages and writes four columns per row
    (title, source, publication time, database) to a hard-coded workbook
    path. The workbook is saved after every page so partial progress
    survives a crash.

    Args:
        name1: Query expression typed into the search box
            (the caller in this file passes ``"TI='经济'"``).
        name2: Human-readable label used only in the progress message.

    Returns:
        None. Output is the saved workbook plus progress lines on stdout.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException

    page_size = 20  # rows per result page, as assumed by the row arithmetic below
    out_path = r'E:\cnki\信息.xls'
    row_xpath = '//*[@id="gridTable"]/table/tbody/tr[{}]/'
    # Cell locators relative to a row, in output-column order.
    cell_paths = ('td[2]', 'td[4]/a', 'td[5]', 'td[6]')

    driver = webdriver.Chrome(executable_path=r'D:\chromweb\chromedriver.exe')
    try:
        wait = WebDriverWait(driver, 300)
        driver.maximize_window()
        driver.get(
            'https://kns.cnki.net/kns8/AdvSearch?dbprefix=SCDB&&crossDbcodes=CJFQ%2CCDMD%2CCIPD%2CCCND%2CCISD%2CSNAD%2CBDZK%2CCJFN%2CCCJD')
        # Fourth entry of the search-type menu; presumably the tab that
        # accepts expression queries such as TI='...' -- confirm on the site.
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//ul[@class="search-classify-menu"]/li[4]'))).click()

        # Locate the query box, then replace its content with the query.
        query_box = wait.until(
            EC.presence_of_element_located((By.XPATH, '//textarea[@class="textarea-major ac_input"]'))
        )
        query_box.clear()
        query_box.send_keys(name1)
        wait.until(
            EC.element_to_be_clickable((By.XPATH, '//input[@class="btn-search"]'))
        ).click()

        # Fixed pause to let the result list render before reading the count.
        time.sleep(3)
        total = driver.find_element_by_xpath('//*[@id="countPageDiv"]/span/em').text
        print(name2 + "一共有" + total + "条数据")

        # The counter may contain separators; keep digits only.
        digits = re.sub(r"\D", "", total)
        count = int(digits) if digits else 0
        # Ceiling division: an exact multiple of page_size must not add an
        # extra, empty page, and zero results means zero pages.
        page = -(-count // page_size)
        print('一共有{}'.format(page) + '页文章')

        df = xlwt.Workbook()
        sheet1 = df.add_sheet('bookname', cell_overwrite_ok=True)
        for col, heading in enumerate([u'题名', u'来源', u'发表时间', u'数据库']):
            sheet1.write(0, col, heading)

        for p in range(page):
            # The last page may hold fewer rows; do not probe rows that
            # cannot exist.
            rows_here = min(page_size, count - p * page_size)
            for i in range(1, rows_here + 1):
                prefix = row_xpath.format(i)
                try:
                    # Read the whole row first so a failed lookup leaves no
                    # half-written row in the sheet.
                    values = [driver.find_element_by_xpath(prefix + cell).text
                              for cell in cell_paths]
                except WebDriverException:
                    print(str(p + 1) + '页未抓取到')
                    continue
                for col, value in enumerate(values):
                    sheet1.write(p * page_size + i, col, value)

            # One save per page instead of one per row.
            df.save(out_path)
            print('已抓取第' + str(p + 1) + '页')

            if p == page - 1:
                # Nothing to turn to after the final page.
                break
            try:
                next_button = driver.find_element_by_xpath('//*[@id="PageNext"]')
                driver.execute_script("arguments[0].scrollIntoView();", next_button)
                next_button.click()
            except WebDriverException:
                # The site may expose fewer pages than the counter implies;
                # stop cleanly and keep what was saved.
                print('未找到下一页,停止翻页')
                break
            time.sleep(10)
    finally:
        # Always release the browser and chromedriver process.
        driver.quit()

def isElementExist(element, driver=None):
    """Report whether an XPath lookup succeeds on the current page.

    Args:
        element: XPath expression to look up.
        driver: Browser session to query. Optional so existing one-argument
            calls keep working; when omitted, a module-level ``driver`` is
            used if one exists.

    Returns:
        True if the lookup succeeds; False if it fails for any reason or
        no browser session is available.
    """
    if driver is None:
        # The one-argument form used to read a global ``driver``. The
        # visible code only creates the session inside ``get``, so that
        # lookup raised NameError and the function always answered False.
        driver = globals().get('driver')
    if driver is None:
        return False
    try:
        driver.find_element_by_xpath(element)
    except Exception:
        # Best-effort probe: any lookup failure counts as "not present".
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return False
    return True
if __name__ == '__main__':
    # Script entry point: search by title for the keyword and export the hits.
    name2 = '经济'
    # Query expression that restricts the match to the title field.
    name1 = "TI='{}'".format(name2)
    get(name1, name2)

这个脚本与之前的论文下载脚本类似,但目的是获取论文信息(题名、来源、发表时间、数据库),而不是下载论文。

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值