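# Simulated crawl of CNKI (China National Knowledge Infrastructure).
# Uses selenium to drive a simulated login/search session, then scrapes the data.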
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from urllib.parse import urljoin
import time
import random
import json
import warnings
warnings.filterwarnings("ignore")
class zhiwang:
    """Selenium-driven crawler for the CNKI journal search page.

    Creating an instance launches a Chrome session with image loading
    disabled. ``login`` then opens the search page, submits a keyword
    query and adjusts the result list inside the result iframe.
    """

    def __init__(self):
        """Start a Chrome session and record the search page URL."""
        options = webdriver.ChromeOptions()
        # Content-setting value 2 blocks images, so pages load faster.
        options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2}
        )
        self.browser = webdriver.Chrome(options=options)
        self.url = 'https://kns.cnki.net/kns/brief/result.aspx?dbprefix=CJFQ'
        # Not read anywhere in this class; presumably a page/record counter
        # for later scraping steps -- TODO confirm.
        self.count = 1

    def login(self, keyword='格斯尔', timeout=1000):
        """Open the search page, run a keyword search and set up the results.

        Args:
            keyword: Text typed into the search box. Defaults to the
                keyword the script was originally hard-coded with.
            timeout: Maximum number of seconds each explicit wait may
                block before giving up.

        Raises:
            selenium.common.exceptions.TimeoutException: If the search box,
                the result iframe or the language switch does not show up
                within ``timeout`` seconds.
            selenium.common.exceptions.NoSuchElementException: If one of
                the buttons/links clicked below is missing from the page.
        """
        wait = WebDriverWait(self.browser, timeout)

        self.browser.get(self.url)

        # The wait returns the located element, so no second lookup (or
        # emptiness check) is needed once it succeeds.
        search_box = wait.until(
            EC.presence_of_element_located((By.ID, 'txt_1_value1'))
        )
        search_box.send_keys(keyword)
        self.browser.find_element(By.ID, 'btnSearch').click()

        # The result list is rendered inside an iframe; wait until it is
        # available instead of switching immediately after the click.
        wait.until(EC.frame_to_be_available_and_switch_to_it('iframeResult'))
        wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'Ch-En'))
        )

        # Restrict the result list to Chinese-language literature.
        self.browser.find_element(By.LINK_TEXT, "中文文献").click()
        # Third link of the "display number" control; presumably the
        # largest results-per-page option -- TODO confirm against the page.
        self.browser.find_element(
            By.XPATH, '//*[@id="id_grid_display_num"]/a[3]'
        ).click()

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.browser.quit()
# Script entry: build the crawler (its constructor launches Chrome) and run
# the search flow once.
# NOTE(review): this executes at module level, so it also runs on import.
zhiwang().login()