Selenium操作CNKI(中国知网)网站的远见搜索

最近想对单位发表的期刊情况做一个文献计量学分析,数据倒也不多,但是手动保存有点麻烦。两年前用spynner.py做过抓取CNKI管理后台,spynner.py是基于qtwebkit的一个包,跟ghost.py基本差不多,这也算跟selenium有一定的渊源,spynner基本弃坑了,近几年一直没再动过。CNKI网站为了反爬,做得太复杂,平时自己访问都感觉慢,一看源代码,加载了一堆js,能不慢吗?

Python2,selenium操作如下(仅关键部分,代码不全):

# coding: utf-8

import time
import datetime
import sys
import os
import random

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

'''
manipulate
'''
# Left-click an element.
def common_click(driver, element_id, sleep_time=3):
    """Move the pointer onto an element, left-click it, then pause.

    Args:
        driver: WebDriver instance the action chain is bound to.
        element_id: Despite the name, callers in this file pass the element
            object returned by ``driver.find_element_*``, not an id string.
        sleep_time: Seconds to sleep after the click has been performed.
    """
    # Queue the move and the click on one chain, then run both at once.
    ActionChains(driver).move_to_element(element_id).click(element_id).perform()
    time.sleep(sleep_time)
# Hover the mouse over an element.
def common_hover(driver, element_id, sleep_time=3):
    """Move the pointer onto an element without clicking, then pause.

    Args:
        driver: WebDriver instance the action chain is bound to.
        element_id: Despite the name, callers in this file pass the element
            object returned by ``driver.find_element_*``, not an id string.
        sleep_time: Seconds to sleep after the move has been performed.
    """
    ActionChains(driver).move_to_element(element_id).perform()
    time.sleep(sleep_time)

def print_scr(driver, filename):
    """Save a screenshot of the driver's current page to ``filename``.

    The value returned by ``get_screenshot_as_file`` is discarded; this
    function always returns ``None``.
    """
    target_path = filename
    driver.get_screenshot_as_file(target_path)
# Fill in a form field.
def fill_text(driver, element, content):
    """Clear an input element, type ``content`` into it, then pause briefly.

    Args:
        driver: Unused; accepted so the signature matches the other helpers.
        element: Object exposing ``clear()`` and ``send_keys()``.
        content: Text passed to ``element.send_keys``.
    """
    settle_seconds = 0.5
    element.clear()
    element.send_keys(content)
    time.sleep(settle_seconds)
#---------------------------------
def page_config():
    """Drive the CNKI Yuanjian search page and save two result pages.

    Uses the module-level ``driver``. The steps are: open the search page,
    switch the search type to "unit", submit the search term, switch the
    list display style, sort via the ``Order(2)`` link, take a screenshot,
    save the page source as ``1.html``, click the "next page" link and save
    the page source as ``2.html``.

    NOTE(review): ``SaveAsLocalFile`` is not defined in the visible part of
    this file (the post states the code is incomplete); it must be supplied
    elsewhere.
    """
    driver.get('http://yuanjian.cnki.com.cn/home/?type=corpus')
    # Timestamp taken right after the page request; used as the screenshot name.
    addtime = time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())
    time.sleep(3)

    search_type = driver.find_element_by_class_name("item")
    common_click(driver, search_type)
    print('发现搜索类型!')
    search_type_unit = driver.find_element_by_xpath('//li[@val="unit"]')
    common_click(driver, search_type_unit)
    print('设置搜索类型为单位!')
    search_input = driver.find_element_by_class_name('MulSerchKey')
    # The file declares "coding: utf-8", so decoding a byte literal as
    # gb18030 would yield mojibake; a unicode literal is correct here.
    fill_text(driver, search_input, u'你的单位')
    print('填写搜索框内容!')
    search_btn = driver.find_element_by_class_name('search')
    common_click(driver, search_btn)
    print('获得搜索结果!')
    time.sleep(10)
    list_style = driver.find_element_by_class_name('zyxz')
    common_click(driver, list_style)
    print('切换显示样式!')
    list_order_btn = driver.find_element_by_class_name('rank')
    # Hover first, then look up the Order(2) link.
    common_hover(driver, list_order_btn)
    list_order = driver.find_element_by_xpath('//a[@onclick="Order(2);"]')
    common_click(driver, list_order, 10)
    print('切换为按时间排序!')
    print('配置完成,开始抓取页面')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('-' * 60)
    driver.get_screenshot_as_file(addtime + '.png')
    SaveAsLocalFile('1.html', driver.page_source, write_type='a+')
    # Unicode literal for the same encoding reason as the search term above.
    next_page = driver.find_element_by_link_text(u'下一页>')
    print(next_page.text)
    common_click(driver, next_page, 10)
    SaveAsLocalFile('2.html', driver.page_source, write_type='a+')

if __name__ == '__main__':
    # Configure PhantomJS before the driver is created: skip image loading
    # and present a desktop Firefox user agent.
    DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.loadImages'] = False
    DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0 "

    # ``driver`` is a module-level name on purpose: page_config() reads it.
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    try:
        driver.set_script_timeout(30)
        driver.set_page_load_timeout(30)
        print('******System initialized!******')
        # Main ----------------------------------------------------------
        page_config()
    finally:
        # Exit ----------------------------------------------------------
        # Always shut the browser down, even when page_config() raises, so
        # no PhantomJS process is left running.
        driver.quit()
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值