最近想对单位发表的期刊情况做一个文献计量学分析,数据倒也不多,但是手动保存有点麻烦。两年前用spynner.py做过抓取CNKI管理后台,spynner.py是基于qtwebkit的一个包,跟ghost.py基本差不多,这也算跟selenium有一定的渊源,spynner基本弃坑了,近几年一直没再动过。CNKI网站为了反爬,做的太复杂,平时自己访问都感觉慢,一看源代码,加载了一堆js,能不慢吗?
Python2,selenium操作如下(仅关键部分,代码不全):
# coding: utf-8
import time
import datetime
import sys
import os
import random
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
'''
manipulate
'''
# Left-click
def common_click(driver, element_id, sleep_time=3):
    """Move the pointer onto an element, left-click it, then pause.

    Args:
        driver: WebDriver instance the action chain is bound to.
        element_id: Target of the move and the click. It is handed
            directly to ``move_to_element`` and ``click``, so despite the
            name it is the element object itself rather than an id string.
        sleep_time: Seconds to sleep after the actions are performed.
    """
    # Queue both actions on one chain, then fire them together.
    chain = ActionChains(driver).move_to_element(element_id).click(element_id)
    chain.perform()
    time.sleep(sleep_time)
# Mouse hover
def common_hover(driver, element_id, sleep_time=3):
    """Move the pointer onto an element without clicking, then pause.

    Args:
        driver: WebDriver instance the action chain is bound to.
        element_id: Target of the move. It is handed directly to
            ``move_to_element``, so despite the name it is the element
            object itself rather than an id string.
        sleep_time: Seconds to sleep after the move is performed.
    """
    ActionChains(driver).move_to_element(element_id).perform()
    time.sleep(sleep_time)
def print_scr(driver, filename):
    """Save a screenshot of the driver's current page to *filename*.

    Args:
        driver: Object exposing ``get_screenshot_as_file(filename)``.
        filename: Path of the image file to write.

    Returns:
        The value returned by ``driver.get_screenshot_as_file``. Selenium
        documents this as a success flag, so callers can now detect a
        failed save instead of having the result silently dropped.
    """
    return driver.get_screenshot_as_file(filename)
# Fill in a form field
def fill_text(driver, element, content, sleep_time=0.5):
    """Clear a form field, type *content* into it, then pause.

    Args:
        driver: Unused by this function; kept so the signature lines up
            with the other helpers in this file.
        element: Field object exposing ``clear()`` and ``send_keys()``.
        content: Text to type into the field.
        sleep_time: Seconds to sleep after typing. Defaults to the 0.5 s
            that used to be hard-coded, mirroring the ``sleep_time``
            parameter of ``common_click`` / ``common_hover``.
    """
    element.clear()
    element.send_keys(content)
    time.sleep(sleep_time)
#---------------------------------
def page_config():
    """Configure a CNKI search and save the first two result pages.

    Uses the module-level ``driver`` (created in the ``__main__`` block)
    rather than taking it as a parameter. The steps are: open the search
    home page, switch the search type to "unit", submit the query, switch
    the list style, sort by time, then dump page 1 and page 2 via
    ``SaveAsLocalFile`` (defined outside the part of the file shown here).

    The fixed ``time.sleep`` pauses give the page time to react; they are
    kept exactly as in the original flow.
    """
    driver.get('http://yuanjian.cnki.com.cn/home/?type=corpus')
    # Timestamp used as the screenshot file name further down.
    addtime = time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())
    time.sleep(3)

    # Open the search-type selector.
    search_type = driver.find_element_by_class_name("item")
    common_click(driver, search_type)
    print('发现搜索类型!')

    # Choose "unit" as the search type.
    search_type_unit = driver.find_element_by_xpath('//li[@val="unit"]')
    common_click(driver, search_type_unit)
    print('设置搜索类型为单位!')

    # Type the query. A unicode literal is used because the file declares
    # ``coding: utf-8``; decoding the literal's bytes as gb18030 (as the
    # code did before) garbles the text sent to the page.
    search_input = driver.find_element_by_class_name('MulSerchKey')
    fill_text(driver, search_input, u'你的单位')
    print('填写搜索框内容!')

    # Submit the search and give the results time to load.
    search_btn = driver.find_element_by_class_name('search')
    common_click(driver, search_btn)
    print('获得搜索结果!')
    time.sleep(10)

    # Switch the result list display style.
    list_style = driver.find_element_by_class_name('zyxz')
    common_click(driver, list_style)
    print('切换显示样式!')

    # Hover the ranking control first, then click the time-order entry.
    list_order_btn = driver.find_element_by_class_name('rank')
    common_hover(driver, list_order_btn)
    list_order = driver.find_element_by_xpath('//a[@onclick="Order(2);"]')
    common_click(driver, list_order, 10)
    print('切换为按时间排序!')
    print('配置完成,开始抓取页面')
    print('-' * 60)

    # Capture page 1.
    driver.get_screenshot_as_file(addtime + '.png')
    SaveAsLocalFile('1.html', driver.page_source, write_type='a+')

    # Follow the "next page" link (same encoding fix as above) and
    # capture page 2.
    next_page = driver.find_element_by_link_text(u"下一页>")
    print(next_page.text)
    common_click(driver, next_page, 10)
    SaveAsLocalFile('2.html', driver.page_source, write_type='a+')
if __name__ == '__main__':
    # Init driver: configure PhantomJS to skip images and send a desktop
    # Firefox user agent. The shared capabilities dict is updated in place
    # because PhantomJS() is created without explicit capabilities.
    DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.loadImages'] = False
    DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0 "
    # ``driver`` stays a module-level name because page_config() reads it.
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    try:
        driver.set_script_timeout(30)
        driver.set_page_load_timeout(30)
        print('******System initialized!******')
        # Main----------------------------------------------------------
        page_config()
    finally:
        # Exit: always shut the browser down, even when setup or scraping
        # raises, so no PhantomJS process is left behind.
        driver.quit()