# -*- coding: utf-8 -*-
import time
import urllib
import urllib2
import cookielib
from lxml import etree
import random
'''
Crawl the first results page to obtain the total number of pages,
then crawl the second page through the last page.
'''
# Download the PDF or CAJ file of every article listed on the current page.
def download_paper(treedata, opener, localdir):
'''
传入参数:
treedata:当前列表页的treedata数据
opener: referer已修改为当前页
localdir: 保存目录
'''
tr_node = treedata.xpath("//tr[@bgcolor='#f6f7fb']|//tr[@bgcolor='#ffffff']")
for item in tr_node:
paper_title = item.xpath("string(td/a[@class='fz14'])")
paper_link = item.xpath("td/a[@class='fz14']/@href")
paper_author = item.xpath("td[@class='author_flag']/a/text()")
paper_source = item.xpath("td[4]/a/text()")
paper_pub_date = item.xpath("td[5]/text()")
paper_db = item.xpath("td[6]/text()")
paper_cited = item.xpath("td[7]//a/text()")
paper_download_count = item.xpath("td[8]/span/a/text()")
print paper_title
print paper_link
# 获取paper详情页面链接,访问详情页前,要设置referer
paper_detail_url_fake = "http://kns.cnki.net" + paper_link[0]
response = opener.open(paper_detail_url_fake)
paper_detail_page_treedata = etree.HTML(response.read())
# 下载前要设置referer为详情页
opener.addheaders = [("Referer", response.url)]