1、找到一个比官网更容易分析的接口,但貌似里头的信息不全,只有前76页的信息。
2、如何让scrapy按顺序爬取页面:
1)直接从网页中抓取下一页的url,不要自己定义:
# -*- coding: utf-8 -*-
import scrapy
import re
import time
from CNKISpider.items import CnkispiderItem
import urllib.request
from urllib.error import URLError
from lxml import etree
class CnSpider(scrapy.Spider):
    """Crawl CNKI search result pages in order by following the on-page
    "next page" link instead of generating URLs ourselves.

    The pager element ``//*[@id="page"]`` normally exposes the next-page
    link at ``a[11]``; on the first page (no "previous" link) the pager is
    one anchor shorter, so we fall back to ``a[10]``.
    """
    name = 'CN'
    allowed_domains = ['search.cnki.com.cn']
    start_urls = ['http://search.cnki.com.cn/Search.aspx?q=%e8%87%aa%e7%84%b6%e8%af%ad%e8%a8%80%e5%a4%84%e7%90%86&rank=citeNumber&cluster=all&val=&p=0',
                  ]

    def parse(self, response):
        """Extract the next-page href and schedule it, until the link
        points back at the current page (last page reached)."""
        try:
            # a[11] is the "next page" anchor when a "previous" link exists.
            url = response.xpath('.//*[@id="page"]/a[11]/@href').extract()[0]
        except IndexError:
            # First page: pager has no "previous" anchor, next page is a[10].
            # Catch only IndexError (empty extract) — a bare except would also
            # hide real bugs such as typos in the xpath call.
            url = response.xpath('.//*[@id="page"]/a[10]/@href').extract()[0]
        url_1 = 'http://search.cnki.com.cn/'
        task_url = url_1 + url
        print('****************爬取task_url: ', task_url ,' **************************')
        # NOTE(review): time.sleep blocks Scrapy's event loop; prefer the
        # DOWNLOAD_DELAY setting for politeness. Kept for behavior parity.
        time.sleep(1)
        if response.url != task_url:
            yield scrapy.Request(url=task_url, callback=self.parse)
        else:
            # Next-page link points at the page we are on: crawl finished.
            print('\n************************无更多任务*****************************')
2)自定义要爬取的url,解析一个丢一个。
# -*- coding: utf-8 -*-
import scrapy
import re
import time
from CNKISpider.items import CnkispiderItem
import urllib.request
from urllib.error import URLError
from lxml import etree
class CnSpider(scrapy.Spider):
    """Crawl CNKI search result pages in order from a precomputed URL list.

    ``task_urls`` is built once at class-creation time: 70 result pages,
    15 records per page (hence ``p=15*i``). Each ``parse`` call removes the
    page it just handled and schedules the next remaining one, so pages are
    fetched strictly in list order.
    """
    name = 'CN'
    allowed_domains = ['search.cnki.com.cn']
    start_urls = ['http://search.cnki.com.cn/Search.aspx?q=%e8%87%aa%e7%84%b6%e8%af%ad%e8%a8%80%e5%a4%84%e7%90%86&rank=citeNumber&cluster=all&val=&p=0',
                  ]
    # Comprehension instead of a class-body for-loop: same list, and it does
    # not leak loop variables (i, task_url) as class attributes.
    task_urls = [
        'http://search.cnki.com.cn/Search.aspx?q=%e8%87%aa%e7%84%b6%e8%af%ad%e8%a8%80%e5%a4%84%e7%90%86&rank=citeNumber&cluster=all&val=&p=' + str(15 * i)
        for i in range(0, 70)
    ]
    print('共爬取',len(task_urls),'页')

    def parse(self, response):
        """Drop the current page from the queue and request the next one.

        Bug fix: the original compared undefined names (``first_url``,
        ``urls``) which raised NameError on every call; we instead check
        membership of ``response.url`` before removing it, and only index
        ``task_urls[0]`` when the queue is non-empty.
        """
        # Remove the page we just parsed (guard: redirects may change the URL).
        if response.url in self.task_urls:
            self.task_urls.remove(response.url)
        count = len(self.task_urls)
        if count != 0:
            print('剩余', count, '个页面\n')
            print('\n\n**************task_urls[0]:', self.task_urls[0],'****************')
            yield scrapy.Request(url=self.task_urls[0], callback=self.parse)
        else:
            print('\n************************无更多任务*****************************')
以上。