# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.linkextractors import LinkExtractor
from CrawlSpiderTest.items import CrawlspidertestItem
class CsdnarticleSpider(CrawlSpider):
    """Crawl article pages of one CSDN blog and emit each page's title.

    The crawl starts from a single article URL. Links whose URL matches the
    article-detail pattern are followed (``follow=True``) and every fetched
    page is handed to :meth:`parse_item`.
    """

    name = 'csdnArticle'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/u012150179/article/details/11749017']

    # ``allow`` takes regular expressions that are searched for in each link
    # URL. A one-element tuple with a raw string makes that explicit; the
    # pattern text itself is unchanged.
    pagelink = LinkExtractor(allow=(r'/u012150179/article/details',))

    rules = (
        Rule(pagelink, callback='parse_item', follow=True),
    )

    # NOTE: do not define ``parse`` on this class. CrawlSpider uses ``parse``
    # internally to apply ``rules``; overriding it disables link following.

    def parse_item(self, response):
        """Build an item holding the article title found in *response*.

        Args:
            response: The downloaded article page.

        Yields:
            A ``CrawlspidertestItem`` whose ``title`` field is the first text
            node matched by ``.title-article``, or ``None`` when the selector
            matches nothing.
        """
        item = CrawlspidertestItem()
        # ``get()`` returns the first match or None, same as the legacy
        # ``extract_first()``.
        item['title'] = response.css('.title-article::text').get()
        yield item
# http://www.waitingfy.com/archives/3937