# Generated with: scrapy genspider -t crawl zwr zwdu.com
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class ZwrSpider(CrawlSpider):
    """Crawl a book on zwdu.com and emit one title/content item per page.

    The crawl starts from the single URL in ``start_urls``. Every page
    reached through either rule is handed to :meth:`parse_item`, and links
    on those pages are followed again (``follow=True``).
    """

    name = "zwr"
    allowed_domains = ["zwdu.com"]
    start_urls = ["https://www.zwdu.com/book/10304/"]

    rules = (
        # Links found inside <dd> elements.
        # NOTE(review): presumably the chapter list on the book index page
        # -- confirm against the site markup.
        Rule(
            LinkExtractor(restrict_xpaths="//dd/a"),
            callback="parse_item",
            follow=True,
        ),
        # Third <a> inside <div class="bottem1">.
        # NOTE(review): presumably the "next chapter" link -- confirm
        # against the site markup.
        Rule(
            LinkExtractor(restrict_xpaths="//div[@class='bottem1']/a[3]"),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a dict with the page heading and its body text.

        Args:
            response: The Scrapy response for a page matched by ``rules``.

        Yields:
            dict: ``{'title': ..., 'content': ...}`` where ``title`` is the
            text of the first ``<h1>`` (``None`` when there is none) and
            ``content`` is the concatenation of the text nodes directly
            under ``<div id="content">``.
        """
        heading = response.xpath("//h1/text()").extract_first()
        text_nodes = response.xpath("//div[@id='content']/text()").extract()

        # Each space becomes a newline followed by a space.
        # NOTE(review): the replaced character is a plain ASCII space as
        # written; the intent may have been a different whitespace
        # character (e.g. a non-breaking or full-width space) -- confirm.
        body = "".join(text_nodes).replace(" ", "\n ")

        item = {"title": heading, "content": body}
        yield item