# Spider main body (爬虫主体)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from js_spi.items import ArticleItem
class JsSpider(CrawlSpider):
    """Crawl jianshu.com and extract article pages into ``ArticleItem``s.

    Follows every link matching an article URL (``/p/<12 hex-ish chars>``)
    starting from the site root, and parses each article page with
    :meth:`parse_page`.
    """

    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    # Article URLs look like /p/0123456789ab; follow=True keeps crawling
    # links found on article pages as well.
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
             callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        """Extract one article from *response* and yield an ``ArticleItem``.

        NOTE(review): the XPath class names (``ouvJEz``, ``FxYr8x``, ...)
        look auto-generated by the site's build and may change without
        notice — verify against the live page if extraction returns None.
        """
        title = response.xpath('//section[@class="ouvJEz"]/h1/text()').get()
        author = response.xpath('//span[@class="FxYr8x"]/a/text()').get()
        edit_time = response.xpath('//div[@class="s-dsoj"]//time/text()').get()
        # getall() keeps the full HTML of the article body (list of strings).
        content = response.xpath('//article[@class="_2rhmJa"]').getall()
        fav_count = response.xpath('//span[@class="_1LOh_5"]/text()').get()
        # Join all tag/category labels into a single pipe-separated string.
        text_type = '|'.join(
            response.xpath('//a[@class="_3s5t0Q _1OhGeD"]/span/text()').getall())

        item = ArticleItem(title=title,
                           author=author,
                           edit_time=edit_time,
                           content=content,
                           fav_count=fav_count,
                           text_type=text_type)
        # BUG FIX: the original built the item but never yielded it, so the
        # item pipeline received nothing. Scrapy callbacks must yield items.
        yield item