这是item.py
import scrapy


class JianshuItem(scrapy.Item):
    """Container for one scraped Jianshu article entry.

    Fields are populated by the spider's ``parse`` method and passed
    through the item pipeline.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()     # article title text
    name = scrapy.Field()      # author nickname
    time = scrapy.Field()      # publish timestamp (from data-shared-at)
    abstract = scrapy.Field()  # article summary paragraph
这是主文件
import scrapy
import sys  # NOTE(review): unused in this file; safe to remove

from ..items import JianshuItem


class JianshushuSpider(scrapy.Spider):
    """Spider that scrapes article listings from the Jianshu front page."""

    name = 'jianshushu'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    def parse(self, response):
        """Yield one JianshuItem per article ``<li>`` that has an image.

        :param response: the downloaded front-page response
        :returns: generator of populated JianshuItem objects
        """
        entries = response.xpath('//ul[@class="note-list"]/li[@class="have-img"]')
        for entry in entries:
            # Create a fresh item per entry: the original reused a single
            # instance created outside the loop, so each iteration clobbered
            # the data of the item already handed to the pipeline.
            item = JianshuItem()
            # extract_first() returns None for a missing node instead of
            # raising IndexError as the original .extract()[0] did.
            item['title'] = entry.xpath('.//a[@class="title"]/text()').extract_first()
            item['name'] = entry.xpath('.//a[@class="nickname"]/text()').extract_first()
            item['abstract'] = entry.xpath('.//p[@class="abstract"]/text()').extract_first()
            item['time'] = entry.xpath('.//span[@class="time"]/@data-shared-at').extract_first()
            yield item
记得在 settings.py 中加入以下两项配置:
# Default headers sent with every request, so the site sees a
# browser-like client instead of the default Scrapy user agent.
DEFAULT_REQUEST_HEADERS = {
    'accept-language': 'zh-CN,zh;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3)',
}

# Keep non-ASCII (Chinese) text readable in exported JSON feeds
# instead of \uXXXX escapes.
FEED_EXPORT_ENCODING = 'utf-8'
最后
-o 指定输出文件(不是目录),-t 指定导出格式(较新版本的 Scrapy 可由文件扩展名自动推断,可省略)。执行完后打开 test.json,如图:
scrapy crawl jianshushu -o test.json -t json