使用框架技术就是方便,在一步步学习各种库的使用基础上,最终还是为了更好地理解和使用现成的框架。这里爬取的是简书网的热门专题信息,包括专题的名字、简介、关注人数、收录文章数,最后通过Feed exports功能把爬取到的信息存储到CSV文件中。
zhuantispider.py
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request
from zhuanti.items import ZhuantiItem
class zhuanti(CrawlSpider):
    """Spider for Jianshu's "hot collections" listing.

    Crawls pages 1-36 of the hot-collections ranking and yields one
    ZhuantiItem per collection: title, description, article count and
    follower count.
    """
    name = 'zhuanti'
    start_urls = ['https://www.jianshu.com/recommendations/collections?page=1&order_by=hot']

    def parse(self, response):
        # response.xpath is the idiomatic shortcut; wrapping the response
        # in Selector(response) is redundant in modern Scrapy.
        infos = response.xpath('//div[@class="col-xs-8"]')
        for info in infos:
            try:
                # BUG FIX: create a fresh item per collection. The original
                # built one ZhuantiItem outside the loop, so every yielded
                # item aliased the same object and each iteration overwrote
                # the previous collection's data.
                item = ZhuantiItem()
                item['name'] = info.xpath('div/a/h4/text()').extract()[0]
                item['content'] = info.xpath('div/a/p/text()').extract()[0]
                item['article'] = info.xpath('div/div/a/text()').extract()[0]
                item['fans'] = info.xpath('div/div/text()').extract()[0]
                yield item
            except IndexError:
                # A card missing one of the expected nodes is skipped.
                print("Crawl error")
        # Queue the remaining listing pages (2-36). Scrapy's built-in
        # duplicate filter prevents the same page being fetched twice even
        # though every parse() call re-emits this list.
        urls = ['https://www.jianshu.com/recommendations/collections?page={}&order_by=hot'.format(str(i)) for i in range(2, 37)]
        for url in urls:
            yield Request(url, callback=self.parse)
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.item import Item,Field
class ZhuantiItem(scrapy.Item):
    """Container for one Jianshu collection scraped by the zhuanti spider."""
    name = scrapy.Field()     # collection title
    content = scrapy.Field()  # collection description/blurb
    article = scrapy.Field()  # number of articles collected
    fans = scrapy.Field()     # number of followers
    # Removed: redundant trailing `pass` and leftover template comments.
settings.py
# Present a desktop Chrome user agent instead of Scrapy's default.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
# Wait 0.5 s between requests to throttle load on the target site.
DOWNLOAD_DELAY=0.5
# Feed export: write all scraped items to a CSV file on local disk.
# NOTE(review): file URIs are normally written 'file:///F:/...'; this
# 'file:F:/...' form may not resolve as intended — confirm the CSV is
# actually created at this path.
FEED_URI = 'file:F:/python_idea_workspace/Crawl_study/zhuanti/zhuanti.csv'
FEED_FORMAT ='csv'