Scrapy实战简书网专题收录的文章信息

items.py

from scrapy.item import Item, Field


class JianshuItem(Item):
    """Container for one Jianshu article scraped from the collection page."""
    user = Field()     # author nickname
    time = Field()     # publish timestamp (from @data-shared-at)
    title = Field()    # article title
    view = Field()     # view count text
    comment = Field()  # comment count text
    like = Field()     # like count text
    gain = Field()     # reward amount ('0' when the reward span is absent)

 jianshuspider.py

from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from jianshu.items import JianshuItem

class jianshu(CrawlSpider):
    """Spider for articles in a Jianshu collection, ordered by added_at."""
    name = 'jianshu'
    start_urls = ['https://www.jianshu.com/c/bd08b5306eb6?order_by=added_at&page=1']

    def parse(self, response):
        """Yield one JianshuItem per article <li>, then queue further pages.

        Fixes vs. the original:
        - ``item = ...`` and ``selector = ...`` were fused on one line
          (SyntaxError).
        - a fresh item is created per article instead of mutating one
          shared instance for every yield.
        - pagination Requests are yielded once per page, not once per
          article (they were nested inside the article loop).
        """
        selector = Selector(response)
        infos = selector.xpath('//ul[@class="note-list"]/li')
        for info in infos:
            item = JianshuItem()
            item['user'] = info.xpath('div/div[1]/div/a/text()').extract()[0]
            item['time'] = info.xpath('div/div[1]/div/span/@data-shared-at').extract()[0]
            item['title'] = info.xpath('div/a/text()').extract()[0]
            item['view'] = info.xpath('div/div[2]/a[1]/text()').extract()[1].strip()
            item['comment'] = info.xpath('div/div[2]/a[2]/text()').extract()[1].strip()
            item['like'] = info.xpath('div/div[2]/span[1]/text()').extract()[0].strip()
            # The reward span only exists on articles that received rewards.
            gain = info.xpath('div/div[2]/span[2]/text()').extract()
            item['gain'] = gain[0].strip() if gain else '0'
            yield item

        # Follow-up listing pages (range(2, 3) == page 2 only; widen the
        # range to crawl more). Scrapy's dupefilter would drop repeats, but
        # yielding outside the article loop avoids generating them at all.
        urls = ['https://www.jianshu.com/c/bd08b5306eb6?order_by=added_at&page={}'.format(i)
                for i in range(2, 3)]
        for url in urls:
            yield Request(url, callback=self.parse)


settings.py

# Spoof a desktop Chrome user agent so Jianshu serves the normal HTML page.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
# Wait 3 seconds between requests to be polite and avoid rate limiting.
DOWNLOAD_DELAY = 3
# Route every scraped item through the MongoDB pipeline (priority 300).
ITEM_PIPELINES = { 'jianshu.pipelines.JianshuPipeline': 300,}

 pipelines.py

import pymongo

class JianshuPipeline(object):
    """Persist scraped items into the MongoDB collection ``test.jianshu``."""

    def __init__(self):
        # NOTE(review): connection parameters are hard-coded; consider
        # reading them from Scrapy settings via from_crawler() instead.
        self.client = pymongo.MongoClient('localhost', 27017)
        db = self.client['test']
        self.post = db['jianshu']

    def process_item(self, item, spider):
        """Insert the item as a plain dict, then pass it on unchanged."""
        # insert_one() replaces Collection.insert(), which was deprecated
        # in PyMongo 3 and removed entirely in PyMongo 4.
        self.post.insert_one(dict(item))
        return item

main.py

from scrapy import cmdline

# Launch the 'jianshu' spider exactly as `scrapy crawl jianshu` would.
cmdline.execute(['scrapy', 'crawl', 'jianshu'])

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值