items.py
from scrapy.item import Item, Field
class JianshuItem(Item):
    """Record describing a single entry scraped from a Jianshu note list.

    Every field is a plain ``Field()`` with no metadata, so values are
    stored exactly as the spider assigns them.
    """

    # Who published the entry and when.
    user = Field()
    time = Field()

    # Headline of the entry.
    title = Field()

    # Engagement values; presumably counters kept as text -- confirm
    # against the spider that populates them.
    view = Field()
    comment = Field()
    like = Field()
    gain = Field()
jianshuspider.py
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from jianshu.items import JianshuItem
class jianshu(CrawlSpider):
    """Spider that scrapes entry metadata from a Jianshu collection page.

    Crawling starts at page 1 of the collection (ordered by ``added_at``)
    and every parsed page re-schedules the follow-up page(s).

    NOTE(review): Scrapy's documentation advises against overriding
    ``parse`` on a ``CrawlSpider`` that uses ``rules``. No rules are
    defined here, so the override behaves like a plain spider callback;
    consider deriving from ``scrapy.Spider`` instead.
    """

    name = 'jianshu'
    start_urls = ['https://www.jianshu.com/c/bd08b5306eb6?order_by=added_at&page=1']

    def parse(self, response):
        """Yield one ``JianshuItem`` per note-list entry, then follow-up requests.

        Args:
            response: The downloaded collection page.

        Yields:
            A ``JianshuItem`` for every ``<li>`` under ``ul.note-list``,
            followed by a ``Request`` for each additional page URL.

        Raises:
            IndexError: If an entry lacks one of the mandatory nodes
                (user, time, title, view, comment, like).
        """
        selector = Selector(response)
        infos = selector.xpath('//ul[@class="note-list"]/li')
        for info in infos:
            # Build a fresh item for every entry. Re-using one instance
            # would make all yielded items alias the same mutable object,
            # so later entries could overwrite earlier ones downstream.
            item = JianshuItem()

            item['user'] = info.xpath('div/div[1]/div/a/text()').extract()[0]
            item['time'] = info.xpath('div/div[1]/div/span/@data-shared-at').extract()[0]
            item['title'] = info.xpath('div/a/text()').extract()[0]
            # For view/comment the second text node is used; the first is
            # presumably whitespace before an icon element -- TODO confirm.
            item['view'] = info.xpath('div/div[2]/a[1]/text()').extract()[1].strip()
            item['comment'] = info.xpath('div/div[2]/a[2]/text()').extract()[1].strip()
            item['like'] = info.xpath('div/div[2]/span[1]/text()').extract()[0].strip()

            # The second span is optional; fall back to '0' when absent.
            gain = info.xpath('div/div[2]/span[2]/text()').extract()
            item['gain'] = gain[0].strip() if gain else '0'

            yield item

        # range(2, 3) covers page 2 only. Every parsed page schedules the
        # same URL again; NOTE(review): this relies on Scrapy's default
        # duplicate filtering to avoid re-fetching.
        urls = ['https://www.jianshu.com/c/bd08b5306eb6?order_by=added_at&page={}'.format(i) for i in range(2, 3)]
        for url in urls:
            yield Request(url, callback=self.parse)
settings.py
# Browser-like User-Agent string for the crawler's requests.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/55.0.2883.87 Safari/537.36'
)

# Delay, in seconds, applied between consecutive downloads.
DOWNLOAD_DELAY = 3

# Enabled item pipelines, keyed by import path; the integer sets run order.
ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuPipeline': 300,
}
pipelines.py
import pymongo
class JianshuPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Documents go to the ``jianshu`` collection of the ``test`` database
    on the MongoDB server at ``localhost:27017``.
    """

    def __init__(self):
        """Open the MongoDB connection and select the target collection."""
        # Keep the client on the instance so close_spider can release it.
        self.client = pymongo.MongoClient('localhost', 27017)
        self.post = self.client['test']['jianshu']

    def close_spider(self, spider):
        """Close the MongoDB connection once the spider has finished.

        Args:
            spider: The spider being closed (unused).
        """
        self.client.close()

    def process_item(self, item, spider):
        """Insert ``item`` into the collection as a plain dict.

        Args:
            item: The scraped item; converted with ``dict(item)``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item``, so later pipelines can process it.
        """
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document insert.
        self.post.insert_one(dict(item))
        return item
main.py
from scrapy import cmdline
# Start the 'jianshu' spider in-process, as `scrapy crawl jianshu` would
# from a shell.
cmdline.execute(['scrapy', 'crawl', 'jianshu'])