douban.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from crawl_douban.items import CrawlDoubanItem
class DoubanSpider(CrawlSpider):
    """Crawl Douban movie subject pages and extract basic movie metadata.

    Starts from one subject page and follows links matching
    /subject/<id>/?..., handing each page to ``parse_item``.
    """
    name = 'douban'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/subject/1292052/']
    rules = (
        Rule(LinkExtractor(allow=r'\/subject\/\d+\/\?'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield one CrawlDoubanItem per page.

        Scrapy consumes the yielded items directly; the original code also
        accumulated them in an unused ``movie_list`` and returned it, which
        is dead code in a generator callback and has been removed.
        """
        for content in response.xpath("//div[@id='content']"):
            item = CrawlDoubanItem()
            # extract_first() yields None for a missing node instead of the
            # IndexError that .extract()[0] raised on malformed pages.
            item['Director'] = content.xpath(
                ".//span/a[contains(@rel,'v:directedBy')]/text()").extract_first()
            # Relative .// paths keep the query scoped to this content div
            # (the originals used absolute // for MovieName and Grade).
            item['MovieName'] = content.xpath(
                ".//span[@property='v:itemreviewed']/text()").extract_first()
            # Genre is multi-valued; keep the full list.
            item['Type'] = content.xpath(
                ".//span[@property='v:genre']/text()").extract()
            year = content.xpath(".//span[@class='year']/text()").extract_first()
            # The year is rendered as "(1994)"; strip the parentheses.
            item['Year'] = year[1:-1] if year else None
            item['Grade'] = content.xpath(
                ".//strong[@class='ll rating_num']/text()").extract_first()
            yield item
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class CrawlDoubanPipeline(object):
    """Append every scraped item to movie.json as one JSON object per line."""

    def __init__(self):
        # Explicit utf-8 so the output does not depend on the platform's
        # default encoding.
        self.output = open('movie.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* to one JSON line and pass it on unchanged."""
        # ensure_ascii=False keeps Chinese titles human-readable instead of
        # escaping them to \uXXXX sequences.
        jsontext = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.output.write(jsontext)
        return item

    def close_spider(self, spider):
        # Scrapy calls this once when the spider finishes; release the file.
        self.output.close()
导出成 CSV 格式：一共爬取了 18000+ 条电影数据。