# db250.py
import scrapy
from ..items import Db2Item
class Db252Spider(scrapy.Spider):
    """Crawl the first two pages of Douban's Top 250 movie chart.

    For every movie on a list page, follows the detail link and yields a
    Db2Item carrying the movie's name, rating and description.
    """

    name = "db252"
    allowed_domains = ["movie.douban.com"]
    # URL template; start_requests substitutes the 0-based offset
    # (Douban pages items 25 per page via the `start` query parameter).
    url = 'https://movie.douban.com/top250?start={}&filter='

    def start_requests(self):
        """Yield requests for the first two list pages (offsets 0 and 25)."""
        for page in range(0, 2):
            yield scrapy.Request(url=self.url.format(page * 25),
                                 callback=self.parse)

    def parse(self, response):
        """Parse one list page: pull name/rating, then follow each detail link."""
        for node in response.xpath('//div[@class="info"]'):
            name = node.xpath('./div/a/span/text()').get()
            rate = node.xpath('./div/div/span[@class="rating_num"]/text()').get()
            sec_url = node.xpath('./div[1]/a/@href').get()
            # Guard: scrapy.Request raises if url is None, which would abort
            # the whole page when a single entry's link fails to extract.
            if sec_url:
                yield scrapy.Request(url=sec_url, callback=self.node2,
                                     meta={'name': name, 'rate': rate})

    def node2(self, response):
        """Parse a movie detail page and emit the populated item.

        name/rate arrive via request meta from parse(); the description is
        assembled from the synopsis text nodes (plain and nested spans).
        """
        description = response.xpath(
            '//*[@id="link-report-intra"]/span[1]/text()'
            '|//*[@id="link-report-intra"]/span[1]/span/text()').getall()
        description = ''.join(part.strip() for part in description)
        item = Db2Item()
        item['name'] = response.meta.get('name')
        item['rate'] = response.meta.get('rate')
        item['description'] = description
        yield item
# pipelines.py
import json
class Db2Pipeline:
    """Persist scraped items to db252.txt, one JSON object per line."""

    def open_spider(self, spider):
        # Opened once when the spider starts; closed in close_spider.
        self.f = open('db252.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as UTF-8 JSON and append it as a single line."""
        record = dict(item)
        self.f.write(json.dumps(record, ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        # Flush and release the output file when the spider finishes.
        self.f.close()