爬虫---爬取豆瓣-科幻片-排行

scrapy

 

movie.py

# -*- coding: utf-8 -*-
import scrapy
import json
import re

from douban.items import DoubanItem


class MovieSpider(scrapy.Spider):
    """Crawl Douban's movie chart through its paginated JSON endpoint.

    Every response body is parsed as a JSON array of movie records. One
    ``DoubanItem`` is yielded per record, and the next page is requested by
    advancing the ``start`` query parameter by 20.
    """

    name = 'movie'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=0&limit=20']

    def parse(self, response):
        """Yield one item per movie on the page, then the next-page request.

        Args:
            response: Response whose body is a JSON array of dicts carrying
                the ``rank``, ``title``, ``score`` and ``vote_count`` keys.

        Yields:
            ``DoubanItem`` objects, followed by at most one ``scrapy.Request``
            for the following page.
        """
        datas = json.loads(response.body)
        if datas:
            for data in datas:
                # Build a fresh item per record: re-yielding a single shared
                # instance would make every emitted item alias the same
                # mutable object.
                item = DoubanItem()
                item['movie_rank'] = data['rank']
                item['movie_name'] = data['title']
                item['movie_score'] = data['score']
                item['movie_people'] = data['vote_count']
                yield item

        # The current offset is carried in the URL's ``start`` parameter.
        match = re.search(r'start=(\d+)', response.url)
        if match is None:
            self.logger.warning('No start offset in %s; stopping pagination', response.url)
            return

        offset = int(match.group(1))
        # Keep paginating while the current offset is below 201, so the last
        # page requested is the one starting at 220.
        if offset < 201:
            url = 'https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=' + str(offset + 20) + '&limit=20'
            self.logger.info('Next page: %s', url)
            yield scrapy.Request(url, callback=self.parse)


 

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DoubanItem(scrapy.Item):
    """Item holding the four values scraped for a single movie record."""

    # Filled by the spider from the record's 'rank' value.
    movie_rank = scrapy.Field()
    # Filled by the spider from the record's 'title' value.
    movie_name = scrapy.Field()
    # Filled by the spider from the record's 'score' value.
    movie_score = scrapy.Field()
    # Filled by the spider from the record's 'vote_count' value.
    movie_people = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class DoubanPipeline(object):
    """Append every scraped movie to ``./douban_movie.txt``, one line each."""

    def process_item(self, item, spider):
        """Write ``item`` as a text line and hand it on to later pipelines.

        The line layout is the rank left-justified to three characters,
        then the movie name and the score, separated by single spaces.

        Args:
            item: Mapping providing ``movie_rank``, ``movie_name`` and
                ``movie_score``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``; returning it keeps it flowing to any later
            pipeline instead of replacing it with ``None``.
        """
        # Local import: this module has no import section of its own.
        import io

        # Build the line as text and let the file object encode it once, so
        # the code works on both Python 2 and 3 and also accepts numeric
        # scores.
        line = u'{0} {1} {2}\n'.format(
            str(item['movie_rank']).ljust(3, ' '),
            item['movie_name'],
            item['movie_score'],
        )
        with io.open('./douban_movie.txt', 'a', encoding='utf-8') as f:
            f.write(line)
        return item

 

main.py

# -*- coding:utf-8 -*-
from scrapy import cmdline
# Truncate (or create) the output file so each run starts from an empty file;
# the pipeline only ever appends to it. The context manager closes the handle
# without binding a name that shadows the builtin ``file``.
with open('./douban_movie.txt', 'w+'):
    pass

# Launch the crawl exactly as the command line ``scrapy crawl movie`` would.
cmdline.execute('scrapy crawl movie'.split())

保存结果   txt文件

1       盗梦空间             9.3
2 机器人总动员 9.3
3 星际穿越 9.2
4 楚门的世界 9.2
5 超感猎杀:完结特别篇 9.2
6 蝙蝠侠:黑暗骑士 9.1
7 攻壳机动队2:无罪 9.1
 

转载于:https://www.cnblogs.com/wozuilang-mdzz/p/9740418.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值