Scraping Douban Movies with Python

If your IP gets banned while scraping Douban Movies, switch to another network (a mobile hotspot, for example), or use paid proxy IPs.
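
If you do use paid proxies, the usual Scrapy approach is to set request.meta['proxy'] in a downloader middleware; the built-in HttpProxyMiddleware then routes the request through it. A minimal sketch, with a placeholder middleware name and proxy address that are not part of this project:

# middlewares.py (sketch): attach a proxy to every outgoing request
import random

PROXIES = ['http://user:pass@proxy.example.com:8080']  # placeholder; use your provider's addresses

class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy'].
        request.meta['proxy'] = random.choice(PROXIES)

Register it in settings.py via DOWNLOADER_MIDDLEWARES, e.g. {'NewDouBan.middlewares.RandomProxyMiddleware': 543}.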

Scrape all movies under the 华语 (Chinese), 欧美 (Western), 韩国 (Korean), and 日本 (Japanese) tags on Douban Movies, sorted by rating. The following information is needed:

(1) For each movie: title, director, screenwriter, main cast, genre, country, release date, runtime, overall rating, and the percentage breakdown of each star level.

(2) The first 100 ratings among each movie's popular short comments, together with the users who posted them.

(3) Visit each rater's profile page and scrape the movies they have watched and their ratings of those movies (scrape all of them if there are fewer than 300, otherwise only the first 300).

The project is written with the Scrapy framework.
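
The spider defined below is named douban, so besides running scrapy crawl douban from the project directory, it can also be launched from a small helper script (a convenience sketch, not part of the original post):

# run.py (sketch): start the 'douban' spider programmatically
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'douban'])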

The items.py file

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MovieInfo(scrapy.Item):
    # One movie's detail page: title, director, screenwriter, cast, genre,
    # country, release date, runtime, rating, and per-star rating percentages.
    name = scrapy.Field()
    director = scrapy.Field()
    adaptor = scrapy.Field()      # screenwriter
    rolename = scrapy.Field()     # main cast
    type = scrapy.Field()         # genre
    country = scrapy.Field()
    date = scrapy.Field()         # release date
    length = scrapy.Field()       # runtime
    grade = scrapy.Field()        # overall rating
    startrate = scrapy.Field()    # percentage of each star level
    url = scrapy.Field()


class MovieContent(scrapy.Item):
    # One short comment under a movie: the movie title, the commenter, and the rating.
    title = scrapy.Field()
    people = scrapy.Field()
    grade = scrapy.Field()


class MovieSelf(scrapy.Item):
    # A commenter's own profile: the movies they have watched and their ratings.
    user = scrapy.Field()
    movie = scrapy.Field()
    selfgrade = scrapy.Field()

The douban.py file

# -*- coding: utf-8 -*-
import json
from urllib.parse import urlencode  # Python 3 location (was `from urllib import urlencode` on Python 2)

import scrapy

from NewDouBan.items import MovieInfo, MovieContent, MovieSelf


class NewDouBan(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['douban.com']
    # Query parameters for Douban's JSON listing endpoint: start with the 华语 tag,
    # sorted by rating, 20 subjects per page.
    data = {
        'type': 'movie',
        'tag': '华语',
        'sort': 'rank',
        'page_limit': 20,
        'page_start': 0
    }
    start_urls = ['https://movie.douban.com/j/search_subjects?' + urlencode(data)]


    def parse(self, response):
        # The listing endpoint returns JSON; each subject carries its detail-page URL.
        subjects = json.loads(response.text)["subjects"]
        for each in subjects:
            yield scrapy.Request(each['url'], callback=self.parse_item)

    # Parse one movie's detail page.
    def parse_item(self, response):
        item = MovieInfo()

        # Title
        item['name'] = response.xpath("//h1/span[@property]/text()").extract()[0]
        # Main cast
        item['rolename'] = response.xpath("//div[@id='info']//span[@class]/a[@rel='v:starring']/text()").extract()
        # Director
        item['director'] = response.xpath("//div[@id='info']//span[@class='attrs']/a[@rel]/text()").extract()[0]
        # Genres
        item['type'] = response.xpath("//div[@id='info']//span[@property='v:genre']/text()").extract()
        # Country: assumes it appears as plain text right after the "制片国家/地区:" label
        # in #info (it is not wrapped in a property span).
        item['country'] = response.xpath(
            "//div[@id='info']//span[contains(text(), '制片国家/地区')]/following::text()[1]").extract()
        # Release date and runtime
        item['date'] = response.xpath("//div[@id='info']//span[@property='v:initialReleaseDate']/text()").extract()
        item['length'] = response.xpath("//div[@id='info']//span[@property='v:runtime']/text()").extract()
        # Overall rating
        item['grade'] = response.xpath("//strong/text()").extract()
        # Percentage of each star level (5 stars down to 1 star)
        star = response.xpath("//div[@class='ratings-on-weight']/div/span/text()").extract()
        rates = []
        for it in star:
            rates.append(it.replace("\n", "").replace(" ", ""))
        item['startrate'] = rates
        # Detail-page URL
        item['url'] = response.url
        yield item
        # Follow the link to this movie's short-comment page.
        yield scrapy.Request(item['url'] + "comments?status=P", callback=self.parse_item2)

    def parse_item2(self, response):
        # The <title> of the comment page contains the movie name.
        title = response.xpath("//title/text()").extract()[0]
        # Walk every short comment on this page.
        for p in response.xpath("//div[@class='comment-item']"):
            item2 = MovieContent()
            item2['title'] = title
            item2['people'] = p.xpath(".//span[@class='comment-info']//a/text()").extract()[0]
            item2['grade'] = p.xpath(".//span[@class='comment-info']//span[2]/@title").extract()[0]
            # If the commenter did not rate, the second span holds a date such as
            # "2018-01-01"; mark the rating as missing in that case.
            if item2['grade'].find("-") > 0:
                item2['grade'] = 'nothing'
            yield item2
        # Follow each commenter's profile page.
        user_urls = response.xpath("//div[@id='comments']//div[@class='avatar']/a/@href").extract()
        for url in user_urls:
            yield scrapy.Request(url, callback=self.parse_item3)
        #
    def parse_item3(self, response):
        # A commenter's profile page: the <title> holds the user name and the
        # movie section lists the films they have watched.
        item3 = MovieSelf()
        item3['user'] = response.xpath("//title/text()").extract()
        # The per-movie rating is not extracted here; left as a placeholder.
        item3['selfgrade'] = 'nothing'
        item3['movie'] = response.xpath("//div[@id='movie']//div[@class='obssin'][2]/ul/li/a/@title").extract()
        yield item3
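
The spider above only requests the first 20 subjects of the 华语 tag. To cover all four tags and page through the listing as the task requires, start_requests can issue one request per tag and keep incrementing page_start until the endpoint returns an empty subjects list. A sketch of the two methods that would replace the class-level start_urls and the parse method above (the stop condition is an assumption about the endpoint's behaviour):

    # Sketch: walk all four tags and follow page_start until a page comes back empty.
    def start_requests(self):
        for tag in ['华语', '欧美', '韩国', '日本']:
            params = dict(self.data, tag=tag, page_start=0)
            url = 'https://movie.douban.com/j/search_subjects?' + urlencode(params)
            yield scrapy.Request(url, callback=self.parse, meta={'params': params})

    def parse(self, response):
        params = response.meta['params']
        subjects = json.loads(response.text)['subjects']
        for each in subjects:
            yield scrapy.Request(each['url'], callback=self.parse_item)
        if subjects:  # an empty list is assumed to mean the tag is exhausted
            params['page_start'] += params['page_limit']
            next_url = 'https://movie.douban.com/j/search_subjects?' + urlencode(params)
            yield scrapy.Request(next_url, callback=self.parse, meta={'params': params})

Scrapy's default duplicate filter keeps a movie that appears under more than one tag from being fetched twice.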

The pipelines.py file

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class NewdoubanPipeline(object):

    def __init__(self):
        # One line-numbered JSON-lines file per item type.
        self.count_info = self.count_comment = self.count_user = 0
        self.file_info = open("douban.json", "w", encoding="utf-8")            # MovieInfo
        self.file_comment = open("doubancomment.json", "w", encoding="utf-8")  # MovieContent
        self.file_user = open("doubankanguo.json", "w", encoding="utf-8")      # MovieSelf

    def process_item(self, item, spider):
        data = dict(item)
        line = json.dumps(data, ensure_ascii=False) + '\n'
        if 'people' in data:      # MovieContent: a short comment and its rating
            self.count_comment += 1
            self.file_comment.write(str(self.count_comment) + line)
        elif 'type' in data:      # MovieInfo: the movie's detail fields
            self.count_info += 1
            self.file_info.write(str(self.count_info) + line)
        else:                     # MovieSelf: a commenter's watched movies
            self.count_user += 1
            self.file_user.write(str(self.count_user) + line)
        return item

    def close_spider(self, spider):
        self.file_info.close()
        self.file_comment.close()
        self.file_user.close()
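
The pipeline only runs if it is registered in settings.py, and slowing the crawl down also helps with the IP bans mentioned at the top. A minimal sketch of the relevant settings (the module path assumes the default layout of a project named NewDouBan; the delay and user agent values are placeholders):

# settings.py (sketch)
ITEM_PIPELINES = {
    'NewDouBan.pipelines.NewdoubanPipeline': 300,
}
DOWNLOAD_DELAY = 2            # pause between requests; be gentle to avoid bans
ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0'    # placeholder; use a realistic browser UA string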
