pyspider 爬豆瓣电影信息


  • 博客分类:

#!/usr/bin/env python 
# -*- encoding: utf-8 -*- 
# Created on 2015-11-23 10:50:38 
# Project: doubanmovie 
from pyspider.libs.base_handler import * 
import pymongo,sys 
from pymongo import MongoClient 
import re,time,random,hashlib 
from scrapy.selector import Selector 
from datetime import * 
class Handler(BaseHandler): 

    client = MongoClient(host="localhost",port=27017, read_preference=pymongo.read_preferences.ReadPreference.PRIMARY_PREFERRED) 
    db = client.edogdata 
    db.authenticate("database","passwd") 

伦理片 http://www.dotdy.com/

    @every(minutes=24*60) 
    def on_start(self): 
        self.crawl('http://movie.douban.com/tag/', callback=self.index_page)  #豆瓣电影标签 

    @config(age=10 * 24 * 60 * 60) 
    def index_page(self, response): 
        for each in response.doc('a[href^="http"]').items(): 
            if re.match("http://www.douban.com/tag/\w+", each.attr.href, re.U):   #  爱情,戏剧,刘德华,1990,惊悚,恐怖 。。。 
                self.crawl(each.attr.href, callback=self.list_page) 
                
    @config(age=2, priority=2) 
    def list_page(self, response): 
        #print "urlllll response",response.doc 
        movie =  response.doc('a[href^="http"]').items() 
        for each in movie: 
            if re.match("http://www.douban.com/link2/\W+",each.attr.href): 
                #print "each",each.text() 
                #print "each.attr.href",each.attr.href 
                if each.attr.href.find("movie") < 130: 
                    self.crawl(each.attr.href, callback=self.final_page) 
                    print "find movie" 

    def final_page(self,response): 
        for each in response.doc('a[href^="http"]').items(): 
            if re.match('http://movie.douban.com/\w+',each.attr.href,re.U): 
                self.crawl(each.attr.href,callback=self.detail_page) 
            #翻页 
        self.crawl([x.attr.href for x in response.doc('.next a').items()],callback=self.final_page) # .next 的 '.' 表示class=next  ,如果是 #next 的话,则表示 id=next ××××××××××××××××××××××××××××××××××××××××××××××××××× 
    #@config(priority=4) 
    def detail_page(self, response): 
        now = str(datetime.now()) 
        _id = hashlib.md5(response.doc('HTML>BODY>DIV#wrapper>DIV#content>H1>SPAN').text().encode('utf-8')).hexdigest() 
        site = response.url 
        name = response.doc("title").text().split('(')[0]#response.doc('HTML>BODY>DIV#wrapper>DIV#content>H1>SPAN[property="v:itemreviewed"]').text() 
        director = [x.text() for x in response.doc('a[rel="v:directedBy"]').items()] 
        release_date = [x.text() for x in response.doc('span[property="v:initialReleaseDate"]').items()] 
        actor = '/'.join([x.text() for x in response.doc('a[rel="v:starring"]').items()]) 
        rating = [x.text() for x in response.doc('strong[property="v:average"]').items()] 
        type = [x.text() for x in response.doc('span[property="v:genre"]').items()] 
        source = '豆瓣' 
        #IMDb_Link =  [x.attr.href for x in response.doc('a[rel="nofollow"]').items()]   
        IMDb_Link =  response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-16-8.clearfix>DIV.article>DIV.indent.clearfix>DIV.subjectwrap.clearfix>DIV.subject.clearfix>DIV#info>a[rel="nofollow"]').attr.href # HTML>BODY 可以不用 
        self.db.douban.insert({"_id":_id,"url":site , "title":name,"time":now,"director":director,"release_date": release_date,'actor':actor,"type": type,"source": source,"rating":rating,"IMDb_Link":IMDb_Link}) 
        return { 
            "url": response.url, 
            "title": name,#response.doc('HTML>BODY>DIV#wrapper>DIV#content>H1>SPAN[property="v:itemreviewed"]').text(),#property="v:itemreviewed
            "rating": rating,#response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-16-8.clearfix>DIV.article>DIV.indent.clearfix>DIV.subjectwrap.clearfix>DIV#interest_sectl>DIV.rating_wrap.clearbox>DIV.rating_self.clearfix>STRONG.ll.rating_num').text(),
            "导演": actor,#[x.text() for x in response.doc('a[rel="v:directedBy"]').items()], 
            "time": now, 
            "release_date" : release_date, 
            "actor" : actor, 
            "type" : type, 
            "IMDb_Link" : IMDb_Link 
影音先锋电影 http://www.iskdy .com/


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值