爬取影评信息

39 篇文章 3 订阅

##网页下载器

import requests
from http import cookiejar
import urllib

class HtmlDownloader():
    """Download pages over HTTP with browser-like headers and a cookie jar."""

    @staticmethod
    def cookie():
        """Read cookies from ``cookie.txt`` in the current working directory.

        The file content is split on ``;`` and each piece on its first ``=``,
        i.e. the ``name=value; name=value`` layout of a ``Cookie`` header.

        Returns:
            dict: cookie name -> cookie value.

        Raises:
            OSError: if ``cookie.txt`` cannot be opened.
        """
        cookies = {}
        with open('cookie.txt', 'r') as f:
            for segment in f.read().split(';'):
                segment = segment.strip()
                # A trailing ';' or newline leaves an empty piece; skipping
                # pieces without '=' avoids an unpacking ValueError.
                if '=' not in segment:
                    continue
                name, value = segment.split('=', 1)
                cookies[name] = value
        return cookies

    def download(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        Args:
            url: address to fetch; ``None`` is accepted and yields ``None``.

        Returns:
            str | None: the page text when the status code is 200,
            otherwise ``None``.
        """
        if url is None:
            return None
        # A bare ``import urllib`` does not guarantee that the ``request``
        # submodule is loaded, so import it explicitly where it is used.
        import urllib.request

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
            'Referer': r'http://movie.mtime.com',
            'Connection': 'keep-alive'
        }
        jar = cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
        # Priming request: its only purpose is to let the cookie processor
        # fill ``jar``; the response is closed right away so the socket is
        # not leaked.
        with opener.open(url):
            pass
        r = requests.get(url, headers=headers, cookies=jar)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None

##网页解析器

import re
import json

class HtmlParser():
    """Extract movie links and rating records from downloaded text."""

    def parser_url(self, page_url, response):
        """Collect the distinct movie page links found in ``response``.

        Args:
            page_url: address the text came from (not used for matching).
            response: page text, or ``None``/empty when the download failed.

        Returns:
            list[tuple[str, str]]: unique ``(movie_url, movie_id)`` pairs in
            arbitrary order; empty when there is nothing to scan.
        """
        if not response:
            return []
        # Dots are escaped so that only the literal host name matches.
        pattern = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
        return list(set(pattern.findall(response)))

    def parser_json(self, page_url, response):
        """Decode a ``name = {json};`` style response into a record tuple.

        The JSON document that follows the first ``=`` is decoded, then:
        a non-empty ``value`` without ``hotValue`` is handled by
        ``_parser_release``; one with ``hotValue`` by ``_parser_no_release``
        with ``isRelease=2``; an empty ``value`` by ``_parser_no_release``
        with its default flag.

        Args:
            page_url: request address, only used in diagnostic output.
            response: raw response text, or ``None``.

        Returns:
            tuple | None: a 14-field record, or ``None`` when the response
            holds no decodable payload or the record cannot be built.
        """
        if not response:
            return None
        eq_pos = response.find('=')
        if eq_pos == -1:
            return None
        try:
            # raw_decode reads exactly one JSON document and ignores what
            # follows it, so a ';' inside a string value (e.g. a title)
            # does not cut the payload short.
            value, _ = json.JSONDecoder().raw_decode(response[eq_pos + 1:].lstrip())
        except ValueError as e:
            print(e, page_url)
            return None
        try:
            payload = value.get('value')
        except AttributeError as e:
            # The top-level JSON value was not an object.
            print(e)
            return None
        if not payload:
            return self._parser_no_release(page_url, value)
        if payload.get('hotValue') is None:
            return self._parser_release(page_url, value)
        return self._parser_no_release(page_url, value, isRelease=2)

    @staticmethod
    def _rating_fields(payload):
        """Return the nine leading record fields shared by both record kinds."""
        rating = payload.get('movieRating')
        return (
            rating.get('MovieId'),
            payload.get('movieTitle'),
            rating.get('RatingFinal'),
            rating.get('ROtherFinal'),
            rating.get('RPictureFinal'),
            rating.get('RDirectorFinal'),
            rating.get('RStoryFinal'),
            rating.get('Usercount'),
            rating.get('AttitudeCount'),
        )

    def _parser_release(self, page_url, value):
        """Build a record with ``isRelease=1`` including box-office fields.

        Args:
            page_url: request address, only used in diagnostic output.
            value: decoded response object holding a ``value`` entry.

        Returns:
            tuple | None: the record, with placeholder box-office fields when
            ``boxOffice`` is absent; ``None`` (after printing the error) when
            a required part is missing or has an unexpected type.
        """
        try:
            isRelease = 1
            payload = value.get('value')
            head = self._rating_fields(payload)
            boxOffice = payload.get('boxOffice')
            if boxOffice is None:
                return head + (u'无', u'无', 0, 0, isRelease)
            total = boxOffice.get('TotalBoxOffice') + boxOffice.get('TotalBoxOfficeUnit')
            today = boxOffice.get('TodayBoxOffice') + boxOffice.get('TodayBoxOfficeUnit')
            # dict.get never raises for a missing key, so the fallback rank
            # has to be supplied as the default.
            rank = boxOffice.get('Rank', 0)
            return head + (total, today, rank, boxOffice.get('ShowDays'), isRelease)
        except (AttributeError, TypeError) as e:
            print(e, page_url, value)
            return None

    def _parser_no_release(self, page_url, value, isRelease=0):
        """Build a record without box-office data.

        Args:
            page_url: request address, only used in diagnostic output.
            value: decoded response object holding a ``value`` entry.
            isRelease: flag stored as the last field of the record.

        Returns:
            tuple | None: the record, with the rank taken from
            ``hotValue.Ranking`` (0 when unavailable); ``None`` (after
            printing the error) when the rating data cannot be read.
        """
        try:
            payload = value.get('value')
            head = self._rating_fields(payload)
            hotValue = payload.get('hotValue')
            rank = hotValue.get('Ranking', 0) if isinstance(hotValue, dict) else 0
            return head + (u'无', u'无', rank, 0, isRelease)
        except (AttributeError, TypeError) as e:
            print(e, page_url, value)
            return None

##数据存储器

import sqlite3

class DataOutput():
    """Buffer record tuples and write them to the ``MTime`` SQLite table."""

    def __init__(self, db_path='/home/as/test.db'):
        """Open the database and make sure the ``MTime`` table exists.

        Args:
            db_path: SQLite database location; defaults to the path the
                script has always used. ``':memory:'`` also works.
        """
        self.cx = sqlite3.connect(db_path)
        self.create_table('MTime')
        # Records waiting to be written by output_db().
        self.datas = []

    def create_table(self, table_name):
        """Create ``table_name`` with the rating schema if it is missing.

        Args:
            table_name: name of the table; interpolated directly into the
                SQL text, so it must be a trusted identifier.
        """
        values = '''
            id integer primary key,
            MovieId integer,
            MovieTitle varchar(40) NOT NULL,
            RatingFinal REAL NOT NULL DEFAULT 0.0,
            ROtherFinal REAL NOT NULL DEFAULT 0.0,
            RPictureFinal REAL NOT NULL DEFAULT 0.0,
            RDirectorFinal REAL NOT NULL DEFAULT 0.0,
            RStoryFinal REAL NOT NULL DEFAULT 0.0,
            Usercount integer NOT NULL DEFAULT 0,
            AttitudeCount integer NOT NULL DEFAULT 0,
            TotalBoxOffice varchar(20) NOT NULL,
            TodayBoxOffice varchar(20) NOT NULL,
            Rank integer NOT NULL DEFAULT 0,
            ShowDays integer NOT NULL DEFAULT 0,
            isRelease integer NOT NULL
            '''
        self.cx.execute('CREATE TABLE IF NOT EXISTS %s ( %s ) ' % (table_name, values))

    def store_data(self, data):
        """Queue one record and flush once more than 10 are buffered.

        Args:
            data: 14-field record tuple, or ``None`` (ignored).
        """
        if data is None:
            return
        self.datas.append(data)
        if len(self.datas) > 10:
            self.output_db('MTime')

    def output_db(self, table_name):
        """Insert every buffered record into ``table_name`` and commit.

        The buffer is emptied only after the whole batch was committed.
        Removing items while looping over the same list, as done before,
        skipped every other record.

        Args:
            table_name: destination table (trusted identifier).

        Raises:
            sqlite3.Error: if an insert fails; the batch is rolled back and
                the records stay in the buffer.
        """
        if not self.datas:
            return
        sql = ('INSERT INTO %s (MovieId,MovieTitle,RatingFinal,'
               'ROtherFinal,RPictureFinal,RDirectorFinal,RStoryFinal,'
               'Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,'
               'Rank,ShowDays,isRelease) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
               % table_name)
        # The connection context manager commits on success and rolls back
        # when an exception escapes the block.
        with self.cx:
            self.cx.executemany(sql, self.datas)
        del self.datas[:]

    def output_end(self):
        """Flush whatever is still buffered, then close the connection."""
        try:
            if len(self.datas) > 0:
                self.output_db('MTime')
        finally:
            self.cx.close()

##爬虫调度器

import time

class SpiderMan():
    """Scheduler that wires the downloader, parser and data store together."""

    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl ``root_url`` and store the rating record of each movie link.

        For every ``(movie_url, movie_id)`` pair found on the root page the
        rating service URL is built, downloaded, parsed and handed to the
        data store. A failure on one movie is printed and the loop goes on.
        When the crawl ends (normally or not) the store is finalized through
        ``output_end()`` so the last partial batch is written; the instance
        should therefore not be reused for a second crawl.

        Args:
            root_url: listing page to start from.
        """
        try:
            content = self.downloader.download(root_url)
            if content is None:
                # Nothing to parse; avoids feeding None to the regex scan.
                print('download failed: %s' % root_url)
                return
            urls = self.parser.parser_url(root_url, content)
            for url in urls:
                try:
                    # '%-m' / '%-d' are platform-specific extensions; the
                    # portable zero-padded directives give a fixed-width
                    # timestamp on every platform.
                    t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                    rank_url = ('http://service.library.mtime.com/Movie.api'
                                '?Ajax_CallBack=true'
                                '&Ajax_CallBackType=Mtime.Library.Services'
                                '&Ajax_CallBackMethod=GetMovieOverviewRating'
                                '&Ajax_CrossDomain=1'
                                '&Ajax_RequestUrl=%s'
                                '&t=%s'
                                '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1]))
                    rank_content = self.downloader.download(rank_url)
                    data = self.parser.parser_json(rank_url, rank_content)
                    self.output.store_data(data)
                except Exception as e:
                    # Best effort: one bad movie must not stop the crawl.
                    print(e)
        finally:
            self.output.output_end()
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('http://theater.mtime.com/China_Beijing/')

更多爬虫实例请见 https://blog.csdn.net/weixin_39777626/article/details/81564819

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值