动态网站爬虫框架

最新推荐文章于 2024-04-28 11:28:16 发布

DMU_lzq1996

最新推荐文章于 2024-04-28 11:28:16 发布

阅读量317

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/DMU_lzq1996/article/details/84423101

版权

爬虫专栏收录该内容

17 篇文章 0 订阅

订阅专栏

下载器（保存为 HtmlDownloader.py，供调度器 import HtmlDownloader 使用）

import requests

class HtmlDownloader(object):
    """Fetch pages over HTTP while presenting a desktop Firefox User-Agent."""

    def download(self, url, timeout=10):
        """Return the body of *url* decoded as UTF-8, or None.

        Args:
            url: Address to fetch. ``None`` short-circuits to ``None``.
            timeout: Seconds handed to ``requests.get``. Without a timeout a
                stalled server would block the whole crawl indefinitely.

        Returns:
            The response text when the status code is 200, otherwise None.

        Raises:
            Whatever ``requests.get`` raises (timeouts, connection errors);
            these are left for the caller to handle.
        """
        if url is None:
            return None
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'
        headers = {'User-Agent': user_agent}
        r = requests.get(url, headers=headers, timeout=timeout)
        if r.status_code == 200:
            # Force UTF-8 before reading .text so the body is decoded with it.
            r.encoding = 'utf-8'
            return r.text
        return None

解析器（保存为 HtmlParser.py，供调度器 import HtmlParser 使用）

import re
import json

class HtmlParser(object):
    """Extract movie links and rating/box-office data from Mtime responses."""

    def parser_url(self, page_url, response):
        """Return the unique movie links found in *response*.

        Args:
            page_url: URL the page was fetched from (not used here).
            response: HTML text to scan; ``None`` or empty yields ``[]``.

        Returns:
            A list of ``(url, movie_id)`` tuples in first-seen order.
        """
        if not response:
            return []
        pattern = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
        # dict.fromkeys removes duplicates while keeping a stable order.
        return list(dict.fromkeys(pattern.findall(response)))

    def parser_json(self, page_url, response):
        """Parse a rating-service response into a result tuple.

        The payload is taken as the text between the first ``=`` and the
        following ``;`` and is decoded as JSON.

        Args:
            page_url: URL of the request; only used in diagnostics.
            response: Raw text returned by the rating service.

        Returns:
            A 14-item tuple (see ``_parser_release``), or None when the
            response is empty, has no payload, or cannot be decoded.
        """
        if not response:
            return None
        match = re.search(r'=(.*?);', response)
        if match is None:
            return None
        try:
            value = json.loads(match.group(1))
            movie = value.get('value')
            isRelease = movie.get('isRelease')
        except (ValueError, AttributeError) as e:
            # Invalid JSON, or a payload without the expected "value" object.
            print(e)
            return None
        if not isRelease:
            return self._parser_no_release(page_url, value)
        if movie.get('hotValue') is None:
            return self._parser_release(page_url, value)
        # Flagged as released but still carrying a hotValue block: handled
        # like an unreleased title, tagged with isRelease=2.
        return self._parser_no_release(page_url, value, isRelease=2)

    def _rating_fields(self, value):
        """Return the nine id/title/rating fields shared by both result tuples."""
        movie = value.get('value')
        movieRating = movie.get('movieRating')
        return (
            movieRating.get('MovieId'),
            movie.get('movieTitle'),
            movieRating.get('RatingFinal'),
            movieRating.get('ROtherFinal'),
            movieRating.get('RPictureFinal'),
            movieRating.get('RDirectorFinal'),
            movieRating.get('RStoryFinal'),
            movieRating.get('Usercount'),
            movieRating.get('AttitudeCount'),
        )

    def _parser_release(self, page_url, value):
        """Build the result tuple for a movie that has box-office data.

        Returns:
            ``(MovieId, movieTitle, RatingFinal, ROtherFinal, RPictureFinal,
            RDirectorFinal, RStoryFinal, Usercount, AttitudeCount,
            total box office, today box office, Rank, ShowDays, 1)``,
            or None when an expected field is missing.
        """
        try:
            boxOffice = value.get('value').get('boxOffice')
            # Amount and unit are both strings in the payload; join them.
            total = boxOffice.get('TotalBoxOffice') + boxOffice.get('TotalBoxOfficeUnit')
            today = boxOffice.get('TodayBoxOffice') + boxOffice.get('TodayBoxOfficeUnit')
            return self._rating_fields(value) + (
                total,
                today,
                boxOffice.get('Rank', 0),  # a missing rank falls back to 0
                boxOffice.get('ShowDays'),
                1,
            )
        except (AttributeError, TypeError) as e:
            print(e, page_url, value)
            return None

    def _parser_no_release(self, page_url, value, isRelease=0):
        """Build the result tuple for a movie without box-office data.

        Box-office slots are filled with placeholder text and ShowDays with 0;
        Rank comes from ``hotValue.Ranking`` when that block is present.

        Returns:
            A 14-item tuple shaped like the one from ``_parser_release``,
            ending in *isRelease*, or None when an expected field is missing.
        """
        try:
            fields = self._rating_fields(value)
            try:
                Rank = value.get('value').get('hotValue').get('Ranking')
            except AttributeError:
                # No hotValue block in the payload.
                Rank = 0
            return fields + (u'无', u'无', Rank, 0, isRelease)
        except (AttributeError, TypeError) as e:
            print(e, page_url, value)
            return None

爬虫调度器：

import HtmlDownloader
import HtmlParser
import time

class SpiderMan(object):
    """Scheduler that ties the downloader and the parser together."""

    def __init__(self):
        """Create the downloader and parser used by ``crawl``."""
        self.downloader = HtmlDownloader.HtmlDownloader()
        self.parser = HtmlParser.HtmlParser()

    def crawl(self, root_url):
        """Print the rating data of every movie linked from *root_url*.

        Downloads the root page, extracts the movie links, then requests and
        parses the rating service once per movie. A failure for one movie is
        reported and does not stop the remaining ones.

        Args:
            root_url: Listing page whose movie links should be followed.
        """
        content = self.downloader.download(root_url)
        if content is None:
            # Nothing to parse when the root page could not be fetched.
            print('Crawl failed')
            return
        urls = self.parser.parser_url(root_url, content)
        # Build the rating / box-office request for each (url, movie_id) pair.
        for url in urls or ():
            try:
                # Timestamp token for the "t" query parameter: date and time
                # digits followed by a fixed "3313" suffix.
                t = time.strftime("%Y%m%d%H%M%S3313", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CrossDomain=1&Ajax_RequestUrl=%s&t=%s&Ajax_CallBackArgument0=%s'%(url[0],t,url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                print(data)
            except Exception as e:
                # Best effort per movie: report which one failed and why.
                print('Crawl failed', url, e)

if __name__ == '__main__':
    # Script entry point: crawl the Beijing theater listing page.
    SpiderMan().crawl('http://theater.mtime.com/China_Beijing/')
        
#var result_2018112321354583313 = { "value":{"isRelease":true,"movieRating":{"MovieId":217130,"RatingFinal":7.9,"RDirectorFinal":8.2,"ROtherFinal":7.3,"RPictureFinal":8.5,"RShowFinal":0,"RStoryFinal":7.7,"RTotalFinal":0,"Usercount":4754,"AttitudeCount":2720,"UserId":0,"EnterTime":0,"JustTotal":0,"RatingCount":0,"TitleCn":"","TitleEn":"","Year":"","IP":0},"movieTitle":"比利·林恩的中场战事","tweetId":0,"userLastComment":"","userLastCommentUrl":"","releaseType":3,"boxOffice":{"Rank":0,"TotalBoxOffice":"1.65","TotalBoxOfficeUnit":"亿","TodayBoxOffice":"0.0","TodayBoxOfficeUnit":"万","ShowDays":0,"EndDate":"2017-11-10 05:15","FirstDayBoxOffice":"2543.81","FirstDayBoxOfficeUnit":"万"}},"error":null};var movieOverviewRatingResult=result_2018112321354583313;
"""http://service.library.mtime.com/Movie.api
?Ajax_CallBack=true
&Ajax_CallBackType=Mtime.Library.Services
&Ajax_CallBackMethod=GetMovieOverviewRating
&Ajax_CrossDomain=1
&Ajax_RequestUrl=http://movie.mtime.com/217130/
&t=2018112321354583313
&Ajax_CallBackArgument0=217130 
"""

爬取结果：
在这里插入图片描述

DMU_lzq1996

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
动态网站爬虫框架

下载器import requestsclass HtmlDownloader(object): def download(self,url): if url is None: return None user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) ...
复制链接

扫一扫

专栏目录