下载器
import requests
class HtmlDownloader(object):
    """Fetch page text over HTTP while sending a desktop Firefox User-Agent."""

    def download(self, url, timeout=10):
        """Download ``url`` and return its body decoded as UTF-8.

        Args:
            url: Address to fetch; ``None`` is accepted and yields ``None``.
            timeout: Seconds passed to ``requests.get`` so that a stalled
                server cannot block the caller indefinitely.

        Returns:
            The response text when the status code is 200, otherwise ``None``.

        Raises:
            requests.RequestException: Network errors (including timeouts)
                are not caught here and propagate to the caller.
        """
        if url is None:
            return None
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'
        headers = {'User-Agent': user_agent}
        r = requests.get(url, headers=headers, timeout=timeout)
        if r.status_code == 200:
            # Force UTF-8 instead of relying on the encoding requests guesses.
            r.encoding = 'utf-8'
            return r.text
        return None
解析器
import re
import json
class HtmlParser(object):
    """Extract movie links and rating/box-office records from Mtime replies."""

    def parser_url(self, page_url, response):
        """Collect the distinct movie links contained in a page.

        Args:
            page_url: Address the page came from (not used for matching).
            response: Page text, or ``None``/empty when nothing was downloaded.

        Returns:
            A list of unique ``(url, movie_id)`` tuples; an empty list when
            ``response`` is empty or contains no movie link.
        """
        if not response:
            return []
        pattern = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
        # dict.fromkeys removes duplicates (and keeps first-seen order on
        # Python 3.7+), which makes the crawl order reproducible.
        return list(dict.fromkeys(pattern.findall(response)))

    def parser_json(self, page_url, response):
        """Parse a rating reply of the form ``var name = {...};var ...``.

        The JSON document that follows the first ``=`` is decoded with
        ``raw_decode`` so that a ``;`` inside a string value does not cut the
        payload short and trailing JavaScript is ignored.

        Args:
            page_url: Address the reply came from (passed on for logging).
            response: Reply text, or ``None``/empty when nothing was downloaded.

        Returns:
            The 14-field tuple built by ``_parser_release`` or
            ``_parser_no_release``, or ``None`` when the reply is missing,
            is not valid JSON, or lacks the expected ``value`` object.
        """
        if not response:
            return None
        start = response.find('=')
        if start == -1:
            return None
        try:
            # raw_decode does not skip leading whitespace, hence lstrip().
            value, _ = json.JSONDecoder().raw_decode(response[start + 1:].lstrip())
            info = value.get('value')
            is_release = info.get('isRelease')
            hot_value = info.get('hotValue')
        except (ValueError, AttributeError) as e:
            print(e)
            return None
        if not is_release:
            return self._parser_no_release(page_url, value)
        if hot_value is None:
            return self._parser_release(page_url, value)
        # Flagged as released but still carrying a hotValue entry: reported
        # through the no-release layout with status code 2.
        return self._parser_no_release(page_url, value, isRelease=2)

    def _rating_fields(self, info):
        """Return the nine id/title/rating fields shared by both layouts."""
        rating = info.get('movieRating')
        return (
            rating.get('MovieId'),
            info.get('movieTitle'),
            rating.get('RatingFinal'),
            rating.get('ROtherFinal'),
            rating.get('RPictureFinal'),
            rating.get('RDirectorFinal'),
            rating.get('RStoryFinal'),
            rating.get('Usercount'),
            rating.get('AttitudeCount'),
        )

    def _parser_release(self, page_url, value):
        """Build the record for a movie that has box-office data.

        Args:
            page_url: Address the data came from (only used in the error log).
            value: Decoded reply; its ``value`` entry must hold
                ``movieRating`` and ``boxOffice`` objects.

        Returns:
            ``(MovieId, movieTitle, RatingFinal, ROtherFinal, RPictureFinal,
            RDirectorFinal, RStoryFinal, Usercount, AttitudeCount,
            total box office + unit, today's box office + unit, Rank,
            ShowDays, 1)``, or ``None`` when a required field is missing.
        """
        try:
            info = value.get('value')
            box_office = info.get('boxOffice')
            total = box_office.get('TotalBoxOffice') + box_office.get('TotalBoxOfficeUnit')
            today = box_office.get('TodayBoxOffice') + box_office.get('TodayBoxOfficeUnit')
            return self._rating_fields(info) + (
                total,
                today,
                box_office.get('Rank', 0),  # 0 when the reply has no rank
                box_office.get('ShowDays'),
                1,
            )
        except (AttributeError, TypeError) as e:
            # AttributeError: a nested object is missing; TypeError: an
            # amount or unit is missing so the concatenation failed.
            print(e, page_url, value)
            return None

    def _parser_no_release(self, page_url, value, isRelease=0):
        """Build the record for a movie without box-office data.

        Args:
            page_url: Address the data came from (only used in the error log).
            value: Decoded reply; its ``value`` entry must hold a
                ``movieRating`` object.
            isRelease: Status code stored as the last field of the record.

        Returns:
            The same 14-field layout as ``_parser_release`` with both
            box-office fields set to ``u'无'``, Rank taken from
            ``hotValue.Ranking`` (0 when there is no ``hotValue`` object) and
            ShowDays set to 0; ``None`` when a required field is missing.
        """
        try:
            info = value.get('value')
            hot_value = info.get('hotValue')
            rank = hot_value.get('Ranking') if isinstance(hot_value, dict) else 0
            return self._rating_fields(info) + (u'无', u'无', rank, 0, isRelease)
        except (AttributeError, TypeError) as e:
            print(e, page_url, value)
            return None
爬虫调度器:
import HtmlDownloader
import HtmlParser
import time
class SpiderMan(object):
    """Scheduler that ties the downloader and the parser together."""

    def __init__(self):
        """Create the downloader and parser used by ``crawl``."""
        self.downloader = HtmlDownloader.HtmlDownloader()
        self.parser = HtmlParser.HtmlParser()

    def crawl(self, root_url):
        """Print the parsed rating record of every movie linked from a page.

        The page at ``root_url`` is downloaded, its movie links are
        extracted, and for each link the rating service is queried and the
        parsed result printed. A failure on one movie is reported and does
        not stop the remaining ones.

        Args:
            root_url: Address of the listing page to start from.
        """
        content = self.downloader.download(root_url)
        if content is None:
            # Nothing was downloaded, so there is nothing to parse.
            print('Crawl failed', root_url)
            return
        urls = self.parser.parser_url(root_url, content)
        # Each entry is a (movie page url, movie id) pair.
        for url in urls or ():
            try:
                # Timestamp token sent as "t": date and time down to the
                # second followed by the fixed suffix 3313.
                t = time.strftime("%Y%m%d%H%M%S3313", time.localtime())
                rank_url = (
                    'http://service.library.mtime.com/Movie.api'
                    '?Ajax_CallBack=true'
                    '&Ajax_CallBackType=Mtime.Library.Services'
                    '&Ajax_CallBackMethod=GetMovieOverviewRating'
                    '&Ajax_CrossDomain=1'
                    '&Ajax_RequestUrl=%s&t=%s&Ajax_CallBackArgument0=%s'
                ) % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                print(data)
            except Exception as e:
                # Best effort per movie: report which link failed and why,
                # then continue with the next one.
                print('Crawl failed', url, e)
if __name__ == '__main__':
    # Script entry point: run one crawl starting from the hard-coded page.
    SpiderMan().crawl('http://theater.mtime.com/China_Beijing/')
#var result_2018112321354583313 = { "value":{"isRelease":true,"movieRating":{"MovieId":217130,"RatingFinal":7.9,"RDirectorFinal":8.2,"ROtherFinal":7.3,"RPictureFinal":8.5,"RShowFinal":0,"RStoryFinal":7.7,"RTotalFinal":0,"Usercount":4754,"AttitudeCount":2720,"UserId":0,"EnterTime":0,"JustTotal":0,"RatingCount":0,"TitleCn":"","TitleEn":"","Year":"","IP":0},"movieTitle":"比利·林恩的中场战事","tweetId":0,"userLastComment":"","userLastCommentUrl":"","releaseType":3,"boxOffice":{"Rank":0,"TotalBoxOffice":"1.65","TotalBoxOfficeUnit":"亿","TodayBoxOffice":"0.0","TodayBoxOfficeUnit":"万","ShowDays":0,"EndDate":"2017-11-10 05:15","FirstDayBoxOffice":"2543.81","FirstDayBoxOfficeUnit":"万"}},"error":null};var movieOverviewRatingResult=result_2018112321354583313;
"""http://service.library.mtime.com/Movie.api
?Ajax_CallBack=true
&Ajax_CallBackType=Mtime.Library.Services
&Ajax_CallBackMethod=GetMovieOverviewRating
&Ajax_CrossDomain=1
&Ajax_RequestUrl=http://movie.mtime.com/217130/
&t=2018112321354583313
&Ajax_CallBackArgument0=217130
"""
爬取结果: