>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
声明:仅学习参考
版本:verison_0
效果图:
![](https://i-blog.csdnimg.cn/blog_migrate/3438cfaa8af17c9c104c8aa9c51e9e2b.png)
说明:city_name需要手动输入,即当前城市的url需要手动构造
version_1待实现的功能:(1)通过输入城市名称,即可对应于对应url,而无需手动构造
(2)多线程(在循环电影数据的时候,可构造一个线程,通过for的方式,生成多线程,注意设置为守护线程,join)
流程:(1)输入城市名字(注意热门城市,是China_Beijing字样,非热门城市是China_Hubei_Province_Huanggang字样)
(2)获取当前目录下的所有上映的热门电影(如果不需要提取图片等信息,可直接抓取“<script type="text/javascript">var hotplaySvList”下的信息,包括id,url和title
(3)通过获取到的url和id构造详情页请求,获取主演,类型,描述,热门评论等信息
(4)票房或评分等信息在ajax请求(Movie.api....)中,通过id可构造相应的请求
注意点:(1)在每次请求前带上referer
(2)请求的频率不宜过快,且应该随机
(3)输入city_name后注意首次发起的请求是否成功,如果不成功,说明url没有构造好,应该在网站上找出规律,如.../China_Hubei_Province_Huanggang字样
源码:
from urllib import parse,error
import requests
from lxml import etree
import re
import json
from pprint import pprint
import time,random
class MtimeSpider(object):
def __init__(self):
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
def parse_ajax_content(self,response):
totalboxoffice = re.findall(r'"TotalBoxOffice":"(.*?)"',string=response,flags=re.S)
totalboxoffice = totalboxoffice[0] if totalboxoffice else None
totalboxofficeunit = re.findall(r'"TotalBoxOfficeUnit":"(.*?)"',string=response,flags=re.S) # 获取单位
totalboxofficeunit = totalboxofficeunit[0] if totalboxofficeunit else None
if totalboxoffice is not None and totalboxofficeunit is not None:
totalboxoffice = totalboxoffice+totalboxofficeunit
ratingfinall = re.findall(r'"RatingFinal":(.*?),',string=response,flags=re.S)
ratingfinall = ratingfinall[0] if ratingfinall else None
return totalboxoffice,ratingfinall
def parse_movie_detail(self,response):
html = etree.HTML(response)
# 获取分类和上映日期
cate = str()
category = html.xpath("//div[contains(@class,'db_head')]/div[last()]/a/text()")
for str_ in category[0:-1]:
cate += "/"+str_
showdata = category[-1]
# 获取描述
desc = html.xpath('//p[contains(@class,"mt6 lh18")]/text()')
description = desc[0] if desc else None
# 获取主演
main_actor= html.xpath("//dl[contains(@class,'main_actor')]/dd/a/img/@alt")
return cate,showdata,description,main_actor
def parse_hotplay(self,hot_play_list,referer):
items_list = list()
for hot_play in hot_play_list:
item = dict()
item['title'] = hot_play['Title']
item['href'] = hot_play['Url']
# 获取到单个电影的分类,描述,主演,总票房和评分
## 获取电影详情页的响应
### 获取分类,上映时间,描述,主演
movie_html_str = self.get_request(url=hot_play['Url'],referer=referer)
category,showdata,description,main_actor = self.parse_movie_detail(movie_html_str)
item["category"] = category
item['showdata'] = showdata
item['description'] = description
item['main_actor'] = main_actor
items_list.append(item)
# 获取总票房和评分
ajax_url = "http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CallBackArgument0={}".format(hot_play['Id'])
ajax_content = self.get_request(url=ajax_url,referer=hot_play['Url'])
totalboxoffice,ratingfinall = self.parse_ajax_content(response=ajax_content)
item['totalboxoffice'] = totalboxoffice
item['ratingfinall'] = ratingfinall
time.sleep(random.random())
return items_list
def parse_city_request(self,response):
useful_info = re.findall(pattern=r'hotplaySvList\s+=\s+(.*?);',string=response,flags=re.S)
useful_info = useful_info[0] if useful_info else None
return json.loads(s=useful_info,encoding='utf-8')
def get_request(self,url,referer=None):
headers = {
"User-Agent":self.user_agent,
"Referer":referer,
}
try:
response = requests.get(url=url,headers=headers)
if response.status_code == 200 and response.text is not None:
return response.content.decode()
except error.HTTPError as e:
print(e)
def run(self,city_name):
# 获取当前城市正在热映的电影
url = "http://theater.mtime.com/China_{}/".format(city_name)
city_html_str = self.get_request(url,referer='http://www.mtime.com/')
hot_play_list = self.parse_city_request(city_html_str) # 拿到当前城市的所有热映电影,返回列表
# 进入电影详情页,获取电影标题,分类,描述,主演,总票房
items_list = self.parse_hotplay(hot_play_list,referer=url)
pprint(items_list)
if __name__=="__main__":
# city_name = input("请输入城市(拼音形式,首字母大写):")
city_name = "Huangshi"
obj = MtimeSpider()
obj.run(city_name=city_name)
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<