猫眼电影专业版数据爬取
最近学习了一点简单的爬虫,利用山海鲸APP做了一个简单的可视化界面,接下来把我的内容分享给大家。
1.导入包
以下是这次爬取数据需要用到的包。当然也可以使用XPath或者BeautifulSoup,凭个人喜好选择。
import requests
import csv
from fake_useragent import UserAgent # ⽣成随机User-Agent以避免被识别为⾃动化脚本
import json
2.关于函数
这里存储数据的文件格式为.csv。
因为我的代码没有进行优化,重复性很高,所以封装了一个存储数据的函数,如下所示。
def append_data_to_csv(data, path):
    """Append a batch of row dicts to a CSV file, writing the header once.

    Args:
        data: List of dicts that share the same keys. The keys of the first
            dict become the CSV column names. An empty list is a no-op.
        path: Destination CSV file. It is opened in append mode as UTF-8 and
            its parent directory is created when missing.
    """
    # Local import so the function does not depend on extra file-level imports.
    import os

    # Nothing to write; this also avoids an IndexError on data[0] below.
    if not data:
        return

    # Create the target directory on first use so open() does not fail.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "a", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(data[0]))
        # A position of 0 right after opening in append mode means the file
        # is still empty, so the header row has not been written yet.
        if csv_file.tell() == 0:
            writer.writeheader()
        writer.writerows(data)
3.访问猫眼电影专业版
这里访问的是动态网页,请求的URL是在浏览器开发者工具的Network面板中通过Fetch/XHR筛选找到的,如下图所示。
代码如下:
# Fetch the comprehensive box office data.
# Use a random User-Agent so the request looks like it comes from a regular browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}
# NOTE(review): the query string was captured from the browser's Network panel;
# timeStamp/signKey presumably expire, so the URL may need refreshing - confirm.
url = 'https://piaofang.maoyan.com/dashboard-ajax?orderType=0&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717473044447&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=647&channelId=40009&sVersion=2&signKey=5de146e3635549eaa1a765c2ac929c97'
# Always pass a timeout so a stalled connection cannot hang the script.
response = requests.get(url, headers=headers, timeout=10)
if response.ok:
    print(response.status_code)
else:
    print(response.status_code, "请求失败")
4.获取电影票房信息
# Decode the JSON body and keep only the list of per-movie box office records.
html = response.text
data = json.loads(html)["movieList"]["data"]["list"]
data数据样式如下:
[{'avgSeatView': '2.9%',
'avgShowView': '4.2',
'boxRate': '53.3%',
'boxSplitUnit': {'num': '.',
'unit': '万'},
'movieInfo': {'movieId': 1371016,
'movieName': '云边有个小卖部',
'releaseInfo': '上映4天'},
'showCount': 115665,
'showCountRate': '30.9%',
'splitBoxRate': '53.3%',
'splitBoxSplitUnit': {'num': '.',
'unit': '万'},
'sumBoxDesc': '1.97亿',
'sumSplitBoxDesc': '1.78亿'},
{'avgSeatView': '1.7%',
'avgShowView': '2.3',
'boxRate': '15.0%',
'boxSplitUnit': {'num': '.',
'unit': '万'},
'movieInfo': {'movieId': 1479534,
'movieName': '头脑特工队2',
'releaseInfo': '上映5天'},
'showCount': 57860,
'showCountRate': '15.4%',
'splitBoxRate': '14.9%',
'splitBoxSplitUnit': {'num': '.',
'unit': '万'},
'sumBoxDesc': '8864.1万',
'sumSplitBoxDesc': '7992.7万'},
…………………………
{'avgSeatView': '0.3%',
'avgShowView': '0.2',
'boxRate': '<0.1%',
'boxSplitUnit': {'num': '.', 'unit': '万'},
'movieInfo': {'movieId': 1469891,
'movieName': '阿搭嫂(戏曲 高甲戏)',
'releaseInfo': '上映首日'},
'showCount': 6,
'showCountRate': '<0.1%',
'splitBoxRate': '<0.1%',
'splitBoxSplitUnit': {'num': '.', 'unit': '万'},
'sumBoxDesc': '33',
'sumSplitBoxDesc': '30'}]
5.获取自己想要的电影信息
代码如下:
# Every row goes to the same file, so set the path once and write the whole
# batch in a single call instead of reopening the CSV for each movie.
path = "../data/A猫眼电影综合票房.csv"
data_res = [
    {
        '电影名称': cbo_data['movieInfo']['movieName'],    # movie title
        '上映时间': cbo_data['movieInfo']['releaseInfo'],  # release info
        '上座率': cbo_data['avgSeatView'],                 # seat occupancy rate
        '场均人次': cbo_data['avgShowView'],               # average viewers per showing
        '票房占比': cbo_data['boxRate'],                   # box office share
        '综合票房': cbo_data['sumBoxDesc'],                # comprehensive box office
        '排片场次': cbo_data['showCount'],                 # number of showings
        '排片占比': cbo_data['showCountRate'],             # share of showings
    }
    for cbo_data in data
]
# Skip the write entirely when the response contained no records.
if data_res:
    append_data_to_csv(data_res, path)
for row in data_res:
    print(row["电影名称"], "成功写入!")
全部代码
上面以获取电影信息为例展示代码,下面是获取该网页下的不同信息的代码,原理与上面一致,重复率很高,基本上就是修改url以及变量名。
#导入包
import requests
import csv
from fake_useragent import UserAgent # ⽣成随机User-Agent以避免被识别为⾃动化脚本
import json
def append_data_to_csv(data, path):
    """Append a batch of row dicts to a CSV file, writing the header once.

    Args:
        data: List of dicts that share the same keys. The keys of the first
            dict become the CSV column names. An empty list is a no-op.
        path: Destination CSV file. It is opened in append mode as UTF-8 and
            its parent directory is created when missing.
    """
    # Local import so the function does not depend on extra file-level imports.
    import os

    # Nothing to write; this also avoids an IndexError on data[0] below.
    if not data:
        return

    # Create the target directory on first use so open() does not fail.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "a", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(data[0]))
        # A position of 0 right after opening in append mode means the file
        # is still empty, so the header row has not been written yet.
        if csv_file.tell() == 0:
            writer.writeheader()
        writer.writerows(data)
# Fetch the comprehensive box office data and append it to a CSV file.
# Use a random User-Agent so the request looks like it comes from a regular browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}
# NOTE(review): the query string was captured from the browser's Network panel;
# timeStamp/signKey presumably expire, so the URL may need refreshing - confirm.
url = 'https://piaofang.maoyan.com/dashboard-ajax?orderType=0&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717473044447&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=647&channelId=40009&sVersion=2&signKey=5de146e3635549eaa1a765c2ac929c97'
# Always pass a timeout so a stalled connection cannot hang the script.
response = requests.get(url, headers=headers, timeout=10)
if response.ok:
    print(response.status_code)
    # Decode the JSON body and keep only the list of per-movie records.
    html = response.text
    data = json.loads(html)["movieList"]["data"]["list"]
    # Every row goes to the same file, so build the batch first and write it
    # in a single call instead of reopening the CSV for each movie.
    path = "../data/A猫眼电影综合票房.csv"
    data_res = [
        {
            '电影名称': cbo_data['movieInfo']['movieName'],    # movie title
            '上映时间': cbo_data['movieInfo']['releaseInfo'],  # release info
            '上座率': cbo_data['avgSeatView'],                 # seat occupancy rate
            '场均人次': cbo_data['avgShowView'],               # average viewers per showing
            '票房占比': cbo_data['boxRate'],                   # box office share
            '综合票房': cbo_data['sumBoxDesc'],                # comprehensive box office
            '排片场次': cbo_data['showCount'],                 # number of showings
            '排片占比': cbo_data['showCountRate'],             # share of showings
        }
        for cbo_data in data
    ]
    # Skip the write entirely when the response contained no records.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["电影名称"], "成功写入!")
else:
    # Do not try to parse an error response; just report the failure.
    print(response.status_code, "请求失败")
# Fetch the network broadcast heat data and append it to a CSV file.
# Use a random User-Agent so the request looks like it comes from a regular browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}
# NOTE(review): the query string was captured from the browser's Network panel;
# showDate/timeStamp/signKey presumably expire, so the URL may need refreshing - confirm.
url = 'https://piaofang.maoyan.com/dashboard/webHeatData?showDate=20240604&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717472862330&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=213&channelId=40009&sVersion=2&signKey=cb852978f92bcd860cce84b94f6df03a'
# Always pass a timeout so a stalled connection cannot hang the script.
response = requests.get(url, headers=headers, timeout=10)
if response.ok:
    print(response.status_code)
    # Decode the JSON body and keep only the list of per-series records.
    html = response.text
    data = json.loads(html)['dataList']['list']
    path = "../data/B猫眼电影网播热度.csv"
    data_res = []
    for nbh_data in data:
        series_info = nbh_data['seriesInfo']
        # The play count block may be missing or null; fall back to an empty dict.
        play_count = nbh_data.get('playCountSplitUnit') or {}
        data_res.append({
            '剧名': series_info['name'],                # series title
            '播放资源平台': series_info['platformDesc'],  # streaming platform
            '上线天数': series_info['releaseInfo'],      # days since release
            # real-time play count: number followed by its unit
            '实时播放量': str(play_count.get('num', '')) + str(play_count.get('unit', '')),
            '实时热度': nbh_data['currHeatDesc'],        # real-time heat
        })
    # Write the whole batch once instead of reopening the CSV for each row.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["剧名"], "成功写入!")
else:
    # Do not try to parse an error response; just report the failure.
    print(response.status_code, "请求失败")
# Fetch the TV ratings data and append it to a CSV file.
# Use a random User-Agent so the request looks like it comes from a regular browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}
# NOTE(review): the query string was captured from the browser's Network panel;
# timeStamp/signKey presumably expire, so the URL may need refreshing - confirm.
url = 'https://piaofang.maoyan.com/dashboard/getTVData?startTime=00:00&endTime=24:00&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717470894813&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=624&channelId=40009&sVersion=2&signKey=40202fce4a917830ac28191498a1a45f'
# Always pass a timeout so a stalled connection cannot hang the script.
response = requests.get(url, headers=headers, timeout=10)
if response.ok:
    print(response.status_code)
    # Decode the JSON body and keep only the list of per-programme records.
    html = response.text
    data = json.loads(html)['tvListAll']['data']['list']
    path = "../data/C猫眼电影电视收视.csv"
    data_res = [
        {
            '剧名': TVratings['programmeName'],           # programme title
            '实时收视率': TVratings['attentionRateDesc'],  # real-time rating
            '市占率': TVratings['marketRateDesc'],        # market share
        }
        for TVratings in data
    ]
    # Write the whole batch once instead of reopening the CSV for each row.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["剧名"], "成功写入!")
else:
    # Do not try to parse an error response; just report the failure.
    print(response.status_code, "请求失败")
# Fetch the TV series heat data and append it to a CSV file.
# Use a random User-Agent so the request looks like it comes from a regular browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}
# NOTE(review): the query string was captured from the browser's Network panel;
# showDate/timeStamp/signKey presumably expire, so the URL may need refreshing - confirm.
url = 'https://piaofang.maoyan.com/dashboard/webHeatData?showDate=20240605&seriesType=0&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717550798127&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=235&channelId=40009&sVersion=2&signKey=da4746ebaeaa2911ff2bc3cd85561d2a'
# Always pass a timeout so a stalled connection cannot hang the script.
response = requests.get(url, headers=headers, timeout=10)
if response.ok:
    print(response.status_code)
    # Decode the JSON body and keep only the list of per-series records.
    html = response.text
    data = json.loads(html)['dataList']['list']
    path = "../data/D猫眼电影电视剧热度.csv"
    data_res = []
    for nbh_data in data:
        series_info = nbh_data['seriesInfo']
        # The play count block may be missing or null; fall back to an empty dict.
        play_count = nbh_data.get('playCountSplitUnit') or {}
        data_res.append({
            '剧名': series_info['name'],                # series title
            '播放资源平台': series_info['platformDesc'],  # streaming platform
            '上线天数': series_info['releaseInfo'],      # days since release
            # real-time play count: number followed by its unit
            '实时播放量': str(play_count.get('num', '')) + str(play_count.get('unit', '')),
            '实时热度': nbh_data['currHeatDesc'],        # real-time heat
        })
    # Write the whole batch once instead of reopening the CSV for each row.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["剧名"], "成功写入!")
else:
    # Do not try to parse an error response; just report the failure.
    print(response.status_code, "请求失败")
# Fetch the web series heat data and append it to a CSV file.
# Use a random User-Agent so the request looks like it comes from a regular browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}
# NOTE(review): the query string was captured from the browser's Network panel;
# showDate/timeStamp/signKey presumably expire, so the URL may need refreshing - confirm.
url = 'https://piaofang.maoyan.com/dashboard/webHeatData?showDate=20240605&seriesType=1&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717551331397&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=127&channelId=40009&sVersion=2&signKey=57bf729c276d4207002252c4869f6e15'
# Always pass a timeout so a stalled connection cannot hang the script.
response = requests.get(url, headers=headers, timeout=10)
if response.ok:
    print(response.status_code)
    # Decode the JSON body and keep only the list of per-series records.
    html = response.text
    data = json.loads(html)['dataList']['list']
    path = "../data/E猫眼电影网络剧热度.csv"
    data_res = []
    for nbh_data in data:
        series_info = nbh_data['seriesInfo']
        # The play count block may be missing or null; fall back to an empty dict.
        play_count = nbh_data.get('playCountSplitUnit') or {}
        data_res.append({
            '剧名': series_info['name'],                # series title
            '播放资源平台': series_info['platformDesc'],  # streaming platform
            '上线天数': series_info['releaseInfo'],      # days since release
            # real-time play count: number followed by its unit
            '实时播放量': str(play_count.get('num', '')) + str(play_count.get('unit', '')),
            '实时热度': nbh_data['currHeatDesc'],        # real-time heat
        })
    # Write the whole batch once instead of reopening the CSV for each row.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["剧名"], "成功写入!")
else:
    # Do not try to parse an error response; just report the failure.
    print(response.status_code, "请求失败")
山海鲸可视化
最后使用山海鲸可视化,把数据导入后做了一个界面,如下图所示(不需要前端基础也可以做)。
最后
这里面没有什么讲解内容,主要就是展示一下该项目,有需要的朋友可以自行拿去用,需要什么copy什么,谢谢观看。