猫眼电影专业版数据爬取

猫眼电影专业版数据爬取

最近学习了一点简单的爬虫,利用山海鲸APP做了一个简单的可视化界面,接下来把我的内容分享给大家。

1.导入包

以下是这次爬取数据需要用到的包,当然也可以使用 XPath 或者 BeautifulSoup,凭个人喜好来。

import requests 
import csv
from fake_useragent import UserAgent # ⽣成随机User-Agent以避免被识别为⾃动化脚本
import json

2.关于函数

这里存储数据的格式为 .csv。因为我的代码没有进行优化,重复性很高,因此封装了一个存储数据的函数,如下所示。

def append_data_to_csv(data, path):
    """Append row dicts to a CSV file, writing the header if the file is empty.

    Args:
        data: List of dicts, one per row. The keys of the first dict are
            used as the column names.
        path: Destination CSV file. It is opened in append mode and its
            parent directory is created when it does not exist yet.

    Returns:
        None. When ``data`` is empty nothing is written and no file is
        created.
    """
    import os  # local import so this function does not depend on file-level imports

    # Nothing to write; also avoids an IndexError on data[0] below.
    if not data:
        return

    # Opening in append mode fails if the target directory is missing.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "a", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(data[0].keys()))

        # Position 0 in append mode means the file is empty, so it needs a header.
        if csv_file.tell() == 0:
            writer.writeheader()

        writer.writerows(data)

3.访问猫眼电影专业版

这里访问的是动态网页,请求的 URL 是在浏览器开发者工具 Network 面板的 Fetch/XHR 中找到的,如下图所示。

在这里插入图片描述

代码如下:

# Fetch the comprehensive box office data.
# A random User-Agent makes the request look like it comes from an ordinary browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}

# NOTE(review): the uuid/timeStamp/signKey query parameters were captured from
# one browser session; presumably they expire -- confirm before reuse.
url = 'https://piaofang.maoyan.com/dashboard-ajax?orderType=0&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717473044447&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=647&channelId=40009&sVersion=2&signKey=5de146e3635549eaa1a765c2ac929c97'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)

# Report the HTTP status; a non-2xx status is flagged as a failed request.
if response.ok:
    print(response.status_code)
else:
    print(response.status_code, "请求失败")

4.获取电影票房信息

# Decode the JSON response body and keep only the list of per-movie
# box office records found under movieList -> data -> list.
html = response.text
data = json.loads(html)["movieList"]["data"]["list"]

data数据样式如下:

[{'avgSeatView': '2.9%',
  'avgShowView': '4.2',
  'boxRate': '53.3%',
  'boxSplitUnit': {'num': '.',
   'unit': '万'},
  'movieInfo': {'movieId': 1371016,
   'movieName': '云边有个小卖部',
   'releaseInfo': '上映4天'},
  'showCount': 115665,
  'showCountRate': '30.9%',
  'splitBoxRate': '53.3%',
  'splitBoxSplitUnit': {'num': '.',
   'unit': '万'},
  'sumBoxDesc': '1.97亿',
  'sumSplitBoxDesc': '1.78亿'},
 {'avgSeatView': '1.7%',
  'avgShowView': '2.3',
  'boxRate': '15.0%',
  'boxSplitUnit': {'num': '.',
   'unit': '万'},
  'movieInfo': {'movieId': 1479534,
   'movieName': '头脑特工队2',
   'releaseInfo': '上映5天'},
  'showCount': 57860,
  'showCountRate': '15.4%',
  'splitBoxRate': '14.9%',
  'splitBoxSplitUnit': {'num': '.',
   'unit': '万'},
  'sumBoxDesc': '8864.1万',
  'sumSplitBoxDesc': '7992.7万'},
  …………………………
 {'avgSeatView': '0.3%',
  'avgShowView': '0.2',
  'boxRate': '<0.1%',
  'boxSplitUnit': {'num': '&#xf726;.&#xf726;&#xf726;', 'unit': '万'},
  'movieInfo': {'movieId': 1469891,
   'movieName': '阿搭嫂(戏曲 高甲戏)',
   'releaseInfo': '上映首日'},
  'showCount': 6,
  'showCountRate': '<0.1%',
  'splitBoxRate': '<0.1%',
  'splitBoxSplitUnit': {'num': '&#xf726;.&#xf726;&#xf726;', 'unit': '万'},
  'sumBoxDesc': '33',
  'sumSplitBoxDesc': '30'}]

5.获取自己想要的电影信息

代码如下:

# Build one CSV row per movie, then append them all in a single write
# instead of reopening the file for every row.
path = "../data/A猫眼电影综合票房.csv"
data_res = []
for cbo_data in data:
    movie_info = cbo_data['movieInfo']
    data_res.append({
        '电影名称': movie_info['movieName'],       # movie title
        '上映时间': movie_info['releaseInfo'],     # release info
        '上座率': cbo_data['avgSeatView'],         # seat occupancy rate
        '场均人次': cbo_data['avgShowView'],       # average audience per showing
        '票房占比': cbo_data['boxRate'],           # share of box office
        '综合票房': cbo_data['sumBoxDesc'],        # comprehensive box office
        '排片场次': cbo_data['showCount'],         # number of showings
        '排片占比': cbo_data['showCountRate'],     # share of showings
    })

# The CSV helper derives its header from the first row, so skip it when empty.
if data_res:
    append_data_to_csv(data_res, path)
for row in data_res:
    print(row["电影名称"], "成功写入!")

全部代码

上面以获取电影信息为例展示代码,下面是获取该网页下的不同信息的代码,原理与上面一致,重复率很高,基本上就是修改url以及变量名。

#导入包
import requests 
import csv
from fake_useragent import UserAgent # ⽣成随机User-Agent以避免被识别为⾃动化脚本
import json

def append_data_to_csv(data, path):
    """Append row dicts to a CSV file, writing the header if the file is empty.

    Args:
        data: List of dicts, one per row. The keys of the first dict are
            used as the column names.
        path: Destination CSV file. It is opened in append mode and its
            parent directory is created when it does not exist yet.

    Returns:
        None. When ``data`` is empty nothing is written and no file is
        created.
    """
    import os  # local import so this function does not depend on file-level imports

    # Nothing to write; also avoids an IndexError on data[0] below.
    if not data:
        return

    # Opening in append mode fails if the target directory is missing.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "a", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(data[0].keys()))

        # Position 0 in append mode means the file is empty, so it needs a header.
        if csv_file.tell() == 0:
            writer.writeheader()

        writer.writerows(data)

# Fetch the comprehensive box office data and append it to a CSV file.
# A random User-Agent makes the request look like it comes from an ordinary browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}

url = 'https://piaofang.maoyan.com/dashboard-ajax?orderType=0&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717473044447&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=647&channelId=40009&sVersion=2&signKey=5de146e3635549eaa1a765c2ac929c97'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)

if response.ok:
    print(response.status_code)

    # Keep only the per-movie records under movieList -> data -> list.
    html = response.text
    data = json.loads(html)["movieList"]["data"]["list"]

    # Build every row first so the CSV file is opened once, not once per movie.
    path = "../data/A猫眼电影综合票房.csv"
    data_res = []
    for cbo_data in data:
        movie_info = cbo_data['movieInfo']
        data_res.append({
            '电影名称': movie_info['movieName'],       # movie title
            '上映时间': movie_info['releaseInfo'],     # release info
            '上座率': cbo_data['avgSeatView'],         # seat occupancy rate
            '场均人次': cbo_data['avgShowView'],       # average audience per showing
            '票房占比': cbo_data['boxRate'],           # share of box office
            '综合票房': cbo_data['sumBoxDesc'],        # comprehensive box office
            '排片场次': cbo_data['showCount'],         # number of showings
            '排片占比': cbo_data['showCountRate'],     # share of showings
        })

    # The CSV helper derives its header from the first row, so skip it when empty.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["电影名称"], "成功写入!")
else:
    # A failed response has no usable payload, so parsing is skipped.
    print(response.status_code, "请求失败")


# Fetch the web broadcast heat data and append it to a CSV file.
# A random User-Agent makes the request look like it comes from an ordinary browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}

url = 'https://piaofang.maoyan.com/dashboard/webHeatData?showDate=20240604&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717472862330&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=213&channelId=40009&sVersion=2&signKey=cb852978f92bcd860cce84b94f6df03a'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)

if response.ok:
    print(response.status_code)

    # Keep only the per-series records under dataList -> list.
    html = response.text
    data = json.loads(html)['dataList']['list']

    # Build every row first so the CSV file is opened once, not once per series.
    path = "../data/B猫眼电影网播热度.csv"
    data_res = []
    for nbh_data in data:
        series_info = nbh_data['seriesInfo']
        # The play count is optional; a missing or null entry yields an empty string.
        play_count = nbh_data.get('playCountSplitUnit') or {}
        data_res.append({
            '剧名': series_info['name'],                  # series title
            '播放资源平台': series_info['platformDesc'],  # streaming platform
            '上线天数': series_info['releaseInfo'],       # days since release
            '实时播放量': f"{play_count.get('num', '')}{play_count.get('unit', '')}",  # live play count
            '实时热度': nbh_data['currHeatDesc'],         # live heat
        })

    # The CSV helper derives its header from the first row, so skip it when empty.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["剧名"], "成功写入!")
else:
    # A failed response has no usable payload, so parsing is skipped.
    print(response.status_code, "请求失败")


# Fetch the TV ratings data and append it to a CSV file.
# A random User-Agent makes the request look like it comes from an ordinary browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}

url = 'https://piaofang.maoyan.com/dashboard/getTVData?startTime=00:00&endTime=24:00&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717470894813&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=624&channelId=40009&sVersion=2&signKey=40202fce4a917830ac28191498a1a45f'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)

if response.ok:
    print(response.status_code)

    # Keep only the per-programme records under tvListAll -> data -> list.
    html = response.text
    data = json.loads(html)['tvListAll']['data']['list']

    # Build every row first so the CSV file is opened once, not once per programme.
    path = "../data/C猫眼电影电视收视.csv"
    data_res = [
        {
            '剧名': TVratings['programmeName'],            # programme title
            '实时收视率': TVratings['attentionRateDesc'],  # live audience rating
            '市占率': TVratings['marketRateDesc'],         # market share
        }
        for TVratings in data
    ]

    # The CSV helper derives its header from the first row, so skip it when empty.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["剧名"], "成功写入!")
else:
    # A failed response has no usable payload, so parsing is skipped.
    print(response.status_code, "请求失败")


# Fetch the TV drama heat data (seriesType=0) and append it to a CSV file.
# A random User-Agent makes the request look like it comes from an ordinary browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}

url = 'https://piaofang.maoyan.com/dashboard/webHeatData?showDate=20240605&seriesType=0&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717550798127&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=235&channelId=40009&sVersion=2&signKey=da4746ebaeaa2911ff2bc3cd85561d2a'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)

if response.ok:
    print(response.status_code)

    # Keep only the per-series records under dataList -> list.
    html = response.text
    data = json.loads(html)['dataList']['list']

    # Build every row first so the CSV file is opened once, not once per series.
    path = "../data/D猫眼电影电视剧热度.csv"
    data_res = []
    for nbh_data in data:
        series_info = nbh_data['seriesInfo']
        # The play count is optional; a missing or null entry yields an empty string.
        play_count = nbh_data.get('playCountSplitUnit') or {}
        data_res.append({
            '剧名': series_info['name'],                  # series title
            '播放资源平台': series_info['platformDesc'],  # streaming platform
            '上线天数': series_info['releaseInfo'],       # days since release
            '实时播放量': f"{play_count.get('num', '')}{play_count.get('unit', '')}",  # live play count
            '实时热度': nbh_data['currHeatDesc'],         # live heat
        })

    # The CSV helper derives its header from the first row, so skip it when empty.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["剧名"], "成功写入!")
else:
    # A failed response has no usable payload, so parsing is skipped.
    print(response.status_code, "请求失败")


# Fetch the web drama heat data (seriesType=1) and append it to a CSV file.
# A random User-Agent makes the request look like it comes from an ordinary browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}

url = 'https://piaofang.maoyan.com/dashboard/webHeatData?showDate=20240605&seriesType=1&uuid=18fe0a801fec8-0da5ff1dd65145-4c657b58-1bcab9-18fe0a801fec8&timeStamp=1717551331397&User-Agent=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEyNS4wLjAuMCBTYWZhcmkvNTM3LjM2IEVkZy8xMjUuMC4wLjA%3D&index=127&channelId=40009&sVersion=2&signKey=57bf729c276d4207002252c4869f6e15'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)

if response.ok:
    print(response.status_code)

    # Keep only the per-series records under dataList -> list.
    html = response.text
    data = json.loads(html)['dataList']['list']

    # Build every row first so the CSV file is opened once, not once per series.
    path = "../data/E猫眼电影网络剧热度.csv"
    data_res = []
    for nbh_data in data:
        series_info = nbh_data['seriesInfo']
        # The play count is optional; a missing or null entry yields an empty string.
        play_count = nbh_data.get('playCountSplitUnit') or {}
        data_res.append({
            '剧名': series_info['name'],                  # series title
            '播放资源平台': series_info['platformDesc'],  # streaming platform
            '上线天数': series_info['releaseInfo'],       # days since release
            '实时播放量': f"{play_count.get('num', '')}{play_count.get('unit', '')}",  # live play count
            '实时热度': nbh_data['currHeatDesc'],         # live heat
        })

    # The CSV helper derives its header from the first row, so skip it when empty.
    if data_res:
        append_data_to_csv(data_res, path)
    for row in data_res:
        print(row["剧名"], "成功写入!")
else:
    # A failed response has no usable payload, so parsing is skipped.
    print(response.status_code, "请求失败")

山海鲸可视化

最后使用山海鲸可视化,把数据导入后做了一个界面,如下图所示。(不需要前端基础也可以做)

在这里插入图片描述

最后

这里面没有什么讲解内容,主要就是展示一下该项目,有需要的朋友可以自行拿去用,需要什么copy什么,谢谢观看。

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值