Python分析《黄金兄弟》，到底是靠实力还是打情怀？？

本文链接：https://blog.csdn.net/ljlj8888/article/details/82838212

由“古惑仔”原班人马主演的《黄金兄弟》在21号上映了，相信《古惑仔》系列电影在我们心中留下了许多深刻的印象，上映五天票房居首。

但在豆瓣上却只有5.3的评分

由“古惑仔”原班人马主演的一部全新电影，还是会引起许多争议，就好像不久前的《爱情公寓》。

到底是靠实力，还是打情怀？？我模仿、借鉴网上的同类型的技术帖，用Python爬取了猫眼电影的5000多条评论，分析《黄金兄弟》到底好不好看，用数据说话。

一、数据获取

import requests
import json
import random
import threading
import time

class MyThread(threading.Thread):
    """Thread that calls a two-argument callable when it runs.

    The callable is invoked as ``func(data1, data2)`` from ``run()``;
    whatever it returns is discarded.
    """

    def __init__(self, func, data1, data2):
        super().__init__()
        # Keep the callable and both arguments until the thread is started.
        self.func, self.data1, self.data2 = func, data1, data2

    def run(self):
        target = self.func
        target(self.data1, self.data2)

def get_one_page(url, timeout=10):
    """Fetch *url* with a desktop-browser User-Agent and return the body text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection,
            which would leave the calling worker thread hanging.

    Returns:
        The response text when the status code is 200, otherwise ``None``
        (after printing an error message).

    Raises:
        requests.exceptions.RequestException: Connection errors and timeouts
            are not caught here and propagate to the caller.
    """
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
    r = requests.get(url, headers=headers, timeout=timeout)
    if r.status_code == 200:
        return r.text
    print('网络链接错误！')
    return None

def parse_one_page(con):
    """Extract the comment records from one JSON response body.

    Args:
        con: JSON text whose top-level object holds the comments under the
            ``'cmts'`` key, or ``None``/empty when the download failed.

    Returns:
        A list of dicts with the keys ``nickName``, ``cityName``, ``time``,
        ``score`` and ``content``. A field missing from a record is returned
        as an empty string. The list is empty when *con* is empty/``None``
        or the payload has no ``'cmts'`` entry.

    Raises:
        json.JSONDecodeError: If *con* is non-empty but not valid JSON.
    """
    # A failed download hands us None; treat it as "no comments on this
    # page" instead of crashing inside json.loads.
    if not con:
        return []
    fields = ('nickName', 'cityName', 'time', 'score', 'content')
    data = json.loads(con).get('cmts') or []
    return [{key: item.get(key, '') for key in fields} for item in data]


def save_txt(i,j):
    """Download one page of comments and append it to the output text file.

    Intended for crawling the comments of the first five days after release.

    Args:
        i: Value for the ``offset`` query parameter (also printed as the
            page number).
        j: Last digit of the September 2018 day used in ``startTime``
            (the date is built as ``2018-09-2<j>``).

    Each comment is written as one ``time,cityName,nickName,score,content``
    line to ``E:/黄金兄弟.txt``. Uses the module-level ``lock`` so that
    concurrent workers do not interleave their output.
    """
    url = 'http://m.maoyan.com/mmdb/comments/movie/1204774.json?_v_=yes&offset='+str(i)+'&startTime=2018-09-2'+str(j)+'%2022%3A25%3A03'
    con = get_one_page(url)
    if con is None:
        # Download failed (already reported by get_one_page); nothing to save.
        return
    # Parse and format outside the lock: only printing and file access need
    # to be serialized, and an error here cannot leave the lock held.
    lines = [
        item['time'] + ',' + item['cityName'] + ',' + item['nickName'] + ','
        + str(item['score']) + ',' + item['content'] + '\n'
        for item in parse_one_page(con)
    ]
    # "with" releases the lock even if writing raises, so one failing worker
    # cannot deadlock all the others.
    with lock:
        print("正在保存2{0}号第{1}页" .format(j,i))
        with open('E:/黄金兄弟.txt', 'a', encoding='utf-8') as f:
            f.writelines(lines)

if __name__ == '__main__':
    # Shared by every worker; save_txt reads it as a module-level name.
    lock = threading.Lock()
    threads = []
    for day in range(1, 6):
        for offset in range(0, 1001):
            # One worker thread per (offset, day) request.
            worker = MyThread(save_txt, offset, day)
            threads.append(worker)
            worker.start()
        # Pause between days so the site does not block the crawler.
        time.sleep(15)
    # Wait for every worker to finish before the script exits.
    for worker in threads:
        worker.join()

由上面代码抓出了上万条信息，

但是，你会发现有很大部分的信息是重复的！！于是得进行数据去重处理。

二、数据去重处理

def que_con(get,out):
    """Copy *get* to *out*, keeping only the first occurrence of each line.

    Args:
        get: Path of the UTF-8 text file to read.
        out: Path of the UTF-8 text file to (over)write with the unique
            lines, in their original order.

    Prints the number of unique lines written.
    """
    seen = set()  # set membership keeps the de-duplication linear
    count = 0
    # Both files are managed by "with" so the output is flushed and closed
    # even if reading or writing fails part-way.
    with open(get,'r',encoding='utf-8') as gf, open(out,'w',encoding='utf-8') as of:
        for line in gf:
            if line not in seen:
                seen.add(line)
                of.write(line)
                count += 1
    print(count)
if __name__ == "__main__":
    # De-duplicate the raw crawl output into a new file.
    que_con(get="E:/黄金兄弟.txt", out="E:/New黄金兄弟.txt")

最后仅仅得到了5000多条数据。下面进行数据可视化。

三、数据可视化

1、粉丝分布图：

from pyecharts import Style
from pyecharts import Geo
import json

# Collect the city column (second field) of every well-formed record.
cities = []
newcities = []
with open('E:/New黄金兄弟.txt', mode='r', encoding='utf-8') as f:
    for row in f:
        fields = row.split(',')
        if len(fields) == 5:
            cities.append(fields[1].replace('\n', ''))

# Map the crawled city names onto the names used by pyecharts' coordinate
# table so that they can be plotted on the map.
# Raw string: the Windows path contains backslashes that must not be treated
# as escape sequences.
with open(r'D:\Python3.7.0\Lib\site-packages\pyecharts\datasets\city_coordinates.json', 'r', encoding='utf-8') as f:
    fdata = json.load(f)

# Drop empty names up front. Removing them from the list while iterating it
# skips elements, and an empty name would "prefix-match" the first key.
cities = [city for city in cities if city]
for city in cities:
    if city in fdata:
        # An exact match always wins over a prefix match.
        newcities.append(city)
        continue
    for k in fdata:
        # Otherwise take the first known name that starts with the crawled
        # name.
        if k.startswith(city):
            newcities.append(k)
            break

def all_list(arr):
    """Count how often each distinct element occurs in *arr*.

    Args:
        arr: Iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences; empty when *arr* is empty.
    """
    # Local import keeps this block self-contained; Counter tallies in a
    # single pass instead of rescanning the input once per distinct item.
    from collections import Counter

    return dict(Counter(arr))
# Count the cities once and turn the {city: count} mapping into the
# (city, count) pairs passed to geo.cast() below; the previous loop
# recomputed the whole tally for every city.
data = list(all_list(newcities).items())
print(data)
# Shared chart styling: centered white title on a dark background.
style = Style(
        title_color = "#fff",
        title_pos = "center",
        width = 1200,
        height = 600,
        background_color = "#404a59"
        )

geo = Geo("《黄金兄弟》粉丝人群地理位置","数据来源：猫***评论",**style.init_style)
# Split the pairs into parallel name / value sequences.
attr, value = geo.cast(data)
geo.add(
    "",
    attr,
    value,
    visual_range=[0, 200],
    visual_text_color="#fff",
    symbol_size=15,
    is_visualmap=True,
)
geo.render('地图分布.html')

可以看出还是北上广一带的用户相对较多，其中深圳最多。

2、星级饼图

猫眼评论分数为（0.5，1，1.5，2，2.5，3，3.5，4，4.5，5），我们以4.5，5为五星，以此类推。

from pyecharts import Pie
# Read the score column (fourth field) of every well-formed record.
scores = []
with open('E:/New黄金兄弟.txt', mode='r', encoding='utf-8') as f:
    rows = f.readlines()
for row in rows:
    parts = row.split(',')
    if len(parts) != 5:
        continue
    scores.append(parts[3].replace('\n', ''))

# Two adjacent score values make up one star level: 4.5 and 5 are five
# stars, 3.5 and 4 are four stars, and so on.
attr = ['一星', '二星', '三星', '四星', '五星']
v1, v2, v3, v4, v5 = (
    scores.count(low) + scores.count(high)
    for low, high in (('0.5', '1'), ('1.5', '2'), ('2.5', '3'), ('3.5', '4'), ('4.5', '5'))
)
value = [v1, v2, v3, v4, v5]
print(attr)
print(value)

pie = Pie("饼图-《黄金兄弟》星级图示例", title_pos='center', width=900)
pie.add(
    "",
    attr,
    value,
    center=[75, 50],
    is_random=True,
    radius=[30, 75],
    rosetype='area',
    is_legend_show=False,
    is_label_show=True,
)
pie.render('饼图.html')

由饼图可以看出，这部电影好评率还是挺高的，其中五星和四星就占了81.44%。

3、词云

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS
# Read the comment column (fifth field) of every well-formed record. The
# trailing newline becomes a Chinese comma so comments stay separated once
# they are concatenated.
comments = []
with open('E:/New黄金兄弟.txt', mode='r', encoding='utf-8') as f:
    rows = f.readlines()
    for row in rows:
        if len(row.split(','))==5:
            comments.append(row.split(',')[4].replace('\n','，'))

# Segment the actual comment text with jieba. Joining the strings (instead
# of str(list)) keeps list brackets, quotes and repr escape sequences out of
# the word statistics.
word_jie=jieba.lcut(''.join(comments), cut_all=False)
words=' '.join(word_jie)
# Filter out filler words and the film title itself so they do not dominate
# the cloud.
stopwords = STOPWORDS.copy()
stopwords.update([
    '电影', '一部', '一个', '没有', '什么', '有点',
    '这部', '这个', '不是', '真的', '感觉', '觉得',
    '还是', '但是', '就是', '黄金', '兄弟', '啊啊',
])
# Word cloud settings: canvas width/height, background colour, font file,
# stop words, maximum font size and a fixed random seed.
wc = WordCloud(width=1024, height=768, background_color='white', font_path='STKAITI.TTF',
               stopwords=stopwords, max_font_size=400, random_state=50)
# Feed the segmented text to the word cloud.
wc.generate_from_text(words)
plt.imshow(wc)
plt.axis('off')  # hide the axes
plt.show()
# Save the result to a local file.
wc.to_file('词云图.jpg')