import numpy as np
import pandas as pd
import jieba
import wordcloud
from scipy.misc import imread
import matplotlib.pyplot as plt
from pylab import mpl
# --- Plot configuration, working directory and data loading ---------------

# Use SimHei as the default sans-serif font so Chinese labels can be drawn.
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Draw the minus sign as an ASCII hyphen. The original line only read this
# setting (a no-op); the assignment was missing.
# NOTE(review): presumably needed because SimHei lacks the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False

import matplotlib.style as psl
# Set the plotting style.
psl.use('bmh')

import warnings
# Suppress all warnings.
warnings.filterwarnings('ignore')
print('导入完成!')

import os
# Switch to the working directory; every relative path below (input CSV,
# exported spreadsheets, saved figures) is resolved against it.
os.chdir('E:/BaiduNetdiskDownload/大鹏教你玩数据粉丝分享-包含历史期次/学员分享-票房-大鹏教你玩数据2019-01期/大鹏教你玩数据第一期_代码资料/')

# Load the scraped box-office table and preview the first rows.
data = pd.read_csv('中国票房数据爬取测试20071-20192.csv', engine='python')
print(data.head())
# --- Weekly box office of films that reached rank 1 -----------------------
# For every film that held rank 1, take its best weekly box office and sort
# descending. The result is a one-column frame indexed by film name.
dataTop1_week = (
    data.loc[data['排名'] == 1, ['电影名', '周票房']]
    .groupby('电影名')['周票房']
    .max()
    .sort_values(ascending=False)
    .to_frame()
)

# Export the ranking. The index holds the film names, so it must be written;
# the original passed index=0, which exported the numbers without any names.
dataTop1_week.to_excel('d01周票房TOP1排名.xlsx')
print('导出完成')

# Horizontal bar chart of the top 20; reversed so the largest bar is on top.
dataTop1_week.head(20).iloc[::-1].plot.barh(figsize=(6, 10))
plt.title('周票房TOP20分析')
plt.savefig('p01周票房TOP20分析.png', dpi=400)
print(dataTop1_week.head())
# --- Cumulative box office of films that reached rank 1 -------------------
# For every film that held rank 1, take its highest recorded cumulative box
# office and sort descending. The result is indexed by film name.
dataTop1_sum = (
    data.loc[data['排名'] == 1, ['电影名', '总票房']]
    .groupby('电影名')['总票房']
    .max()
    .sort_values(ascending=False)
    .to_frame()
)

# Export the ranking. The index holds the film names, so it must be written;
# the original passed index=0, which exported the numbers without any names.
dataTop1_sum.to_excel('d02总票房TOP1排名.xlsx')
print('导出完成')

# Horizontal bar chart of the top 20; reversed so the largest bar is on top.
dataTop1_sum.head(20).iloc[::-1].plot.barh(figsize=(6, 10), color='green')
plt.title('总票房TOP20分析')
plt.savefig('p02总票房TOP20分析.png', dpi=400)
# A bare expression shows nothing when run as a script, so print the preview.
print(dataTop1_sum.head())
# --- Number of chart appearances per film ---------------------------------
# Count how many rows (chart entries) each film has, sorted descending.
# The result is a one-column frame ('频数') indexed by film name.
data_count = (
    data.groupby('电影名')['排名']
    .count()
    .sort_values(ascending=False)
    .rename('频数')
    .to_frame()
)

# Export the counts. The index holds the film names, so it must be written;
# the original passed index=0, which exported the counts without any names.
data_count.to_excel('d03上榜次数TOP10.xlsx')
print('导出完成')

# Plot the appearance counts. The original plotted dataTop1_sum here, a
# copy-paste slip that drew cumulative box office under this title.
data_count.head(10).plot(kind='bar', figsize=(12, 4), color='orange')
plt.title('上榜次数TOP10')
plt.savefig('p03上榜次数TOP10.png', dpi=400)
# A bare expression shows nothing when run as a script, so print the preview.
print(data_count.head())
# --- Weekly chart totals over time ----------------------------------------
# Work on a copy so the column assignment below does not target a slice of
# `data`.
dataTime = data[['票房日期', '单周总票房', '单周总场次', '单周总人次']].copy()

# '票房日期' is a range written as "<start>至<end>"; keep the start date and
# parse it, so sorting is chronological rather than lexicographic.
dataTime['票房日期'] = pd.to_datetime(
    dataTime['票房日期'].str.split('至').str[0],
    format="%Y/%m/%d",
)

# Sort and de-duplicate while the date is still an ordinary column, then
# promote it to the index. The original set the index first and sorted by a
# label that was both index name and column, which pandas rejects as ambiguous.
dataTime = (
    dataTime.sort_values(by='票房日期')
    .drop_duplicates()
    .set_index('票房日期')
)

# Export with the date index; the original passed index=0, which dropped it.
dataTime.to_excel('d04票房变化.xlsx')
print('导出完成')

dataTime['单周总票房'].plot(
    kind='line',
    figsize=(12, 4),
    title='2008-2019单周榜单总票房',
    alpha=0.8,
)
plt.savefig('p042008-2019单周榜单总票房.png', dpi=400)
# A bare expression shows nothing when run as a script, so print the preview.
print(dataTime.head())
# --- Word-frequency analysis: load the stop-word list ---------------------
path = 'E:/BaiduNetdiskDownload/大鹏教你玩数据粉丝分享-包含历史期次/学员分享-票房-大鹏教你玩数据2019-01期/大鹏教你玩数据第一期_代码资料/停用词.txt'

# Open the file explicitly as UTF-8 (the original author notes it could not
# be read otherwise); the context manager closes the handle afterwards.
with open(path, encoding='utf-8') as f:
    stop_word = pd.read_csv(f)

# Rename the two columns; only 'key' is used.
stop_word.columns = ['key', '']
stop_list = stop_word['key'].tolist()
print(stop_list[:5])
def txt_cut(f):
    """Segment text with jieba and drop stop words.

    Args:
        f: The text to segment; passed straight to ``jieba.cut``.

    Returns:
        A list of the tokens produced by ``jieba.cut`` that are not in the
        module-level ``stop_list``, in their original order.
    """
    return [w for w in jieba.cut(f) if w not in stop_list]
# Tokenise every non-null film name and collect all tokens in one flat list.
word_list = []
for line in data['电影名'].dropna():
    word_list.extend(txt_cut(line))
print(word_list[:5])

# Frequency of each token. value_counts() already sorts by descending count,
# so the 20 most frequent tokens are simply the first 20 entries.
word_count = pd.Series(word_list).value_counts().head(20)
# A bare expression shows nothing when run as a script, so print the result.
print(word_count)
# --- Bar chart of the 20 most frequent tokens -----------------------------
import seaborn as sns

fig = plt.figure(figsize=(12, 5))
x = word_count.index.tolist()
y = word_count.values.tolist()
# Pass x and y by keyword: recent seaborn releases no longer accept them
# positionally, and the keyword form also works on older releases.
sns.barplot(x=x, y=y, palette="BuPu_r")
plt.title('词频Top20')
plt.ylabel('count')
sns.despine(bottom=True)
plt.savefig('p05词频分析.png', dpi=400)
# --- Word cloud of film-name tokens ---------------------------------------
# NOTE(review): `imread` comes from scipy.misc, which newer SciPy releases no
# longer provide -- confirm the installed version or switch image readers.
fig = plt.figure(figsize=(10, 10))

# 'background.png' is the shape mask. mode='RGBA' together with
# background_color=None requests a transparent background.
cloud = wordcloud.WordCloud(
    font_path='SimHei.TTF',
    mask=imread('background.png'),
    mode='RGBA',
    background_color=None,
).generate(' '.join(word_list))

# Recolour the words using the colours of 'color.jpg'.
img = imread('color.jpg')
cloud_colors = wordcloud.ImageColorGenerator(np.array(img))
cloud.recolor(color_func=cloud_colors)

plt.imshow(cloud)
plt.axis('off')
plt.savefig('p06wordcloud.png', dpi=800)