Python大数据分析挖掘和影评分析

最新推荐文章于 2024-02-08 17:30:00 发布

Doctor.CG

最新推荐文章于 2024-02-08 17:30:00 发布

阅读量163

点赞数

文章标签： python 数据分析 开发语言

本文链接：https://blog.csdn.net/m0_52501209/article/details/134133364

版权

#coding:utf-8
import pandas as pd
import datetime
# Compute how many days 《冲上云霄》 was showing and its average daily box
# office, then write both numbers to /data/workspace/box.txt as "days,average".

# Load the raw film log. Fields are separated by ';'. Because `names` is given
# without `header`, pandas treats every line of the file as data.
# Column labels, in order: film title, release time, end time, company,
# director, lead actors, genre, box office, city.
df = pd.read_csv(
    '/data/workspace/film_log.csv',
    delimiter=';',
    encoding='utf-8',
    names=[u'电影名称', u'上线时间', u'下线时间', u'公司', u'导演', u'主演',
           u'类型', u'票房', u'城市']
)

# Keep only the rows for 《冲上云霄》 and only the columns used below.
ans0301 = df.loc[
    df[u'电影名称'] == u'《冲上云霄》',
    [u'电影名称', u'上线时间', u'下线时间', u'票房', u'城市']
]

# Drop exact duplicate rows and renumber the index from 0.
ans0301 = ans0301.drop_duplicates().reset_index(drop=True)

# The raw box-office text is split on the full-width '）'; the piece right
# after the first '）' is taken as the number and converted to float.
ans0301[u'票房'] = ans0301[u'票房'].str.split(u'）').str[1].astype(float)

# Parse both time columns so they can be compared and subtracted.
ans0301[u'上线时间'] = pd.to_datetime(ans0301[u'上线时间'])
ans0301[u'下线时间'] = pd.to_datetime(ans0301[u'下线时间'])

# Span from the earliest release time to the latest end time; the +1 counts
# both endpoint days.
day = (ans0301[u'下线时间'].max() - ans0301[u'上线时间'].min()).days + 1

box = ans0301[u'票房'].sum()  # total box office over all remaining rows
avg_box = box / day  # average box office per day

# Use a context manager so the file is closed even if the write raises.
# NOTE: the 'L' length modifier in '%Ld' is accepted and ignored by Python's
# %-formatting, so it behaves like '%d'; the literal is kept unchanged.
with open('/data/workspace/box.txt', 'w') as f:
    f.write('%Ld,%0.6f' % (day, avg_box))

#coding:utf-8
import pandas as pd
import datetime
import matplotlib.pyplot as plt
# Load the raw film log: fields are split on ';', the file is decoded as
# UTF-8, and the column labels are supplied through `names`.
# Column labels, in order: film title, release time, end time, company,
# director, lead actors, genre, box office, city.
df=pd.read_csv('/data/workspace/film_log.csv',delimiter=';',encoding='utf-8',names=[u'电影名称',u'上线时间',u'下线时间',u'公司',u'导演',u'主演',u'类型',u'票房',u'城市'])
# (debug, disabled) print df[df.isnull().values]
film=[u'《冲上云霄》',u'《少年班》',u'《紫霞》'] # the three film titles to process
# Empty DataFrame with a single '票房' (box office) column.
# NOTE(review): presumably filled with per-film results further down - confirm.
dfz=pd.DataFrame(columns=[u'票房'])
# NOTE(review): presumably collects each film's total box office - confirm against the loop below.
zong_box=[]
for k in range(len(film)):
ans0302=df[df[u'电影名称']==film[k]].loc[:,[u'电影名称',u'上线时间',u'下线时间',u'票房',u'城市']] #筛选三部影片数据
ans0302 = ans0302.drop_duplicates().reset_index().drop('index', axis=1) #去重清洗
ans0302[u'票房'] = ans0302[u'票房'].str.split(u'）').str[1].astype(float) #清洗票房列数据。且转为float类型
ans0302[u'上线时间'] = pd.to_datetime(ans0302[u'上线时间']) #将时间列转换类时间类型
ans0302[u'下线时间'] = pd.to_datetime(ans0302[u'下线时间'])
day = (ans0302[u'下线时间'