import pandas as pd
import os
import numpy as np
import matplotlib. pyplot as plt
# Switch into the project folder so the relative CSV name below resolves.
os.chdir("E:\\code\\python_basic\\项目01_电影数据处理及分析实战\\")

# Load the raw table with pandas' pure-Python parser engine.
data = pd.read_csv('爱奇艺视频数据.csv', engine='python')

# Quick sanity check: show the first five rows (head()'s default count).
print(data.head(5))
def data_cleaning(df):
    """Fill the missing values of every column in ``df`` and return it.

    Text columns (object or pandas string dtype) receive the placeholder
    '缺失数据'; every other column receives 0.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same ``df`` object, with missing values filled.
    """
    for col in df.columns:
        dtype = df[col].dtype
        is_text = (pd.api.types.is_object_dtype(dtype)
                   or pd.api.types.is_string_dtype(dtype))
        fill_value = '缺失数据' if is_text else 0
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on df[col]: the chained in-place form acts
        # on a temporary object, which recent pandas versions warn about
        # and which leaves df untouched under Copy-on-Write.
        df[col] = df[col].fillna(fill_value)
    return df
# Fill the missing values of the raw table, then preview the first ten rows.
data_c1 = data_cleaning(df=data)
print(data_c1.head(n=10))
# Data cleaning - date label conversion: Chinese-format dates must be converted to a non-Chinese format, e.g. 2016年5月24日 → 2016.5.24
def data_time(df, col):
    """Turn the Chinese-style date strings in ``df[col]`` into datetimes.

    The year and month markers are rewritten as '.' separators and the day
    marker is removed (e.g. '2016年5月24日' becomes '2016.5.24'); the result
    is then parsed with ``pd.to_datetime``.

    Args:
        df: DataFrame holding the date column. It is modified in place.
        col: Name of the column containing the date strings.

    Returns:
        The same ``df`` object, with ``col`` converted.
    """
    # (marker, replacement) pairs, applied in this order.
    substitutions = (('年', '.'), ('月', '.'), ('日', ''))
    for marker, replacement in substitutions:
        df[col] = df[col].str.replace(marker, replacement)
    df[col] = pd.to_datetime(df[col])
    return df
# Convert the data-acquisition-date column to datetimes, then preview
# the first ten rows of the result.
data_c2 = data_time(df=data_c1, col='数据获取日期')
print(data_c2.head(n=10))
# Q1: per director, total the '好评数' and '评分人数' columns.
df_q1 = data_c2.groupby('导演')[['好评数', '评分人数']].sum()

# '好评率' is the ratio of the two totals.
df_q1['好评率'] = df_q1['好评数'].div(df_q1['评分人数'])

# Keep the 20 directors with the highest ratio.
result_q1 = df_q1.sort_values(by='好评率', ascending=False).head(20)

# Bar chart of the top 20; the y-axis is limited to [0.98, 1].
result_q1['好评率'].plot(
    kind='bar',
    title='不同导演电影的好评率',
    figsize=(12, 4),
    ylim=[0.98, 1],
    grid=False,
    rot=45,
    alpha=0.1,
    width=0.8,
    color='k',
)
# Q2: yearly totals of rating counts.

# One row per (director, release year, cleaned title) combination.
q2data1 = data_c2[['导演', '上映年份', '整理后剧名']].drop_duplicates()
# Drop rows whose release year is 0.
q2data1 = q2data1[q2data1['上映年份'] != 0]

# Per-title totals. The two numeric columns are selected BEFORE summing:
# summing every column first would also try to aggregate the datetime and
# text columns, which raises on recent pandas versions and is wasted work
# otherwise.
q2data2 = data_c2.groupby('整理后剧名')[['评分人数', '好评数']].sum()

# Attach the per-title totals to the de-duplicated title rows.
q2data3 = pd.merge(q2data1, q2data2, left_on='整理后剧名', right_index=True)

# Per-year totals, again selecting the numeric columns before summing.
q2data4 = q2data3.groupby('上映年份')[['评分人数', '好评数']].sum()

# Area chart of the yearly rating counts from index label 2000 onwards.
fig1 = plt.figure(num=1, figsize=(12, 4))
q2data4['评分人数'].loc[2000:].plot.area(figsize=(10, 4),
                                        grid=True,
                                        color='g',
                                        alpha=0.8)
# NOTE(review): the ticks cover 2001..2015 while the slice starts at 2000
# and the title says 2001-2016 - confirm the intended range.
plt.xticks(range(2001, 2016))
plt.title('2001-2016年每年评影人数总量统计')
# 4 x 4 grid of box plots, one panel per release year starting at 2001.
fig, axes = plt.subplots(4, 4, figsize=(10, 16))
start = 2001
# axes.flat walks the grid row by row, i.e. in the same order as nested
# row/column index loops.
for panel in axes.flat:
    # NOTE(review): this rebinds the module-level name `data`.
    data = q2data3[q2data3['上映年份'] == start]
    data[['评分人数', '好评数']].boxplot(ax=panel,
                                     whis=3,
                                     return_type='dict')
    start += 1

# Rows whose release year is 2001.
a = q2data3[q2data3['上映年份'] == 2001]
def data_error(df, col):
    """Compute the outlier fences of ``df[col]`` at 3 * IQR.

    Args:
        df: DataFrame containing the column to inspect.
        col: Name of the numeric column.

    Returns:
        A tuple ``(tmax, tmin)``: the upper fence ``Q3 + 3 * IQR`` and the
        lower fence ``Q1 - 3 * IQR``, where IQR is ``Q3 - Q1``.
    """
    q1 = df[col].quantile(q=0.25)
    q3 = df[col].quantile(q=0.75)
    iqr = q3 - q1
    tmax = q3 + 3 * iqr
    # The lower fence is measured down from the first quartile; measuring
    # it from Q3 would place it too high and misclassify normal low values.
    tmin = q1 - 3 * iqr
    return (tmax, tmin)
# For each release year 2000..2015: print the row count, then the rows
# whose rating count lies above the upper fence from data_error.
for year in range(2000, 2016):
    datayear = q2data3[q2data3['上映年份'] == year]
    print('%i年有%i条数据' % (year, len(datayear)))
    # data_error returns (upper, lower); only the upper fence is used here.
    upper_fence = data_error(datayear, '评分人数')[0]
    is_outlier = datayear['评分人数'] > upper_fence
    print(datayear[is_outlier])
    print('-------\n')