# Import the required libraries (导入相关函数库)
import numpy as np
import pandas as pd
import matplotlib. pylab as plt
# Configure matplotlib for Chinese text: SimHei supplies the CJK glyphs, and
# disabling the Unicode minus keeps negative tick labels on the ASCII hyphen.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Load the data (导入数据)
# Basic subscriber information; the CSV is GBK-encoded.
use_basicinf = pd.read_csv('./附件3:用户基本信息.csv', encoding='gbk')
use_basicinf.head()

# Read the three viewing sheets in a single pass over the workbook instead of
# re-opening and re-parsing the same .xlsx once per sheet. `read_excel` takes
# no `encoding` argument (Excel files carry their own encoding), and passing
# one raises TypeError on current pandas, so it is omitted here.
viewing_sheets = pd.read_excel(
    './附件1:用户收视信息.xlsx',
    sheet_name=['用户收视信息', '用户回看信息', '用户单片点播信息'],
)

# Live-viewing records.
use_rankinf = viewing_sheets['用户收视信息']
use_rankinf.head()

# Catch-up (replay) records.
use_backinf = viewing_sheets['用户回看信息']
use_backinf.head()

# Single-title on-demand records.
use_orderinf = viewing_sheets['用户单片点播信息']
use_orderinf.head()

# Attach the subscriber information to the live-viewing records by matching
# the set-top-box identifier, which is named differently in the two tables.
new_rankinf = pd.merge(
    use_basicinf,
    use_rankinf,
    left_on='机顶盒编号',
    right_on='机顶盒设备号',
)
new_rankinf.head()
# Rename the column headers (重命名列标题)
# Pull the user id and the start/end timestamps out of each record table.
# Every table names its timestamp columns differently.
time_1 = new_rankinf[['用户号', '收看开始时间', '收看结束时间']]
time_1
time_2 = use_backinf[['用户号', '回看开始时间', '回看结束时间']]
time_2
time_3 = use_orderinf[['用户号', '观看开始时间', '观看结束时间']]
time_3

# Give all three tables the same header so they can be stacked afterwards.
unified_columns = ['用户号', '开始时间', '结束时间']
for frame in (time_1, time_2, time_3):
    frame.columns = unified_columns
time_1.head()
# Concatenate the tables on their shared column labels (相同索引连接)
# Stack the three tables row-wise. The original row labels are kept
# (ignore_index=False), so index values may repeat across the sources.
time = pd.concat([time_1, time_2, time_3], axis=0, ignore_index=False)
time
# Data processing (数据处理)
# Inspect the rows that exactly repeat an earlier row.
time[time.duplicated()]
# Keep only the first occurrence of each repeated row.
time = time.drop_duplicates()
time
# Length of every record: end time minus start time.
time['time'] = time['结束时间'] - time['开始时间']
time.head()
# Sum the record lengths per user.
time_stats = time.groupby('用户号')['time'].sum()
time_stats
# Print every per-user total, each followed by a separator line.
# NOTE(review): the loop body had lost its indentation in the source; both
# print calls are assumed to belong inside the loop. Confirm against the
# original notebook.
for i in time_stats:
    print(i)
    print("*" * 100)
# Display the results (数据展示)
# Show the user(s) whose total equals the overall maximum; ties are all kept.
time_stats.loc[time_stats.eq(time_stats.max())]
# Data normalization (数据规范化)
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

# The sheet has no header row, so pandas numbers the columns 0..n-1.
df = pd.read_excel('./normalization_data.xls', header=None)
df

# --- Min-max scaling: map every column onto [0, 1]. ---
col_min = df.min()
col_range = df.max() - col_min
(df - col_min) / col_range

# Same transformation through scikit-learn (returns a NumPy array).
model = MinMaxScaler()
model.fit_transform(df)

# --- Z-score standardisation: zero mean, unit standard deviation. ---
col_mean = df.mean()
col_std = df.std()
(df - col_mean) / col_std

# NOTE: StandardScaler divides by the population standard deviation (ddof=0),
# whereas DataFrame.std defaults to the sample estimate (ddof=1), so these
# values differ slightly from the manual result above.
model = StandardScaler()
model.fit_transform(df)

# --- Decimal scaling: divide each column by the smallest power of ten that
# is greater than or equal to its largest absolute value. ---
power_of_ten = 10 ** np.ceil(np.log10(df.abs().max()))
df / power_of_ten

# NOTE: MaxAbsScaler divides by each column's maximum absolute value itself,
# not by a power of ten, so its output generally differs from the decimal
# scaling above.
model = MaxAbsScaler()
model.fit_transform(df)