import pandas as pd
import numpy as np
import os
% matplotlib inline
# Switch to the folder that holds the data so the relative file name below
# resolves.
os.chdir(r"C:\Users\Hans\Desktop\data_analysis\test_data\movie")

# The file has no header row, so column names are supplied explicitly.
# A multi-character separator ("::") is only handled by the Python engine.
# NOTE: 'MoiveID' (sic) is the column label used throughout the recorded
# outputs, so it is kept as-is.
df = pd.read_csv(
    "ratings.dat",
    sep="::",
    engine="python",
    header=None,
    names=['UserID', 'MoiveID', 'Rating', 'Timestamp'],
)
df.head()
UserID MoiveID Rating Timestamp 0 1 1193 5 978300760 1 1 661 3 978302109 2 1 914 3 978301968 3 1 3408 4 978300275 4 1 2355 5 978824291
# Interpret "Timestamp" as seconds since the Unix epoch and store the
# resulting datetimes in a new "pdate" column.
df["pdate"] = pd.to_datetime(df["Timestamp"], unit="s")
df.head()
UserID MoiveID Rating Timestamp pdate 0 1 1193 5 978300760 2000-12-31 22:12:40 1 1 661 3 978302109 2000-12-31 22:35:09 2 1 914 3 978301968 2000-12-31 22:32:48 3 1 3408 4 978300275 2000-12-31 22:04:35 4 1 2355 5 978824291 2001-01-06 23:38:11
# Group by (calendar month of "pdate", "Rating") and sum the "UserID" column
# into a one-column frame named "pv".
#
# The previous form, .agg({'pv': np.sum}), relied on dict-based renaming of a
# Series aggregation. That emits a FutureWarning on older pandas and raises
# on pandas >= 1.0; .sum().to_frame("pv") yields the same frame on every
# version.
#
# NOTE(review): this adds up the UserID values themselves. If "pv" is meant
# to be the number of ratings per group, a count would be needed instead --
# confirm the intent before relying on these numbers.
df_group = (
    df.groupby([df["pdate"].dt.month, "Rating"])["UserID"]
    .sum()
    .to_frame("pv")
)
df_group.head()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
pv pdate Rating 1 1 2613452 2 5294359 3 12273331 4 16452340 5 9580889
# Month number of every rating's "pdate" value.
df_month = df["pdate"].dt.month
df_month.head(6)
0 12
1 12
2 12
3 12
4 1
5 12
Name: pdate, dtype: int64
# Day-of-month of every rating's "pdate" value.
df_day = df["pdate"].dt.day
df_day.head(6)
0 31
1 31
2 31
3 31
4 6
5 31
Name: pdate, dtype: int64
# Display the full grouped frame (indexed by month and Rating).
df_group
pv pdate Rating 1 1 2613452 2 5294359 3 12273331 4 16452340 5 9580889 2 1 1608795 2 3747792 3 8608216 4 10941690 5 5819374 3 1 1344378 2 2772172 3 6156512 4 6911661 5 3228406 4 1 4397883 2 10147641 3 25330546 4 30406281 5 18070625 5 1 24893943 2 41021601 3 99671422 4 138699142 5 98385825 6 1 15896973 2 31940801 3 75297592 4 109067096 5 72816021 7 1 22506252 2 44079974 3 118016191 4 163188814 5 104777329 8 1 42290096 2 80215303 3 196812819 4 250299153 5 165391463 9 1 10185866 2 19751997 3 49514561 4 66638484 5 43733790 10 1 6825621 2 14936083 3 37937741 4 49413212 5 31949765 11 1 29762500 2 55254306 3 133879416 4 181288004 5 118358977 12 1 5474475 2 11043533 3 25870578 4 33351106 5 19000303
# Move the innermost index level ("Rating") into the columns, leaving one
# row per month. level=-1 is the default and is spelled out for clarity.
df_stack = df_group.unstack(level=-1)
df_stack
pv Rating 1 2 3 4 5 pdate 1 2613452 5294359 12273331 16452340 9580889 2 1608795 3747792 8608216 10941690 5819374 3 1344378 2772172 6156512 6911661 3228406 4 4397883 10147641 25330546 30406281 18070625 5 24893943 41021601 99671422 138699142 98385825 6 15896973 31940801 75297592 109067096 72816021 7 22506252 44079974 118016191 163188814 104777329 8 42290096 80215303 196812819 250299153 165391463 9 10185866 19751997 49514561 66638484 43733790 10 6825621 14936083 37937741 49413212 31949765 11 29762500 55254306 133879416 181288004 118358977 12 5474475 11043533 25870578 33351106 19000303
# Line plot of the unstacked frame: one line per Rating column, months on x.
df_stack.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x22b9c9941d0>
![png](output_7_1.png)
# Plot the grouped frame directly, with its (month, Rating) index on x.
df_group.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x22b9d770470>
![png](output_8_1.png)
# Turn the (pdate, Rating) index levels back into ordinary columns so the
# frame can be reshaped with pivot.
df_01 = df_group.reset_index()
df_01.head()
pdate Rating pv 0 1 1 2613452 1 1 2 5294359 2 1 3 12273331 3 1 4 16452340 4 1 5 9580889
# Reshape to one row per month ("pdate") and one column per "Rating", with
# "pv" as the cell values.
# The arguments are passed by keyword: positional arguments to
# DataFrame.pivot are deprecated and no longer accepted by pandas 2.x.
df_pivot = df_01.pivot(index="pdate", columns="Rating", values="pv")
df_pivot.head()
Rating 1 2 3 4 5 pdate 1 2613452 5294359 12273331 16452340 9580889 2 1608795 3747792 8608216 10941690 5819374 3 1344378 2772172 6156512 6911661 3228406 4 4397883 10147641 25330546 30406281 18070625 5 24893943 41021601 99671422 138699142 98385825
# Line plot of the pivoted frame: one line per Rating column, months on x.
df_pivot.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x22b9ca55748>
![png](output_11_1.png)
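'''
# stack: put simply, it moves column labels into the index
# unstack: moves index levels into the columns
stack syntax:
DataFrame.stack(level=-1, dropna=True); level=-1 means the innermost level of a multi-level index (note: it is negative one); other levels can be selected with 0, 1, 2, ...
pivot syntax:
DataFrame.pivot(index=None, columns=None, values=None) takes three parameters; if index is omitted, the frame's existing index is used, so the rows keep the order they already had (e.g. the order of the ratings file)
'''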