pandas
电影喜爱分析
多个 DataFrame 的合并
pd.merge(ratings, users):按同名列(键)合并两个表;pd.concat([ratings, users, movies], axis=1):横向拼接,不按同名列合并;mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean'):按 title(行)和 gender(列)对 rating 取平均值,结果按 index 排列;ratings_by_title = data.groupby('title').size():统计每个 title 出现的次数;active_titles = ratings_by_title.index[ratings_by_title >= 250]:用布尔索引取出条件为真的 title。
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
# Limit how many rows pandas prints when a frame is displayed.
pd.options.display.max_rows = 10

# The ml-1m .dat files are read without a header row, with explicit column
# names, and with '::' as the field separator. A multi-character separator is
# interpreted as a regex, which the C parser does not support, so the Python
# engine is requested explicitly; otherwise every call falls back to it and
# emits a ParserWarning.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(
    r'C:\Users\Administrator\Desktop\网络模型\ml-1m\users.dat',
    sep='::', header=None, names=unames, engine='python',
)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(
    r'C:\Users\Administrator\Desktop\网络模型\ml-1m\ratings.dat',
    sep='::', header=None, names=rnames, engine='python',
)

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(
    r'C:\Users\Administrator\Desktop\网络模型\ml-1m\movies.dat',
    sep='::', header=None, names=mnames, engine='python',
)
E:\anaconda\envs\yolo\lib\site-packages\ipykernel_launcher.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
E:\anaconda\envs\yolo\lib\site-packages\ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
after removing the cwd from sys.path.
E:\anaconda\envs\yolo\lib\site-packages\ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
# Preview the first three rows of the users table.
users.head(3)
user_id gender age occupation zip 0 1 F 1 10 48067 1 2 M 56 16 70072 2 3 M 25 15 55117
# Preview the first three rows of the ratings table.
ratings.head(3)
user_id movie_id rating timestamp 0 1 1193 5 978300760 1 1 661 3 978302109 2 1 914 3 978301968
# Preview the first three rows of the movies table.
movies.head(3)
movie_id title genres 0 1 Toy Story (1995) Animation|Children's|Comedy 1 2 Jumanji (1995) Adventure|Children's|Fantasy 2 3 Grumpier Old Men (1995) Comedy|Romance
# Join ratings with users, then with movies; each merge uses the columns the
# two frames have in common as join keys.
data = ratings.merge(users).merge(movies)

# For comparison: a column-wise concat simply places the three frames side by
# side, aligned on the row index, without joining on shared columns.
pd.concat([ratings, users, movies], axis=1).head(3)
user_id movie_id rating timestamp user_id gender age occupation zip movie_id title genres 0 1 1193 5 978300760 1.0 F 1.0 10.0 48067 1.0 Toy Story (1995) Animation|Children's|Comedy 1 1 661 3 978302109 2.0 M 56.0 16.0 70072 2.0 Jumanji (1995) Adventure|Children's|Fantasy 2 1 914 3 978301968 3.0 M 25.0 15.0 55117 3.0 Grumpier Old Men (1995) Comedy|Romance
# Preview the first three rows of the merged table.
data.head(3)
user_id movie_id rating timestamp gender age occupation zip title genres 0 1 1193 5 978300760 F 1 10 48067 One Flew Over the Cuckoo's Nest (1975) Drama 1 2 1193 5 978298413 M 56 16 70072 One Flew Over the Cuckoo's Nest (1975) Drama 2 12 1193 4 978220179 M 25 12 32793 One Flew Over the Cuckoo's Nest (1975) Drama
# Mean rating per title (rows), split by gender (columns).
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head(5)
gender F M title $1,000,000 Duck (1971) 3.375000 2.761905 'Night Mother (1986) 3.388889 3.352941 'Til There Was You (1997) 2.675676 2.733333 'burbs, The (1989) 2.793478 2.962085 ...And Justice for All (1979) 3.828571 3.689024
# Number of rating rows per title.
ratings_by_title = data.groupby(by='title').size()
ratings_by_title.head(4)
title
$1,000,000 Duck (1971) 37
'Night Mother (1986) 70
'Til There Was You (1997) 52
'burbs, The (1989) 303
dtype: int64
# Keep only the titles that received at least 250 ratings: filter the counts
# with a boolean mask, then take the surviving index labels.
active_titles = ratings_by_title[ratings_by_title >= 250].index
pd.Series(active_titles).head(5)
0 'burbs, The (1989)
1 10 Things I Hate About You (1999)
2 101 Dalmatians (1961)
3 101 Dalmatians (1996)
4 12 Angry Men (1957)
Name: title, dtype: object
# Restrict the per-gender mean ratings to the titles in active_titles.
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings.head(5)
gender F M title 'burbs, The (1989) 2.793478 2.962085 10 Things I Hate About You (1999) 3.646552 3.311966 101 Dalmatians (1961) 3.791444 3.500000 101 Dalmatians (1996) 3.240000 2.911215 12 Angry Men (1957) 4.184397 4.328421
# Rank titles by the female mean rating, highest first.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(4)
gender F M different title Close Shave, A (1995) 4.644444 4.473795 -0.170650 Wrong Trousers, The (1993) 4.588235 4.478261 -0.109974 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589 -0.108060 Wallace & Gromit: The Best of Aardman Animation (1996) 4.563107 4.385075 -0.178032
# Male mean minus female mean: negative values are titles rated higher by
# women, so an ascending sort lists those first.
mean_ratings['different'] = mean_ratings['M'].sub(mean_ratings['F'])
sorted_by_diff = mean_ratings.sort_values('different')
sorted_by_diff.head(4)
gender F M different title Dirty Dancing (1987) 3.790378 2.959596 -0.830782 Jumpin' Jack Flash (1986) 3.254717 2.578358 -0.676359 Grease (1978) 3.975265 3.367041 -0.608224 Little Women (1994) 3.870588 3.321739 -0.548849
# Standard deviation of ratings per title, limited to the titles in
# active_titles; the largest values mark the titles viewers disagree on most.
rating_std_by_title = (
    data.groupby('title')['rating']
    .std()
    .loc[active_titles]
)
rating_std_by_title.sort_values(ascending=False).head(5)
title
Dumb & Dumber (1994) 1.321333
Blair Witch Project, The (1999) 1.316368
Natural Born Killers (1994) 1.307198
Tank Girl (1995) 1.277695
Rocky Horror Picture Show, The (1975) 1.260177
Name: rating, dtype: float64