# MovieLens 1M数据集 稍微过了下数据分析这本书,最后再把前面第二章例子敲一遍,不然总是记不住
import pandas as pd
import numpy as np
from pandas import DataFrame
# Column names for users.dat; header=None below means the file has no header row.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# The '::' separator is longer than one character, so pandas interprets it as
# a regex, which only the python parser engine supports. Naming the engine
# explicitly avoids the ParserWarning about falling back from the C engine.
users = pd.read_table(
    'D:\\pytest\\pydata-book-master\\ch02\\movielens\\users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)
users[:5]
C:\software\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: ParserWarning: Falling back to the ‘python’ engine because the ‘c’ engine does not support regex separators (separators > 1 char and different from ‘\s+’ are interpreted as regex); you can avoid this warning by specifying engine=’python’.
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
user_id | gender | age | occupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
# Column names for ratings.dat; header=None below means the file has no header row.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
# A multi-character separator such as '::' is treated as a regex and needs the
# python parser engine; requesting it explicitly silences the ParserWarning
# pandas emits when it has to fall back from the C engine.
ratings = pd.read_table(
    'D:\\pytest\\pydata-book-master\\ch02\\movielens\\ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)
ratings[:5]
C:\software\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: ParserWarning: Falling back to the ‘python’ engine because the ‘c’ engine does not support regex separators (separators > 1 char and different from ‘\s+’ are interpreted as regex); you can avoid this warning by specifying engine=’python’.
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
# Column names for movies.dat; header=None below means the file has no header row.
mnames = ['movie_id', 'title', 'genres']
# '::' is a multi-character (regex) separator, which only the python parser
# engine handles; passing engine='python' avoids the fallback ParserWarning.
movies = pd.read_table(
    'D:\\pytest\\pydata-book-master\\ch02\\movielens\\movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)
movies[:5]
C:\software\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: ParserWarning: Falling back to the ‘python’ engine because the ‘c’ engine does not support regex separators (separators > 1 char and different from ‘\s+’ are interpreted as regex); you can avoid this warning by specifying engine=’python’.
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children’s|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children’s|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
# Print pandas' summary report for each of the three raw tables.
ratings.info()
users.info()
movies.info()
# Join ratings to movies and then to users. No key is given, so merge joins on
# the column names the two frames share.
data = ratings.merge(movies).merge(users)
data.head()
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
user_id | movie_id | rating | timestamp | title | genres | gender | age | occupation | zip | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 | One Flew Over the Cuckoo’s Nest (1975) | Drama | F | 1 | 10 | 48067 |
1 | 1 | 661 | 3 | 978302109 | James and the Giant Peach (1996) | Animation|Children’s|Musical | F | 1 | 10 | 48067 |
2 | 1 | 914 | 3 | 978301968 | My Fair Lady (1964) | Musical|Romance | F | 1 | 10 | 48067 |
3 | 1 | 3408 | 4 | 978300275 | Erin Brockovich (2000) | Drama | F | 1 | 10 | 48067 |
4 | 1 | 2355 | 5 | 978824291 | Bug’s Life, A (1998) | Animation|Children’s|Comedy | F | 1 | 10 | 48067 |
# Print pandas' summary report for the merged frame.
data.info()
data.iloc[1] # iloc and loc select rows; the ix indexer has been dropped and is no longer used
user_id                                      1
movie_id                                   661
rating                                       3
timestamp                            978302109
title         James and the Giant Peach (1996)
genres            Animation|Children's|Musical
gender                                       F
age                                          1
occupation                                  10
zip                                      48067
Name: 1, dtype: object

根据rating里男女观众的评分,对每一部电影分别取平均分,用了pivot_table方法,在书上的p288页有详细介绍
# Mean rating for every title (rows), split by gender (columns).
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head()
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
‘Night Mother (1986) | 3.388889 | 3.352941 |
‘Til There Was You (1997) | 2.675676 | 2.733333 |
‘burbs, The (1989) | 2.793478 | 2.962085 |
…And Justice for All (1979) | 3.828571 | 3.689024 |
将整个data按照title分组(groupby)并统计每组的行数,例如:'$1,000,000 Duck (1971)'这部电影共有37行数据,说明有37个人对其评价了。现在打算将评价数低于250的电影都去掉,因此首先要找出评价数不少于250的电影在rating_by_title中的行索引
# Number of rows (one per rating) recorded for each movie title.
rating_by_title = data.groupby(by='title').size()
rating_by_title.head()
title
$1,000,000 Duck (1971)            37
'Night Mother (1986)              70
'Til There Was You (1997)         52
'burbs, The (1989)               303
...And Justice for All (1979)    199
dtype: int64

取出评论数不少于250条的电影
# Titles whose rating count is at least 250: the boolean mask keeps those
# entries and .index extracts their labels.
active_titles = rating_by_title[rating_by_title >= 250].index
active_titles
Index([”burbs, The (1989)’, ‘10 Things I Hate About You (1999)’, ‘101 Dalmatians (1961)’, ‘101 Dalmatians (1996)’, ‘12 Angry Men (1957)’, ‘13th Warrior, The (1999)’, ‘2 Days in the Valley (1996)’, ‘20,000 Leagues Under the Sea (1954)’, ‘2001: A Space Odyssey (1968)’, ‘2010 (1984)’, … ‘X-Men (2000)’, ‘Year of Living Dangerously (1982)’, ‘Yellow Submarine (1968)’, ‘You’ve Got Mail (1998)’, ‘Young Frankenstein (1974)’, ‘Young Guns (1988)’, ‘Young Guns II (1990)’, ‘Young Sherlock Holmes (1985)’, ‘Zero Effect (1998)’, ‘eXistenZ (1999)’], dtype=’object’, name=’title’, length=1216)
# Restrict the per-gender means to the rows labelled by active_titles,
# keeping every column.
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings.head(10)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
gender | F | M |
---|---|---|
title | ||
‘burbs, The (1989) | 2.793478 | 2.962085 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 |
101 Dalmatians (1961) | 3.791444 | 3.500000 |
101 Dalmatians (1996) | 3.240000 | 2.911215 |
12 Angry Men (1957) | 4.184397 | 4.328421 |
13th Warrior, The (1999) | 3.112000 | 3.168000 |
2 Days in the Valley (1996) | 3.488889 | 3.244813 |
20,000 Leagues Under the Sea (1954) | 3.670103 | 3.709205 |
2001: A Space Odyssey (1968) | 3.825581 | 4.129738 |
2010 (1984) | 3.446809 | 3.413712 |
mean_ratings.info()
# Rank films by the female mean rating; ascending=False puts the highest first.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
### 计算评分分歧
gender | F | M |
---|---|---|
title | ||
Close Shave, A (1995) | 4.644444 | 4.473795 |
Wrong Trousers, The (1993) | 4.588235 | 4.478261 |
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) | 4.572650 | 4.464589 |
Wallace & Gromit: The Best of Aardman Animation (1996) | 4.563107 | 4.385075 |
Schindler’s List (1993) | 4.562602 | 4.491415 |
Shawshank Redemption, The (1994) | 4.539075 | 4.560625 |
Grand Day Out, A (1992) | 4.537879 | 4.293255 |
To Kill a Mockingbird (1962) | 4.536667 | 4.372611 |
Creature Comforts (1990) | 4.513889 | 4.272277 |
Usual Suspects, The (1995) | 4.513317 | 4.518248 |
# Rating gap per title: female mean minus male mean (positive means women
# rated the film higher).
mean_ratings['diff'] = mean_ratings['F'].sub(mean_ratings['M'])
mean_ratings.head(15)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
gender | F | M | diff |
---|---|---|---|
title | |||
‘burbs, The (1989) | 2.793478 | 2.962085 | -0.168607 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 | 0.334586 |
101 Dalmatians (1961) | 3.791444 | 3.500000 | 0.291444 |
101 Dalmatians (1996) | 3.240000 | 2.911215 | 0.328785 |
12 Angry Men (1957) | 4.184397 | 4.328421 | -0.144024 |
13th Warrior, The (1999) | 3.112000 | 3.168000 | -0.056000 |
2 Days in the Valley (1996) | 3.488889 | 3.244813 | 0.244076 |
20,000 Leagues Under the Sea (1954) | 3.670103 | 3.709205 | -0.039102 |
2001: A Space Odyssey (1968) | 3.825581 | 4.129738 | -0.304156 |
2010 (1984) | 3.446809 | 3.413712 | 0.033097 |
28 Days (2000) | 3.209424 | 2.977707 | 0.231717 |
39 Steps, The (1935) | 3.965517 | 4.107692 | -0.142175 |
54 (1998) | 2.701754 | 2.782178 | -0.080424 |
7th Voyage of Sinbad, The (1958) | 3.409091 | 3.658879 | -0.249788 |
8MM (1999) | 2.906250 | 2.850962 | 0.055288 |
前15个是女性比较喜爱但是男性不喜爱的电影,分歧最大的15部
# Order the films by the 'diff' column, largest value first.
diff_ratings = mean_ratings.sort_values('diff', ascending=False)
diff_ratings.head(15)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
gender | F | M | diff |
---|---|---|---|
title | |||
Dirty Dancing (1987) | 3.790378 | 2.959596 | 0.830782 |
Jumpin’ Jack Flash (1986) | 3.254717 | 2.578358 | 0.676359 |
Grease (1978) | 3.975265 | 3.367041 | 0.608224 |
Little Women (1994) | 3.870588 | 3.321739 | 0.548849 |
Steel Magnolias (1989) | 3.901734 | 3.365957 | 0.535777 |
Anastasia (1997) | 3.800000 | 3.281609 | 0.518391 |
Rocky Horror Picture Show, The (1975) | 3.673016 | 3.160131 | 0.512885 |
Color Purple, The (1985) | 4.158192 | 3.659341 | 0.498851 |
Age of Innocence, The (1993) | 3.827068 | 3.339506 | 0.487561 |
Free Willy (1993) | 2.921348 | 2.438776 | 0.482573 |
French Kiss (1995) | 3.535714 | 3.056962 | 0.478752 |
Little Shop of Horrors, The (1960) | 3.650000 | 3.179688 | 0.470312 |
Guys and Dolls (1955) | 4.051724 | 3.583333 | 0.468391 |
Mary Poppins (1964) | 4.197740 | 3.730594 | 0.467147 |
Patch Adams (1998) | 3.473282 | 3.008746 | 0.464536 |
倒序后取15个是男性喜爱而女性不喜爱的电影前15
# Reverse the row order of the sorted frame and show its first 15 rows.
diff_ratings.iloc[::-1].head(15)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
gender | F | M | diff |
---|---|---|---|
title | |||
Good, The Bad and The Ugly, The (1966) | 3.494949 | 4.221300 | -0.726351 |
Kentucky Fried Movie, The (1977) | 2.878788 | 3.555147 | -0.676359 |
Dumb & Dumber (1994) | 2.697987 | 3.336595 | -0.638608 |
Longest Day, The (1962) | 3.411765 | 4.031447 | -0.619682 |
Cable Guy, The (1996) | 2.250000 | 2.863787 | -0.613787 |
Evil Dead II (Dead By Dawn) (1987) | 3.297297 | 3.909283 | -0.611985 |
Hidden, The (1987) | 3.137931 | 3.745098 | -0.607167 |
Rocky III (1982) | 2.361702 | 2.943503 | -0.581801 |
Caddyshack (1980) | 3.396135 | 3.969737 | -0.573602 |
For a Few Dollars More (1965) | 3.409091 | 3.953795 | -0.544704 |
Porky’s (1981) | 2.296875 | 2.836364 | -0.539489 |
Animal House (1978) | 3.628906 | 4.167192 | -0.538286 |
Exorcist, The (1973) | 3.537634 | 4.067239 | -0.529605 |
Fright Night (1985) | 2.973684 | 3.500000 | -0.526316 |
Barb Wire (1996) | 1.585366 | 2.100386 | -0.515020 |
只找出分歧最大的电影,用方差或标准差计算
根据电影名称的评分,对每个电影计算标准差
# Standard deviation of the 'rating' column within each title group.
rating_std_by_title = (
    data.groupby('title')['rating']
    .std()
)
rating_std_by_title.head(15)
title
$1,000,000 Duck (1971) 1.092563
'Night Mother (1986) 1.118636
'Til There Was You (1997) 1.020159
'burbs, The (1989) 1.107760
...And Justice for All (1979) 0.878110
1-900 (1994) 0.707107
10 Things I Hate About You (1999) 0.989815
101 Dalmatians (1961) 0.982103
101 Dalmatians (1996) 1.098717
12 Angry Men (1957) 0.812731
13th Warrior, The (1999) 1.140421
187 (1997) 1.057919
2 Days in the Valley (1996) 0.921592
20 Dates (1998) 1.151943
20,000 Leagues Under the Sea (1954) 0.869685
Name: rating, dtype: float64
然后行过滤掉评价人数小于250人的电影
# Keep only the entries whose labels appear in active_titles (label-based selection).
rating_std_by_title = rating_std_by_title.loc[active_titles]
最后降序排列得出分歧最大的电影
# Largest standard deviation first; show the top 15 entries.
rating_std_by_title.sort_values(ascending=False).head(15)
title
Dumb & Dumber (1994) 1.321333
Blair Witch Project, The (1999) 1.316368
Natural Born Killers (1994) 1.307198
Tank Girl (1995) 1.277695
Rocky Horror Picture Show, The (1975) 1.260177
Eyes Wide Shut (1999) 1.259624
Evita (1996) 1.253631
Billy Madison (1995) 1.249970
Fear and Loathing in Las Vegas (1998) 1.246408
Bicentennial Man (1999) 1.245533
Hellraiser (1987) 1.243046
Babe: Pig in the City (1998) 1.239379
Wes Craven's New Nightmare (1994) 1.237630
South Park: Bigger, Longer and Uncut (1999) 1.235380
Deuce Bigalow: Male Gigolo (1999) 1.226337
Name: rating, dtype: float64
# Series.order() no longer exists in pandas (calling it raised the
# AttributeError recorded below); sort_values() is its replacement.
rating_std_by_title.sort_values(ascending=False)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-89-e2202e0e8762> in <module>()
----> 1 rating_std_by_title.order(ascending = False)
C:\software\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
2968 if name in self._info_axis:
2969 return self[name]
-> 2970 return object.__getattribute__(self, name)
2971
2972 def __setattr__(self, name, value):
AttributeError: 'Series' object has no attribute 'order'
关于排序:Series 的 order 方法从 pandas 0.17 起被弃用,并在之后的版本(0.20)中移除,所以调用 order 会抛出 AttributeError(见上面的报错信息)。现在 Series 和 DataFrame 都统一使用 sort_values 方法按值排序。
另外,如果要按索引排序,可以使用 sort_index 方法