初始操作
数据读取
- 输入
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
# Load the three '::'-delimited tables. The files have no header row, so the
# column names are supplied explicitly for each one.
# A multi-character separator is only supported by pandas' Python parsing
# engine; requesting it explicitly avoids the ParserWarning that is emitted
# when pandas silently falls back from the default C engine.
labels = ['UserId','Gender','Age','Occupation','zip-code']
users = pd.read_csv('./users.dat', sep='::', header=None, names=labels, engine='python')
users.shape
labels = ['movieId','Title','Genres']
movie = pd.read_csv('./movies.dat', sep='::', header=None, names=labels, engine='python')
display(movie.head(),movie.shape)
# The ratings table is read once only; a second, identical read of this
# large file only repeated the parsing work and produced the same frame.
labels = ['UserId','MovieId','Rating','Time']
ratings = pd.read_csv('./ratings.dat', sep='::', header=None, names=labels, engine='python')
display(ratings.head(),ratings.shape)
- 输出
(6040, 5)
movieId Title Genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
UserId MovieId Rating Time
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
(1000209, 4)
数据集成
数据分布在三个表(users、movies、ratings)中,需要先将它们合并成一个表;"数据合并"的专业术语是"数据集成"。
- 输入
# Preview the three source tables before joining them.
display(users.head(), movie.head(), ratings.head())
# Attach every rating to the profile of the user who gave it; 'UserId' is
# the only column name the two tables have in common.
df1 = users.merge(ratings, on='UserId', how='inner')
df1.head()
# Add the movie metadata. The key is spelled 'movieId' on the movie side and
# 'MovieId' on the ratings side, so both columns are kept in the result.
movie_data = movie.merge(df1, left_on='movieId', right_on='MovieId', how='inner')
display(movie_data.shape, movie_data.head())
# Distinct age codes present in the merged data.
movie_data['Age'].unique()
movie_data.shape
movie_data.head()
# Distinct movie titles, followed by how many there are.
movie_data['Title'].unique()
len(movie_data['Title'].unique())
- 输出
UserId Gender Age Occupation zip-code
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455
movieId Title Genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
UserId MovieId Rating Time
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
UserId Gender Age Occupation zip-code MovieId Rating Time
0 1 F 1 10 48067 1193 5 978300760
1 1 F 1 10 48067 661 3 978302109
2 1 F 1 10 48067 914 3 978301968
3 1 F 1 10 48067 3408 4 978300275
4 1 F 1 10 48067 2355 5 978824291
(1000209, 11)
movieId Title Genres UserId Gender Age Occupation zip-code MovieId Rating Time
0 1 Toy Story (1995) Animation|Children's|Comedy 1 F 1 10 48067 1 5 978824268
1 1 Toy Story (1995) Animation|Children's|Comedy 6 F 50 9 55117 1 4 978237008
2 1 Toy Story (1995) Animation|Children's|Comedy 8 M 25 12 11413 1 4 978233496
3 1 Toy Story (1995) Animation|Children's|Comedy 9 M 25 17 61614 1 5 978225952
4 1 Toy Story (1995) Animation|Children's|Comedy 10 F 35 1 95370 1 5 978226474
array([ 1, 50, 25, 35, 18, 45, 56], dtype=int64)
(1000209, 11)
movieId Title Genres UserId Gender Age Occupation zip-code MovieId Rating Time
0 1 Toy Story (1995) Animation|Children's|Comedy 1 F 1 10 48067 1 5 978824268
1 1 Toy Story (1995) Animation|Children's|Comedy 6 F 50 9 55117 1 4 978237008
2 1 Toy Story (1995) Animation|Children's|Comedy 8 M 25 12 11413 1 4 978233496
3 1 Toy Story (1995) Animation|Children's|Comedy 9 M 25 17 61614 1 5 978225952
4 1 Toy Story (1995) Animation|Children's|Comedy 10 F 35 1 95370 1 5 978226474
array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
..., 'Tigerland (2000)', 'Two Family House (2000)',
'Contender, The (2000)'], dtype=object)
3706
用透视表对电影进行数据分析
选取评分高或低的电影
- 输入
# Average rating per movie title, as a one-column ('Rating') frame indexed by Title.
movie_rate_mean = movie_data.pivot_table(index=['Title'], values=['Rating'], aggfunc='mean')
movie_rate_mean.shape
movie_rate_mean.head()
# Order the titles from highest to lowest average rating.
movie_rate_mean = movie_rate_mean.sort_values(by='Rating', ascending=False)
# The 20 titles with the highest average rating
movie_rate_mean.head(20)
# The 20 titles with the lowest average rating
movie_rate_mean.tail(20)
- 输出
(3706, 1)
Rating
Title
$1,000,000 Duck (1971) 3.027027
'Night Mother (1986) 3.371429
'Til There Was You (1997) 2.692308
'burbs, The (1989) 2.910891
...And Justice for All (1979) 3.713568
Rating
Title
Ulysses (Ulisse) (1954) 5.000000
Lured (1947) 5.000000
Follow the Bitch (1998) 5.000000
Bittersweet Motel (2000) 5.000000
Song of Freedom (1936) 5.000000
One Little Indian (1973) 5.000000
Smashing Time (1967) 5.000000
Schlafes Bruder