#(1)怎样对数值列按照分组的归一化
#(2)怎样取每个分组的TOP N数据
‘’’
X的归一化:
X(normalized) = (X当前值-最小值)/(X最大值-X最小值)
‘’’
import pandas as pd
import os
os.chdir(r"C:\Users\Hans\Desktop\data_analysis\test_data\movie")
ratings = pd.read_csv("ratings.dat",sep = "::",engine = "python",names = "UserID::MovieID::Rating::Timestamp".split("::"))
ratings.head()
|
UserID |
MovieID |
Rating |
Timestamp |
0 |
1 |
1193 |
5 |
978300760 |
1 |
1 |
661 |
3 |
978302109 |
2 |
1 |
914 |
3 |
978301968 |
3 |
1 |
3408 |
4 |
978300275 |
4 |
1 |
2355 |
5 |
978824291 |
def ratings_norm(df):
min_value = df["Rating"].min()
max_value = df["Rating"].max()
df["Rating_norm"] = df["Rating"].apply(lambda x:(x-min_value)/(max_value-min_value))
return df
a = ratings.groupby("UserID").apply(ratings_norm)
a.head()
|
UserID |
MovieID |
Rating |
Timestamp |
Rating_norm |
0 |
1 |
1193 |
5 |
978300760 |
1.0 |
1 |
1 |
661 |
3 |
978302109 |
0.0 |
2 |
1 |
914 |
3 |
978301968 |
0.0 |
3 |
1 |
3408 |
4 |
978300275 |
0.5 |
4 |
1 |
2355 |
5 |
978824291 |
1.0 |
ratings["Rating"].unique()
array([5, 3, 4, 2, 1], dtype=int64)
b = ratings.loc[ratings["UserID"]==1]
b.tail()
|
UserID |
MovieID |
Rating |
Timestamp |
48 |
1 |
2028 |
5 |
978301619 |
49 |
1 |
531 |
4 |
978302149 |
50 |
1 |
3114 |
4 |
978302174 |
51 |
1 |
608 |
4 |
978301398 |
52 |
1 |
1246 |
4 |
978302091 |
b["Rating"].unique()
array([5, 3, 4], dtype=int64)
ratings.groupby("UserID").apply(ratings_norm).head()
|
UserID |
MovieID |
Rating |
Timestamp |
Rating_norm |
0 |
1 |
1193 |
5 |
978300760 |
1.0 |
1 |
1 |
661 |
3 |
978302109 |
0.0 |
2 |
1 |
914 |
3 |
978301968 |
0.0 |
3 |
1 |
3408 |
4 |
978300275 |
0.5 |
4 |
1 |
2355 |
5 |
978824291 |
1.0 |
#这里注意,按照UserID分组后,UserID==1的组,对应的RatIng只有三个值,5,3,4
file_path = r"C:\Users\Hans\Desktop\data_analysis\test_data\Beijing_2014.csv"
df = pd.read_csv(file_path)
df.head()
|
Date |
Temperature(Celsius)(high) |
Temperature(Celsius)(avg) |
Temperature(Celsius)(low) |
Dew Point(Celsius)(high) |
Dew Point(Celsius)(avg) |
Dew Point(Celsius)(low) |
Humidity(%)(high) |
Humidity(%)(avg) |
Humidity(%)(low) |
... |
Visibility(km)(high) |
Visibility(km)(avg) |
Visibility(km)(low) |
Wind(km/h)(high) |
Wind(km/h)(avg) |
Precipitation(mm)(high) |
Precipitation(mm)(sum) |
weather1 |
weather2 |
weather3 |
0 |
2014-1-1 |
12 |
4 |
-2 |
-12 |
-17 |
-20 |
44 |
19 |
7 |
... |
31 |
11 |
5 |
32 |
16 |
47 |
0.0 |
NaN |
NaN |
NaN |
1 |
2014-1-2 |
7 |
0 |
-6 |
-6 |
-9 |
-13 |
74 |
50 |
28 |
... |
18 |
7 |
3 |
18 |
5 |
- |
0.0 |
NaN |
NaN |
NaN |
2 |
2014-1-3 |
9 |
3 |
-2 |
-7 |
-13 |
-18 |
64 |
32 |
9 |
... |
31 |
13 |
5 |
18 |
8 |
- |
0.0 |
NaN |
NaN |
NaN |
3 |
2014-1-4 |
2 |
-2 |
-6 |
-4 |
-7 |
-9 |
80 |
68 |
44 |
... |
10 |
5 |
2 |
11 |
5 |
- |
0.0 |
NaN |
NaN |
NaN |
4 |
2014-1-5 |
7 |
0 |
-7 |
-5 |
-11 |
-15 |
80 |
51 |
15 |
... |
31 |
7 |
3 |
18 |
10 |
- |
0.0 |
NaN |
NaN |
NaN |
5 rows × 23 columns
from datetime import datetime
m = pd.to_datetime(df[</