import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Import the data (导入数据)
# Load the CSV into a DataFrame, then print its (rows, columns) shape and
# its first five rows as a quick sanity check.
filename = './data/kobe.csv'
raw = pd.read_csv(filename)
print(raw.shape)
print(raw.head())
(30697, 25)
action_type combined_shot_type game_event_id game_id lat \
0 Jump Shot Jump Shot 10 20000012 33.9723
1 Jump Shot Jump Shot 12 20000012 34.0443
2 Jump Shot Jump Shot 35 20000012 33.9093
3 Jump Shot Jump Shot 43 20000012 33.8693
4 Driving Dunk Shot Dunk 155 20000012 34.0443
loc_x loc_y lon minutes_remaining period ... shot_type \
0 167 72 -118.1028 10 1 ... 2PT Field Goal
1 -157 0 -118.4268 10 1 ... 2PT Field Goal
2 -101 135 -118.3708 7 1 ... 2PT Field Goal
3 138 175 -118.1318 6 1 ... 2PT Field Goal
4 0 0 -118.2698 6 2 ... 2PT Field Goal
shot_zone_area shot_zone_basic shot_zone_range team_id \
0 Right Side(R) Mid-Range 16-24 ft. 1610612747
1 Left Side(L) Mid-Range 8-16 ft. 1610612747
2 Left Side Center(LC) Mid-Range 16-24 ft. 1610612747
3 Right Side Center(RC) Mid-Range 16-24 ft. 1610612747
4 Center(C) Restricted Area Less Than 8 ft. 1610612747
team_name game_date matchup opponent shot_id
0 Los Angeles Lakers 2000-10-31 LAL @ POR POR 1
1 Los Angeles Lakers 2000-10-31 LAL @ POR POR 2
2 Los Angeles Lakers 2000-10-31 LAL @ POR POR 3
3 Los Angeles Lakers 2000-10-31 LAL @ POR POR 4
4 Los Angeles Lakers 2000-10-31 LAL @ POR POR 5
[5 rows x 25 columns]
Center(C) 11289
Right Side Center(RC) 3981
Right Side(R) 3859
Left Side Center(LC) 3364
Left Side(L) 3132
Back Court(BC) 72
Name: shot_zone_area, dtype: int64
6
# Columns excluded from the feature set.
drops = [
    'shot_id', 'team_id', 'team_name',
    'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
    'matchup', 'lon', 'lat',
    'seconds_remaining', 'minutes_remaining',
    'shot_distance', 'loc_x', 'loc_y',
    'game_event_id', 'game_id', 'game_date',
]
# Drop everything in one call using the explicit `columns=` keyword: the
# positional axis form `drop(label, 1)` is no longer accepted by pandas 2.x.
# As with per-column dropping, a KeyError is raised if a listed column is
# missing from `raw`.
raw = raw.drop(columns=drops)
Jump Shot 23485
Layup 5448
Dunk 1286
Tip Shot 184
Hook Shot 153
Bank Shot 141
Name: combined_shot_type, dtype: int64
   combined_shot_type_Bank Shot  combined_shot_type_Dunk  combined_shot_type_Hook Shot  combined_shot_type_Jump Shot  combined_shot_type_Layup  combined_shot_type_Tip Shot
0                             0                        0                             0                             1                         0                            0
1                             0                        0                             0                             1                         0                            0
# Columns to one-hot encode.
categorical_vars = [
    'action_type', 'combined_shot_type', 'shot_type',
    'opponent', 'period', 'season',
]
# `get_dummies(..., columns=...)` replaces each listed column with indicator
# columns named "<column>_<value>" and appends them after the remaining
# columns, in the order listed. This avoids the positional `axis` arguments
# to `pd.concat` / `DataFrame.drop`, which pandas 2.x rejects.
raw = pd.get_dummies(raw, columns=categorical_vars)
# Tune RandomForestClassifier hyper-parameters (n_estimators, then max_depth)
# by cross-validated log loss.
#
# NOTE(review): `train_kobe` (features) and `train_label` (targets) are not
# defined in this section; they are presumably built from `raw` earlier in
# the file and must support `.iloc` row indexing -- confirm.
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

N_SPLITS = 5


def _cv_log_loss(model, features, labels, n_splits=N_SPLITS, random_state=40):
    """Return the mean log loss of *model* over shuffled K-fold splits.

    The model is refit from scratch on the training part of every fold and
    scored on the held-out part.

    Args:
        model: Classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
        features: Feature table supporting ``.iloc`` row selection.
        labels: Target values aligned with ``features``, supporting ``.iloc``.
        n_splits: Number of folds.
        random_state: Seed for the fold shuffling, so every candidate model
            is evaluated on the same splits.

    Returns:
        The log loss averaged over the ``n_splits`` held-out folds.
    """
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    total = 0.0
    for train_idx, test_idx in folds.split(features):
        model.fit(features.iloc[train_idx], labels.iloc[train_idx])
        # log_loss is defined on predicted probabilities; feeding it hard
        # class predictions charges the maximum (clipped) penalty for every
        # misclassified row and ignores the model's confidence.
        proba = model.predict_proba(features.iloc[test_idx])
        # Passing the class list keeps scoring well-defined even if a fold's
        # held-out labels happen to contain a single class.
        total += log_loss(labels.iloc[test_idx], proba, labels=model.classes_)
    # Divide by the number of folds actually run so the result is a true mean.
    return total / n_splits


# --- find the best n_estimators for RandomForestClassifier ---
print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(1, 2, num=8).astype(int)
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()

    rfc = RandomForestClassifier(n_estimators=n)
    rfc_score = _cv_log_loss(rfc, train_kobe, train_label)
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n

    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
print(best_n, min_score)

# --- find the best max_depth for RandomForestClassifier ---
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=8).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()

    # Reuse the tree count selected by the previous search.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    rfc_score = _cv_log_loss(rfc, train_kobe, train_label)
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m

    t2 = time.time()
    # Report the depth that was evaluated (this loop varies depth, not the
    # number of trees).
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2 - t1))
print(best_m, min_score)
Finding best n_estimators for RandomForestClassifier...
the number of trees : 10
Done processing 10 trees (2.832sec)
the number of trees : 13
Done processing 13 trees (3.499sec)
the number of trees : 19
Done processing 19 trees (5.482sec)
the number of trees : 26
Done processing 26 trees (6.886sec)
the number of trees : 37
Done processing 37 trees (10.694sec)
the number of trees : 51
Done processing 51 trees (14.202sec)
the number of trees : 71
Done processing 71 trees (18.946sec)
the number of trees : 100
Done processing 100 trees (25.590sec)
100 5.898521466994053
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing 1 trees (2.554sec)
the max depth : 1
Done processing 1 trees (2.581sec)
the max depth : 3
Done processing 3 trees (3.824sec)
the max depth : 7
Done processing 7 trees (6.487sec)
the max depth : 13
Done processing 13 trees (10.639sec)
the max depth : 26
Done processing 26 trees (18.684sec)
the max depth : 51
Done processing 51 trees (27.133sec)
the max depth : 100
Done processing 100 trees (28.496sec)
13 5.504022676997401