科比生涯数据集分析与预测

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
filename = 'data.csv'
raw = pd.read_csv(filename)
print(raw.shape)
raw.head()
(30697, 25)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
action_typecombined_shot_typegame_event_idgame_idlatloc_xloc_ylonminutes_remainingperiodshot_typeshot_zone_areashot_zone_basicshot_zone_rangeteam_idteam_namegame_datematchupopponentshot_id
0Jump ShotJump Shot102000001233.972316772-118.10281012PT Field GoalRight Side(R)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR1
1Jump ShotJump Shot122000001234.0443-1570-118.42681012PT Field GoalLeft Side(L)Mid-Range8-16 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR2
2Jump ShotJump Shot352000001233.9093-101135-118.3708712PT Field GoalLeft Side Center(LC)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR3
3Jump ShotJump Shot432000001233.8693138175-118.1318612PT Field GoalRight Side Center(RC)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR4
4Driving Dunk ShotDunk1552000001234.044300-118.2698622PT Field GoalCenter(C)Restricted AreaLess Than 8 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR5

5 rows × 25 columns

pd.notnull(raw['shot_made_flag'])
0 False 1 True 2 True 3 True 4 True 5 True 6 True 7 False 8 True 9 True 10 True 11 True 12 True 13 True 14 True 15 True 16 False 17 True 18 True 19 False 20 True 21 True 22 True 23 True 24 True 25 True 26 True 27 True 28 True 29 True … 30667 True 30668 False 30669 True 30670 True 30671 True 30672 True 30673 True 30674 True 30675 True 30676 True 30677 True 30678 True 30679 True 30680 False 30681 True 30682 False 30683 True 30684 True 30685 True 30686 False 30687 True 30688 True 30689 True 30690 True 30691 True 30692 True 30693 False 30694 True 30695 True 30696 True Name: shot_made_flag, Length: 30697, dtype: bool
#
kebo = raw[pd.notnull(raw['shot_made_flag'])]
print(kebo.shape)
kebo.head()
(25697, 25)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
action_typecombined_shot_typegame_event_idgame_idlatloc_xloc_ylonminutes_remainingperiodshot_typeshot_zone_areashot_zone_basicshot_zone_rangeteam_idteam_namegame_datematchupopponentshot_id
1Jump ShotJump Shot122000001234.0443-1570-118.42681012PT Field GoalLeft Side(L)Mid-Range8-16 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR2
2Jump ShotJump Shot352000001233.9093-101135-118.3708712PT Field GoalLeft Side Center(LC)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR3
3Jump ShotJump Shot432000001233.8693138175-118.1318612PT Field GoalRight Side Center(RC)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR4
4Driving Dunk ShotDunk1552000001234.044300-118.2698622PT Field GoalCenter(C)Restricted AreaLess Than 8 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR5
5Jump ShotJump Shot2442000001234.0553-145-11-118.4148932PT Field GoalLeft Side(L)Mid-Range8-16 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR6

5 rows × 25 columns

alpha = 0.02
plt.figure(figsize=(15,15))

plt.subplot(121)
plt.scatter(kebo.loc_x,kebo.loc_y,color='R',alpha=alpha)
plt.title('loc_x and loc_y')

plt.subplot(122)
plt.scatter(kebo.lon,kebo.lat,color='B',alpha=alpha)
plt.title('lat and lon')
Text(0.5,1,’lat and lon’) ![png](output_4_1.png)
raw['dist'] = np.sqrt(raw['loc_x']**2+raw['loc_y']**2)
raw.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
action_typecombined_shot_typegame_event_idgame_idlatloc_xloc_ylonminutes_remainingperiodshot_zone_areashot_zone_basicshot_zone_rangeteam_idteam_namegame_datematchupopponentshot_iddist
0Jump ShotJump Shot102000001233.972316772-118.1028101Right Side(R)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR1181.859836
1Jump ShotJump Shot122000001234.0443-1570-118.4268101Left Side(L)Mid-Range8-16 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR2157.000000
2Jump ShotJump Shot352000001233.9093-101135-118.370871Left Side Center(LC)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR3168.600119
3Jump ShotJump Shot432000001233.8693138175-118.131861Right Side Center(RC)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR4222.865430
4Driving Dunk ShotDunk1552000001234.044300-118.269862Center(C)Restricted AreaLess Than 8 ft.1610612747Los Angeles Lakers2000-10-31LAL @ PORPOR50.000000

5 rows × 26 columns

loc_x_zero = raw['loc_x']==0
raw['angle'] = np.array([0]*len(raw))
#raw.head()
# ~loc_x_zero:取反
raw['angle'][~loc_x_zero]=np.arctan(raw['loc_y'][~loc_x_zero]/raw['loc_y'][~loc_x_zero])
raw['angle'][loc_x_zero] = np.pi/2
/home/heres/.conda/envs/GPU_test/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy “”” /home/heres/.conda/envs/GPU_test/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
raw['remaining_time'] = raw['minutes_remaining']*60+raw['seconds_remaining']
#当前列里面有多少不重复的值
print(kebo.action_type.unique())
print(kebo.combined_shot_type.unique())
print(kebo['shot_type'].unique())
print(kebo.shot_type.value_counts())
[‘Jump Shot’ ‘Driving Dunk Shot’ ‘Layup Shot’ ‘Running Jump Shot’ ‘Reverse Dunk Shot’ ‘Slam Dunk Shot’ ‘Driving Layup Shot’ ‘Turnaround Jump Shot’ ‘Reverse Layup Shot’ ‘Tip Shot’ ‘Running Hook Shot’ ‘Alley Oop Dunk Shot’ ‘Dunk Shot’ ‘Alley Oop Layup shot’ ‘Running Dunk Shot’ ‘Driving Finger Roll Shot’ ‘Running Layup Shot’ ‘Finger Roll Shot’ ‘Fadeaway Jump Shot’ ‘Follow Up Dunk Shot’ ‘Hook Shot’ ‘Turnaround Hook Shot’ ‘Jump Hook Shot’ ‘Running Finger Roll Shot’ ‘Jump Bank Shot’ ‘Turnaround Finger Roll Shot’ ‘Hook Bank Shot’ ‘Driving Hook Shot’ ‘Running Tip Shot’ ‘Running Reverse Layup Shot’ ‘Driving Finger Roll Layup Shot’ ‘Fadeaway Bank shot’ ‘Pullup Jump shot’ ‘Finger Roll Layup Shot’ ‘Turnaround Fadeaway shot’ ‘Driving Reverse Layup Shot’ ‘Driving Slam Dunk Shot’ ‘Step Back Jump shot’ ‘Turnaround Bank shot’ ‘Reverse Slam Dunk Shot’ ‘Floating Jump shot’ ‘Putback Slam Dunk Shot’ ‘Running Bank shot’ ‘Driving Bank shot’ ‘Driving Jump shot’ ‘Putback Layup Shot’ ‘Putback Dunk Shot’ ‘Running Finger Roll Layup Shot’ ‘Pullup Bank shot’ ‘Running Slam Dunk Shot’ ‘Cutting Layup Shot’ ‘Driving Floating Jump Shot’ ‘Running Pull-Up Jump Shot’ ‘Tip Layup Shot’ ‘Driving Floating Bank Jump Shot’] [‘Jump Shot’ ‘Dunk’ ‘Layup’ ‘Tip Shot’ ‘Hook Shot’ ‘Bank Shot’] [‘2PT Field Goal’ ‘3PT Field Goal’] 2PT Field Goal 20285 3PT Field Goal 5412 Name: shot_type, dtype: int64
raw['season'].unique()
array([‘2000-01’, ‘2001-02’, ‘2002-03’, ‘2003-04’, ‘2004-05’, ‘2005-06’, ‘2006-07’, ‘2007-08’, ‘2008-09’, ‘2009-10’, ‘2010-11’, ‘2011-12’, ‘2012-13’, ‘2013-14’, ‘2014-15’, ‘2015-16’, ‘1996-97’, ‘1997-98’, ‘1998-99’, ‘1999-00’], dtype=object)
raw22 = raw['season'].apply(lambda x: int(x.split('-')[1]))
raw['season']=raw22
raw['season'].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 97, 98, 99, 0])
print(kebo.team_id.unique())
print(kebo['team_name'].unique())
[1610612747] [‘Los Angeles Lakers’]
pd.DataFrame({'matchup':kebo.matchup,'opponent':kebo.opponent})[0:5]
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
matchupopponent
1LAL @ PORPOR
2LAL @ PORPOR
3LAL @ PORPOR
4LAL @ PORPOR
5LAL @ PORPOR
plt.figure(figsize=(5,5))
plt.scatter(raw.dist,raw.shot_distance,color='blue')
plt.title('dist and shot_distance')
Text(0.5,1,’dist and shot_distance’) ![png](output_13_1.png)
df = pd.DataFrame({'key1':['a', 'a', 'b', 'b', 'c'],
...     'key2':['one', 'two', 'one', 'two', 'one'],
...     'data1':np.random.randn(5),
...     'data2':np.random.randn(5)})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1key2data1data2
0aone-1.2088150.858278
1atwo0.1892390.536879
2bone-1.1888080.405909
3btwo-0.231954-0.137537
4cone0.358366-1.643352
# df.drop(4,1):1表示列
#df.drop(4)
# print(pd.get_dummies(df['key1'],prefix='key1'))
#拼接one-hot编码
df = pd.concat([df,pd.get_dummies(df['key1'],prefix='key1')],1)
#删除
df.drop('key1',1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key2data1data2key1_akey1_bkey1_c
0one-1.2088150.858278100
1two0.1892390.536879100
2one-1.1888080.405909010
3two-0.231954-0.137537010
4one0.358366-1.643352001
ss = df.groupby('key1')
for g in ss:
    print(g[1])

for g in ss:
    print(g)
key1 key2 data1 data2 0 a one 1.084443 -0.806561 1 a two -0.191386 2.743989 key1 key2 data1 data2 2 b one 0.302269 -0.560596 3 b two -0.213348 0.962007 key1 key2 data1 data2 4 c one -0.893631 -0.193283 (‘a’, key1 key2 data1 data2 0 a one 1.084443 -0.806561 1 a two -0.191386 2.743989) (‘b’, key1 key2 data1 data2 2 b one 0.302269 -0.560596 3 b two -0.213348 0.962007) (‘c’, key1 key2 data1 data2 4 c one -0.893631 -0.193283)
df['data2'].groupby(df['key1']).sum()
key1 a -2.167849 b -1.954537 Name: data2, dtype: float64
import matplotlib.cm as cm
plt.figure(figsize=(20,10))

def scatter_plot_by_category(feat):
    appha = 0.1
    gs = kebo.groupby(feat)
    cs = cm.rainbow(np.linspace(0,1,len(gs)))
    for g,c in zip(gs,cs):
        plt.scatter(g[1].loc_x,g[1].loc_y,color=c,alpha=alpha)

plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')

plt.subplot(132)df['data1'].groupby(df['key1']).sum()
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')

plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')
Text(0.5,1,’shot_zone_range’) ![png](output_18_1.png)
#one-hot
print(raw['combined_shot_type'].value_counts())
pd.get_dummies(raw['combined_shot_type'],prefix='combined_shot_type')[0:2]
Jump Shot 23485 Layup 5448 Dunk 1286 Tip Shot 184 Hook Shot 153 Bank Shot 141 Name: combined_shot_type, dtype: int64
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
combined_shot_type_Bank Shotcombined_shot_type_Dunkcombined_shot_type_Hook Shotcombined_shot_type_Jump Shotcombined_shot_type_Layupcombined_shot_type_Tip Shot
0000100
1000100
from sklearn import preprocessing

enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
enc.transform([[0, 0, 2], [1, 1, 0], [0, 2, 1], [1, 0, 2]]).toarray()
array([[1., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 1., 0.]])
#drop一些数据
filename = 'data.csv'
raw = pd.read_csv(filename)
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic', \
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining', \
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
for drop in drops:
    raw = raw.drop(drop, 1)
print(raw['combined_shot_type'].value_counts())
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')[0:2]
#制作数据集
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], 1)
    raw = raw.drop(var, 1)
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', 1)

test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', 1)
#建一个模型
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix,log_loss
import time

import numpy  as np
#等比数列
range_m = np.logspace(0,2,num=5)
print(range_m)
range_m = range_m.astype(int)
range_m
Jump Shot    23485
Layup         5448
Dunk          1286
Tip Shot       184
Hook Shot      153
Bank Shot      141
Name: combined_shot_type, dtype: int64
[  1.           3.16227766  10.          31.6227766  100.        ]





array([  1,   3,  10,  31, 100])
from sklearn.cross_validation import KFold

print('find best n_estimators for RandomForestClassfier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.linspace(1,100,num=10).astype(int)
for n in range_n:
    print('the number of trees: {0}'.format(n))
    t1= time.time()

    rfc_score = 0.
    #模型是什么
    rfc = RandomForestClassifier(n_estimators=n)
    for train_k,test_k in KFold(len(train_kobe),n_folds=10,shuffle=True):
#             print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
# #             print(train_k,test_k)
#             print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
            rfc.fit(train_kobe.iloc[train_k],train_label.iloc[train_k])
            pred = rfc.predict(train_kobe.iloc[test_k])
            rfc_score += log_loss(train_label.iloc[test_k],pred)/10
    scores_n.append(rfc_score)
    print(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n,t2-t1))
print(best_n,min_score)
find best n_estimators for RandomForestClassfier...
the number of trees: 1
13.919386979412682
Done processing 1 trees (0.726sec)
the number of trees: 12
13.389844271562911
Done processing 12 trees (6.008sec)
the number of trees: 23
13.180159610972076
Done processing 23 trees (11.043sec)
the number of trees: 34
13.219150771746676
Done processing 34 trees (18.443sec)
the number of trees: 45
13.043065195199537
Done processing 45 trees (22.152sec)
the number of trees: 56
13.057856481381911
Done processing 56 trees (27.636sec)
the number of trees: 67
13.055167436843206
Done processing 67 trees (32.039sec)
the number of trees: 78
13.051153915112355
Done processing 78 trees (38.372sec)
the number of trees: 89
13.048438999530031
Done processing 89 trees (44.605sec)
the number of trees: 100
13.114318256666222
Done processing 100 trees (49.886sec)
45 13.043065195199537
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0,2,num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()

    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    for train_k, test_k in KFold(len(train_kobe), n_folds=10, shuffle=True):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        #rfc_score += rfc.score(train.iloc[test_k], train_y.iloc[test_k])/10
        pred = rfc.predict(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m

    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(m, t2-t1))
print(best_m, min_score)
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing 1 trees (2.389sec)
the max depth : 10
Done processing 10 trees (6.044sec)
the max depth : 100
Done processing 100 trees (21.773sec)
10 11.06186210729279
plt.figure(figsize=(10,5))
plt.subplot(121)
plt.plot(range_n, scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')

plt.subplot(122)
plt.plot(range_m, scores_m)
plt.ylabel('score')
plt.xlabel('max depth')
Text(0.5,0,'max depth')

这里写图片描述

model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train_kobe, train_label)
pred = model.predict(train_kobe)
count = 0 
for i,j in zip(train_label,pred):
    if i == j:
        count +=1
print("acc is {0}".format(count/len(train_label)))
acc is 0.6850215978518893
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_kobe,train_label, test_size=0.3, random_state=0) 
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(X_train,y_train)
pre = model.predict(X_test)

count = 0 
for i,j in zip(y_test,pred):
    if i == j:
        count +=1
print("acc is {0}".format(count/len(pre)))
acc is 0.524254215304799
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值