import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
filename = 'data.csv'
raw = pd.read_csv(filename)
print(raw.shape)
raw.head()
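The next few cells are a short pandas groupby demo on a toy frame. The DataFrame `df` they use is never built in the cells shown; a minimal sketch that reproduces its shape (keys and columns read off the printed output below, values random):

import numpy as np
import pandas as pd

# Hypothetical setup for the groupby demo; keys/columns match the printed
# output below, data1/data2 are just random draws.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'c'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})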
ss = df.groupby('key1')
for g in ss:
    print(g[1])  # just the group's sub-DataFrame
for g in ss:
    print(g)     # the full (key, sub-DataFrame) tuple
  key1 key2     data1     data2
0    a  one  1.084443 -0.806561
1    a  two -0.191386  2.743989
  key1 key2     data1     data2
2    b  one  0.302269 -0.560596
3    b  two -0.213348  0.962007
  key1 key2     data1     data2
4    c  one -0.893631 -0.193283
('a',   key1 key2     data1     data2
0    a  one  1.084443 -0.806561
1    a  two -0.191386  2.743989)
('b',   key1 key2     data1     data2
2    b  one  0.302269 -0.560596
3    b  two -0.213348  0.962007)
('c',   key1 key2     data1     data2
4    c  one -0.893631 -0.193283)
df['data2'].groupby(df['key1']).sum()
key1
a   -2.167849
b   -1.954537
Name: data2, dtype: float64
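The same groupby extends to several columns and mixed aggregations; a quick sketch on the same toy frame:

# Sum both data columns per key in one call...
print(df.groupby('key1')[['data1', 'data2']].sum())
# ...or mix aggregations per column with agg().
print(df.groupby('key1').agg({'data1': 'mean', 'data2': 'sum'}))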
import matplotlib.cm as cm

plt.figure(figsize=(20, 10))

def scatter_plot_by_category(feat):
    # one colour per category of `feat`, plotted over the court coordinates
    alpha = 0.1
    gs = raw.groupby(feat)
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))
    for g, c in zip(gs, cs):
        plt.scatter(g[1].loc_x, g[1].loc_y, color=c, alpha=alpha)
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')
plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')
plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')
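The model-selection cells below use train_kobe and train_label, which are never constructed in the cells shown. A plausible reconstruction, assuming the standard Kobe Bryant shot-selection dataset where shot_made_flag is the label (NaN for the rows to be predicted); the feature subset here is my own guess:

# Hypothetical reconstruction of train_kobe / train_label: keep the labeled
# rows, pick a few numeric features, split off the label column.
labeled = raw[raw['shot_made_flag'].notnull()].copy()
feature_cols = ['loc_x', 'loc_y', 'minutes_remaining', 'seconds_remaining',
                'shot_distance', 'period']  # assumed feature subset
train_kobe = labeled[feature_cols]
train_label = labeled['shot_made_flag']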
import time
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.linspace(1, 100, num=10).astype(int)
for n in range_n:
    print('the number of trees: {0}'.format(n))
    t1 = time.time()
    rfc_score = 0.  # mean fold log loss for this n
    rfc = RandomForestClassifier(n_estimators=n)
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        pred = rfc.predict(train_kobe.iloc[test_k])
        # note: log_loss on hard 0/1 predictions; predict_proba would give a
        # smoother estimate (see the sketch after this search)
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_n.append(rfc_score)
    print(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
print(best_n, min_score)
Finding best n_estimators for RandomForestClassifier...
the number of trees: 1
13.919386979412682
Done processing 1 trees (0.726sec)
the number of trees: 12
13.389844271562911
Done processing 12 trees (6.008sec)
the number of trees: 23
13.180159610972076
Done processing 23 trees (11.043sec)
the number of trees: 34
13.219150771746676
Done processing 34 trees (18.443sec)
the number of trees: 45
13.043065195199537
Done processing 45 trees (22.152sec)
the number of trees: 56
13.057856481381911
Done processing 56 trees (27.636sec)
the number of trees: 67
13.055167436843206
Done processing 67 trees (32.039sec)
the number of trees: 78
13.051153915112355
Done processing 78 trees (38.372sec)
the number of trees: 89
13.048438999530031
Done processing 89 trees (44.605sec)
the number of trees: 100
13.114318256666222
Done processing 100 trees (49.886sec)
45 13.043065195199537
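As flagged in the loop above, feeding hard 0/1 predictions to log_loss gives a coarse (and large) loss; the usual approach is to score predicted probabilities. A sketch of the probability-based equivalent with cross_val_score, assuming the same train_kobe / train_label:

from sklearn.model_selection import cross_val_score

# 10-fold CV log loss on predicted probabilities; scikit-learn returns the
# negated loss because its scorers follow a greater-is-better convention.
rfc = RandomForestClassifier(n_estimators=best_n)
neg_ll = cross_val_score(rfc, train_kobe, train_label, cv=10,
                         scoring='neg_log_loss')
print('mean CV log loss: {0:.4f}'.format(-neg_ll.mean()))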
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()
    rfc_score = 0.  # mean fold log loss for this depth
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    for train_k, test_k in KFold(n_splits=10, shuffle=True).split(train_kobe):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        pred = rfc.predict(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    print('Done processing max_depth = {0} ({1:.3f}sec)'.format(m, t2 - t1))
print(best_m, min_score)
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing max_depth = 1 (2.389sec)
the max depth : 10
Done processing max_depth = 10 (6.044sec)
the max depth : 100
Done processing max_depth = 100 (21.773sec)
10 11.06186210729279
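The two manual sweeps above tune n_estimators and max_depth one at a time; GridSearchCV performs the same cross-validated search jointly in a few lines. A sketch over the same grids:

from sklearn.model_selection import GridSearchCV

# Joint 10-fold CV sweep over both hyperparameters, selected by log loss.
param_grid = {'n_estimators': np.linspace(1, 100, num=10).astype(int),
              'max_depth': np.logspace(0, 2, num=3).astype(int)}
grid = GridSearchCV(RandomForestClassifier(), param_grid,
                    cv=10, scoring='neg_log_loss')
grid.fit(train_kobe, train_label)
print(grid.best_params_, -grid.best_score_)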
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train_kobe, train_label)
pred = model.predict(train_kobe)
count = 0
for i, j in zip(train_label, pred):  # accuracy on the training set itself (optimistic)
    if i == j:
        count += 1
print("acc is {0}".format(count / len(train_label)))
acc is 0.6850215978518893
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_kobe, train_label, test_size=0.3, random_state=0)
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(X_train, y_train)
pre = model.predict(X_test)
count = 0
for i, j in zip(y_test, pre):  # compare against the held-out predictions, not the training ones
    if i == j:
        count += 1
print("acc is {0}".format(count / len(pre)))