import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
加载数据并创建一些有用的字段,
将新创建的字段显示为完整性检查
#%% load training data
# Read the raw shot log (Kaggle "Kobe Bryant Shot Selection" data.csv).
# NOTE(review): hard-coded absolute Windows path — runs only on this machine.
allData = pd.read_csv('C:/Users/admin/Desktop/kaggle/科比/数据/data.csv')
# Show the first rows as a sanity check of the parsed columns.
allData.head()
action_type | combined_shot_type | game_event_id | game_id | lat | loc_x | loc_y | lon | minutes_remaining | period | ... | shot_type | shot_zone_area | shot_zone_basic | shot_zone_range | team_id | team_name | game_date | matchup | opponent | shot_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Jump Shot | Jump Shot | 10 | 20000012 | 33.9723 | 167 | 72 | -118.1028 | 10 | 1 | ... | 2PT Field Goal | Right Side(R) | Mid-Range | 16-24 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 1 |
1 | Jump Shot | Jump Shot | 12 | 20000012 | 34.0443 | -157 | 0 | -118.4268 | 10 | 1 | ... | 2PT Field Goal | Left Side(L) | Mid-Range | 8-16 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 2 |
2 | Jump Shot | Jump Shot | 35 | 20000012 | 33.9093 | -101 | 135 | -118.3708 | 7 | 1 | ... | 2PT Field Goal | Left Side Center(LC) | Mid-Range | 16-24 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 3 |
3 | Jump Shot | Jump Shot | 43 | 20000012 | 33.8693 | 138 | 175 | -118.1318 | 6 | 1 | ... | 2PT Field Goal | Right Side Center(RC) | Mid-Range | 16-24 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 4 |
4 | Driving Dunk Shot | Dunk | 155 | 20000012 | 34.0443 | 0 | 0 | -118.2698 | 6 | 2 | ... | 2PT Field Goal | Center(C) | Restricted Area | Less Than 8 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 5 |
5 rows × 25 columns
# shape: (rows, columns) of the raw table
allData.shape
(30697, 25)
# more info on the data: per-column dtypes, non-null counts, memory usage
# (shot_made_flag has missing values — the rows Kaggle asks us to predict)
allData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 25 columns):
action_type 30697 non-null object
combined_shot_type 30697 non-null object
game_event_id 30697 non-null int64
game_id 30697 non-null int64
lat 30697 non-null float64
loc_x 30697 non-null int64
loc_y 30697 non-null int64
lon 30697 non-null float64
minutes_remaining 30697 non-null int64
period 30697 non-null int64
playoffs 30697 non-null int64
season 30697 non-null object
seconds_remaining 30697 non-null int64
shot_distance 30697 non-null int64
shot_made_flag 25697 non-null float64
shot_type 30697 non-null object
shot_zone_area 30697 non-null object
shot_zone_basic 30697 non-null object
shot_zone_range 30697 non-null object
team_id 30697 non-null int64
team_name 30697 non-null object
game_date 30697 non-null object
matchup 30697 non-null object
opponent 30697 non-null object
shot_id 30697 non-null int64
dtypes: float64(3), int64(11), object(11)
memory usage: 4.6+ MB
# Keep only the rows whose label (shot_made_flag) is not missing.
# .copy() materialises the subset: without it, later column assignments such
# as data['remain_time'] = ... operate on a view of allData and raise
# pandas' SettingWithCopyWarning (and may silently fail to write).
data = allData[allData['shot_made_flag'].notnull()].copy()
通过对数据的分析,发现特征既有科比投篮的位置坐标loc_x,loc_y又有经度lat,纬度lon,猜测这两组特征重复,我们就来对比一下:
# Compare the two candidate location encodings side by side:
# court coordinates (loc_x, loc_y) vs geographic coordinates (lon, lat).
plt.figure(figsize=(10, 10))
panels = [
    (data.loc_x, data.loc_y, 'g', 'loc_x and loc_y'),
    (data.lon, data.lat, 'b', 'lat and lon'),
]
for pos, (xs, ys, colour, label) in enumerate(panels, start=1):
    plt.subplot(1, 2, pos)
    # alpha = 0.05 keeps the densely overlapping shot locations readable
    plt.scatter(xs, ys, color=colour, alpha=0.05)
    plt.title(label)
通过比较,科比投篮的位置和经纬度这两组特征是类似的,保留一组特征即可。
又发现原始特征中既有分钟又有秒,所以可以把这两组特征进行合并:
# Fold minutes_remaining and seconds_remaining into one total-seconds feature.
data['remain_time'] = 60 * data['minutes_remaining'] + data['seconds_remaining']
# Attribute access on a DataFrame column returns a pandas Series.
type(data.loc_x)
pandas.core.series.Series
# Same check via item access — also a pandas Series (the two forms are equivalent).
type(data['loc_x'])
pandas.core.series.Series
# Distinct values of shot_type — only 2-point and 3-point field goals appear.
print(data.shot_type.unique())
# Frequency of each value: 20285 two-point vs 5412 three-point attempts.
print(data.shot_type.value_counts())
['2PT Field Goal' '3PT Field Goal']
2PT Field Goal 20285
3PT Field Goal 5412
Name: shot_type, dtype: int64
又发现 'season' 这个特征的取值是用 - 相连的字符串（如 2000-01），模型无法直接使用，所以对其进行分割：
# Peek at the raw 'season' strings (e.g. '2000-01').
data['season']
1 2000-01
2 2000-01
3 2000-01
4 2000-01
5 2000-01
...
30691 1999-00
30692 1999-00
30694 1999-00
30695 1999-00
30696 1999-00
Name: season, Length: 25697, dtype: object
# All distinct season strings present in the labeled data.
data['season'].unique()
array(['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06',
'2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12',
'2012-13', '2013-14', '2014-15', '2015-16', '1996-97', '1997-98',
'1998-99', '1999-00'], dtype=object)
# Encode the season as an integer: split each string on '-' and keep the part
# after the dash (index 1 = right of the separator, 0 = left), e.g.
# '2000-01' -> 1.  Series.map applies the lambda element-wise.
# NOTE(review): this wraps around the century ('1999-00' -> 0, '1996-97' -> 97),
# so the encoded seasons are NOT monotonically ordered — confirm this is
# acceptable before feeding the feature to a model.
data['season'] = data['season'].map(lambda s: int(s.split('-')[1]))
data['season'].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 97,
98, 99, 0], dtype=int64)
再还有 'shot_zone_area'、'shot_zone_basic'、'shot_zone_range' 这三个特征是不是也类似，再来比较一下：
import matplotlib.cm as cm
# NOTE(review): this figure is superseded by the plt.figure call a few lines below.
plt.figure(figsize=(20,10))
# data.groupby(feature) partitions the rows by the categories of `feature`
# (the original comment's "groupyby" was a typo); used by the helper below.
def scatterbygroupby(feature, frame=None):
    """Scatter-plot shot locations (loc_x, loc_y), one colour per category of
    `feature`, on the current matplotlib axes.

    feature: column name to group by (e.g. 'shot_zone_basic').
    frame:   DataFrame to plot; defaults to the module-level `data`
             (backward compatible with the original one-argument call).
    """
    frame = data if frame is None else frame
    alpha = 0.1
    groups = frame.groupby(feature)
    # One rainbow colour per group, evenly spaced over [0, 1].
    colours = cm.rainbow(np.linspace(0, 1, len(groups)))
    # Iterating a GroupBy yields (group_key, sub-DataFrame) pairs — that is why
    # the original code indexed g[1]: it is the rows belonging to one group.
    for (key, grp), colour in zip(groups, colours):
        plt.scatter(grp.loc_x, grp.loc_y, color=colour, alpha=alpha)
# One 20x9 canvas with three panels, one per zone feature, each coloured
# by that feature's categories.
plt.figure(figsize = (20,9))
zone_features = ['shot_zone_basic', 'shot_zone_range', 'shot_zone_area']
for idx, zone_feature in enumerate(zone_features, start=1):
    plt.subplot(1, 3, idx)
    scatterbygroupby(zone_feature)
    plt.title(zone_feature)
可以看到这三组特征也类似,保留一组即可。下面把没用的或类似的或重复的特征从原始数据中去掉:
# Columns to remove: identifiers with no predictive value (shot_id, game_id,
# game_event_id, game_date, matchup), apparently constant columns (team_id /
# team_name — see the head() output above), duplicate location encodings
# (lon/lat ~ loc_x/loc_y, the three shot_zone_* columns, shot_distance), and
# the minute/second fields already merged into remain_time.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic', \
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining', \
         'shot_distance', 'game_event_id', 'game_id', 'game_date']
# Single vectorised drop. The original per-column loop used the positional
# axis argument (data.drop(col, 1)), which was deprecated in pandas 1.0 and
# removed in pandas 2.0 (it now raises TypeError).
data = data.drop(columns=drops)
data.head()
action_type | combined_shot_type | loc_x | loc_y | period | playoffs | season | shot_made_flag | shot_type | opponent | remain_time | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | Jump Shot | Jump Shot | -157 | 0 | 1 | 0 | 1 | 0.0 | 2PT Field Goal | POR | 622 |
2 | Jump Shot | Jump Shot | -101 | 135 | 1 | 0 | 1 | 1.0 | 2PT Field Goal | POR | 465 |
3 | Jump Shot | Jump Shot | 138 | 175 | 1 | 0 | 1 | 0.0 | 2PT Field Goal | POR | 412 |
4 | Driving Dunk Shot | Dunk | 0 | 0 | 2 | 0 | 1 | 1.0 | 2PT Field Goal | POR | 379 |
5 | Jump Shot | Jump Shot | -145 | -11 | 3 | 0 | 1 | 0.0 | 2PT Field Goal | POR | 572 |
发现前两个特征里都是英文,是计算机所不认识的,这里采取一种经典的编码方式,one-hot编码,one-hot编码即将特征中的多个属性当作多个新的特征,并且每个数据在这多个特征中只有一个为1,其余都为0:
# The fine-grained shot action types present in the data (many categories).
print(data['action_type'].unique())
['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot'
'Running Hook Shot' 'Alley Oop Dunk Shot' 'Dunk Shot'
'Alley Oop Layup shot' 'Running Dunk Shot' 'Driving Finger Roll Shot'
'Running Layup Shot' 'Finger Roll Shot' 'Fadeaway Jump Shot'
'Follow Up Dunk Shot' 'Hook Shot' 'Turnaround Hook Shot' 'Jump Hook Shot'
'Running Finger Roll Shot' 'Jump Bank Shot' 'Turnaround Finger Roll Shot'
'Hook Bank Shot' 'Driving Hook Shot' 'Running Tip Shot'
'Running Reverse Layup Shot' 'Driving Finger Roll Layup Shot'
'Fadeaway Bank shot' 'Pullup Jump shot' 'Finger Roll Layup Shot'
'Turnaround Fadeaway shot' 'Driving Reverse Layup Shot'
'Driving Slam Dunk Shot' 'Step Back Jump shot' 'Turnaround Bank shot'
'Reverse Slam Dunk Shot' 'Floating Jump shot' 'Putback Slam Dunk Shot'
'Running Bank shot' 'Driving Bank shot' 'Driving Jump shot'
'Putback Layup Shot' 'Putback Dunk Shot' 'Running Finger Roll Layup Shot'
'Pullup Bank shot' 'Running Slam Dunk Shot' 'Cutting Layup Shot'
'Driving Floating Jump Shot' 'Running Pull-Up Jump Shot' 'Tip Layup Shot'
'Driving Floating Bank Jump Shot']
# The coarser shot-type grouping (6 categories in the output below).
print(data['combined_shot_type'].unique())
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
# Shot value type: 2-point vs 3-point field goal.
print(data['shot_type'].unique())
['2PT Field Goal' '3PT Field Goal']
# Opponent team abbreviations.
print(data['opponent'].unique())
['POR' 'UTA' 'VAN' 'LAC' 'HOU' 'SAS' 'DEN' 'SAC' 'CHI' 'GSW' 'MIN' 'IND'
'SEA' 'DAL' 'PHI' 'DET' 'MIL' 'TOR' 'MIA' 'PHX' 'CLE' 'NJN' 'NYK' 'CHA'
'WAS' 'ORL' 'ATL' 'MEM' 'BOS' 'NOH' 'NOP' 'OKC' 'BKN']
# One-hot encode the remaining string-valued (categorical) features.
a = ['action_type', 'combined_shot_type', 'shot_type', 'opponent']
# pd.get_dummies with `columns=` replaces each listed column by its indicator
# columns in one call; the default prefix is the source column name, which
# matches the original per-column prefix=i behaviour.  This also avoids the
# positional axis argument used by the original loop (pd.concat([...], 1) /
# data.drop(i, 1)), which was removed in pandas 2.0 and now raises TypeError.
data = pd.get_dummies(data, columns=a)
data.head()
loc_x | loc_y | period | playoffs | season | shot_made_flag | remain_time | action_type_Alley Oop Dunk Shot | action_type_Alley Oop Layup shot | action_type_Cutting Layup Shot | ... | opponent_PHI | opponent_PHX | opponent_POR | opponent_SAC | opponent_SAS | opponent_SEA | opponent_TOR | opponent_UTA | opponent_VAN | opponent_WAS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | -157 | 0 | 1 | 0 | 1 | 0.0 | 622 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | -101 | 135 | 1 | 0 | 1 | 1.0 | 465 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 138 | 175 | 1 | 0 | 1 | 0.0 | 412 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 2 | 0 | 1 | 1.0 | 379 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | -145 | -11 | 3 | 0 | 1 | 0.0 | 572 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 103 columns
发现列数明显增加了很多,新加的列数即把那些计算机不认识的属性都当作特征处理。直到这里,数据才清洗完毕。
三、构造训练集、训练标签,测试集、测试标签
# Training features: everything except the label column.  Use the keyword
# form — the positional axis argument (drop('shot_made_flag', 1)) was removed
# in pandas 2.0.
train_data = data.drop(columns='shot_made_flag')
# Training labels.
train_label = data['shot_made_flag']
# NOTE(review): rows with a missing label were already filtered out above, so
# this "test set" is necessarily empty here — it is kept only to illustrate
# the construction.  For a real evaluation, split the labeled data instead,
# e.g. with sklearn.model_selection.train_test_split.
test_data = data[pd.isnull(data['shot_made_flag'])]
test_data = test_data.drop(columns='shot_made_flag')
四、选择模型
这里选择集成算法中的随机森林,首先选择用多少棵树:
# Use a random-forest classifier for prediction; first search for the best
# number of trees (next cell builds the candidate grid).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix,accuracy_score,log_loss
import time
# Candidate forest sizes: three powers of two, 2**3 .. 2**5 -> [8, 16, 32].
# (Equivalent to np.logspace(3, 5, 3, base=2).astype(int): logspace raises the
# base to each of the 3 evenly spaced exponents between 3 and 5; the default
# base is 10, so base=2 must be passed explicitly.)
range_n = (2 ** np.arange(3, 6)).astype(int)
range_n
array([ 8, 16, 32])