import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
加载数据并创建一些有用的字段,
将新创建的字段显示为完整性检查
#%% load training data
# Read the raw shot log (Kaggle "Kobe Bryant Shot Selection" data.csv).
# NOTE(review): hard-coded absolute Windows path — runs only on this machine.
allData = pd.read_csv('C:/Users/admin/Desktop/kaggle/科比/数据/data.csv')
# Show the first rows as a sanity check of the parsed columns.
allData.head()
action_type | combined_shot_type | game_event_id | game_id | lat | loc_x | loc_y | lon | minutes_remaining | period | ... | shot_type | shot_zone_area | shot_zone_basic | shot_zone_range | team_id | team_name | game_date | matchup | opponent | shot_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Jump Shot | Jump Shot | 10 | 20000012 | 33.9723 | 167 | 72 | -118.1028 | 10 | 1 | ... | 2PT Field Goal | Right Side(R) | Mid-Range | 16-24 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 1 |
1 | Jump Shot | Jump Shot | 12 | 20000012 | 34.0443 | -157 | 0 | -118.4268 | 10 | 1 | ... | 2PT Field Goal | Left Side(L) | Mid-Range | 8-16 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 2 |
2 | Jump Shot | Jump Shot | 35 | 20000012 | 33.9093 | -101 | 135 | -118.3708 | 7 | 1 | ... | 2PT Field Goal | Left Side Center(LC) | Mid-Range | 16-24 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 3 |
3 | Jump Shot | Jump Shot | 43 | 20000012 | 33.8693 | 138 | 175 | -118.1318 | 6 | 1 | ... | 2PT Field Goal | Right Side Center(RC) | Mid-Range | 16-24 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 4 |
4 | Driving Dunk Shot | Dunk | 155 | 20000012 | 34.0443 | 0 | 0 | -118.2698 | 6 | 2 | ... | 2PT Field Goal | Center(C) | Restricted Area | Less Than 8 ft. | 1610612747 | Los Angeles Lakers | 2000-10-31 | LAL @ POR | POR | 5 |
5 rows × 25 columns
# shape: (rows, columns) of the raw table
allData.shape
(30697, 25)
# more info on the data: per-column dtypes, non-null counts, memory usage
# (shot_made_flag has missing values — the rows Kaggle asks us to predict)
allData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 25 columns):
action_type 30697 non-null object
combined_shot_type 30697 non-null object
game_event_id 30697 non-null int64
game_id 30697 non-null int64
lat 30697 non-null float64
loc_x 30697 non-null int64
loc_y 30697 non-null int64
lon 30697 non-null float64
minutes_remaining 30697 non-null int64
period 30697 non-null int64
playoffs 30697 non-null int64
season 30697 non-null object
seconds_remaining 30697 non-null int64
shot_distance 30697 non-null int64
shot_made_flag 25697 non-null float64
shot_type 30697 non-null object
shot_zone_area 30697 non-null object
shot_zone_basic 30697 non-null object
shot_zone_range 30697 non-null object
team_id 30697 non-null int64
team_name 30697 non-null object
game_date 30697 non-null object
matchup 30697 non-null object
opponent 30697 non-null object
shot_id 30697 non-null int64
dtypes: float64(3), int64(11), object(11)
memory usage: 4.6+ MB
# Keep only the rows whose label (shot_made_flag) is not missing.
# .copy() materialises the subset: without it, later column assignments such
# as data['remain_time'] = ... operate on a view of allData and raise
# pandas' SettingWithCopyWarning (and may silently fail to write).
data = allData[allData['shot_made_flag'].notnull()].copy()
通过对数据的分析,发现特征既有科比投篮的位置坐标loc_x,loc_y又有经度lat,纬度lon,猜测这两组特征重复,我们就来对比一下:
# Compare the two candidate location encodings side by side:
# court coordinates (loc_x, loc_y) vs geographic coordinates (lon, lat).
plt.figure(figsize=(10, 10))
panels = [
    (data.loc_x, data.loc_y, 'g', 'loc_x and loc_y'),
    (data.lon, data.lat, 'b', 'lat and lon'),
]
for pos, (xs, ys, colour, label) in enumerate(panels, start=1):
    plt.subplot(1, 2, pos)
    # alpha = 0.05 keeps the densely overlapping shot locations readable
    plt.scatter(xs, ys, color=colour, alpha=0.05)
    plt.title(label)
通过比较,科比投篮的位置和经纬度这两组特征是类似的,保留一组特征即可。
又发现原始特征中既有分钟又有秒,所以可以把这两组特征进行合并:
# Fold minutes_remaining and seconds_remaining into one total-seconds feature.
data['remain_time'] = 60 * data['minutes_remaining'] + data['seconds_remaining']
# Attribute access on a DataFrame column returns a pandas Series.
type(data.loc_x)
pandas.core.series.Series
# Same check via item access — also a pandas Series (the two forms are equivalent).
type(data['loc_x'])
pandas.core.series.Series
# Distinct values of shot_type — only 2-point and 3-point field goals appear.
print(data.shot_type.unique())
# Frequency of each value: 20285 two-point vs 5412 three-point attempts.
print(data.shot_type.value_counts())
['2PT Field Goal' '3PT Field Goal']
2PT Field Goal 20285
3PT Field Goal 5412
Name: shot_type, dtype: int64
又发现 'season' 这个特征的取值是用 - 相连的字符串（如 2000-01），模型无法直接使用，所以对其进行分割：
# Peek at the raw 'season' strings (e.g. '2000-01').
data['season']
1 2000-01
2 2000-01
3 2000-01
4 2000-01
5 2000-01
...
30691 1999-00
30692 1999-00
30694 1999-00
30695 1999-00
30696 1999-00
Name: season, Length: 25697, dtype: object
# All distinct season strings present in the labeled data.
data['season'].unique()
array(['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06',
'2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12',
'2012-13', '2013-14', '2014-15', '2015-16', '1996-97', '1997-98',
'1998-99', '1999-00'], dtype=object)
# Encode the season as an integer: split each string on '-' and keep the part
# after the dash (index 1 = right of the separator, 0 = left), e.g.
# '2000-01' -> 1.  Series.map applies the lambda element-wise.
# NOTE(review): this wraps around the century ('1999-00' -> 0, '1996-97' -> 97),
# so the encoded seasons are NOT monotonically ordered — confirm this is
# acceptable before feeding the feature to a model.
data['season'] = data['season'].map(lambda s: int(s.split('-')[1]))
data['season'].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 97,
98, 99, 0], dtype=int64)
再还有 'shot_zone_area'、'shot_zone_basic'、'shot_zone_range' 这三个特征是不是也类似，再来比较一下：
import matplotlib.cm as cm
# NOTE(review): this figure is superseded by the plt.figure call a few lines below.
plt.figure(figsize=(20,10))
# data.groupby(feature) partitions the rows by the categories of `feature`
# (the original comment's "groupyby" was a typo); used by the helper below.
def scatterbygroupby(feature, frame=None):
    """Scatter-plot shot locations (loc_x, loc_y), one colour per category of
    `feature`, on the current matplotlib axes.

    feature: column name to group by (e.g. 'shot_zone_basic').
    frame:   DataFrame to plot; defaults to the module-level `data`
             (backward compatible with the original one-argument call).
    """
    frame = data if frame is None else frame
    alpha = 0.1
    groups = frame.groupby(feature)
    # One rainbow colour per group, evenly spaced over [0, 1].
    colours = cm.rainbow(np.linspace(0, 1, len(groups)))
    # Iterating a GroupBy yields (group_key, sub-DataFrame) pairs — that is why
    # the original code indexed g[1]: it is the rows belonging to one group.
    for (key, grp), colour in zip(groups, colours):
        plt.scatter(grp.loc_x, grp.loc_y, color=colour, alpha=alpha)
# One 20x9 canvas with three panels, one per zone feature, each coloured
# by that feature's categories.
plt.figure(figsize = (20,9))
zone_features = ['shot_zone_basic', 'shot_zone_range', 'shot_zone_area']
for idx, zone_feature in enumerate(zone_features, start=1):
    plt.subplot(1, 3, idx)
    scatterbygroupby(zone_feature)
    plt.title(zone_feature)
可以看到这三组特征也类似,保留一组即可。下面把没用的或类似的或重复的特征从原始数据中去掉:
# Columns to remove: identifiers with no predictive value (shot_id, game_id,
# game_event_id, game_date, matchup), apparently constant columns (team_id /
# team_name — see the head() output above), duplicate location encodings
# (lon/lat ~ loc_x/loc_y, the three shot_zone_* columns, shot_distance), and
# the minute/second fields already merged into remain_time.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic', \
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining', \
         'shot_distance', 'game_event_id', 'game_id', 'game_date']
# Single vectorised drop. The original per-column loop used the positional
# axis argument (data.drop(col, 1)), which was deprecated in pandas 1.0 and
# removed in pandas 2.0 (it now raises TypeError).
data = data.drop(columns=drops)
data.head()
action_type | combined_shot_type | loc_x | loc_y | period | playoffs | season | shot_made_flag | shot_type | opponent | remain_time | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | Jump Shot | Jump Shot | -157 | 0 | 1 | 0 | 1 | 0.0 | 2PT Field Goal | POR | 622 |
2 | Jump Shot | Jump Shot | -101 | 135 | 1 | 0 | 1 | 1.0 | 2PT Field Goal | POR | 465 |
3 | Jump Shot | Jump Shot | 138 | 175 | 1 | 0 | 1 | 0.0 | 2PT Field Goal | POR | 412 |
4 | Driving Dunk Shot | Dunk | 0 | 0 | 2 | 0 | 1 | 1.0 | 2PT Field Goal | POR | 379 |
5 | Jump Shot | Jump Shot | -145 | -11 | 3 | 0 | 1 | 0.0 | 2PT Field Goal | POR | 572 |
发现前两个特征里都是英文,是计算机所不认识的,这里采取一种经典的编码方式,one-hot编码,one-hot编码即将特征中的多个属性当作多个新的特征,并且每个数据在这多个特征中只有一个为1,其余都为0:
# The fine-grained shot action types present in the data (many categories).
print(data['action_type'].unique())
['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot'
'Running Hook Shot' 'Alley Oop Dunk Shot' 'Dunk Shot'
'Alley Oop Layup shot' 'Running Dunk Shot' 'Driving Finger Roll Shot'
'Running Layup Shot' 'Finger Roll Shot' 'Fadeaway Jump Shot'
'Follow Up Dunk Shot' 'Hook Shot' 'Turnaround Hook Shot' 'Jump Hook Shot'
'Running Finger Roll Shot' 'Jump Bank Shot' 'Turnaround Finger Roll Shot'
'Hook Bank Shot' 'Driving Hook Shot' 'Running Tip Shot'
'Running Reverse Layup Shot' 'Driving Finger Roll Layup Shot'
'Fadeaway Bank shot' 'Pullup Jump shot' 'Finger Roll Layup Shot'
'Turnaround Fadeaway shot' 'Driving Reverse Layup Shot'
'Driving Slam Dunk Shot' 'Step Back Jump shot' 'Turnaround Bank shot'
'Reverse Slam Dunk Shot' 'Floating Jump shot' 'Putback Slam Dunk Shot'
'Running Bank shot' 'Driving Bank shot' 'Driving Jump shot'
'Putback Layup Shot' 'Putback Dunk Shot' 'Running Finger Roll Layup Shot'
'Pullup Bank shot' 'Running Slam Dunk Shot' 'Cutting Layup Shot'
'Driving Floating Jump Shot' 'Running Pull-Up Jump Shot' 'Tip Layup Shot'
'Driving Floating Bank Jump Shot']
# The coarser shot-type grouping (6 categories in the output below).
print(data['combined_shot_type'].unique())
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
# Shot value type: 2-point vs 3-point field goal.
print(data['shot_type'].unique())
['2PT Field Goal' '3PT Field Goal']
# Opponent team abbreviations.
print(data['opponent'].unique())
['POR' 'UTA' 'VAN' 'LAC' 'HOU' 'SAS' 'DEN' 'SAC' 'CHI' 'GSW' 'MIN' 'IND'
'SEA' 'DAL' 'PHI' 'DET' 'MIL' 'TOR' 'MIA' 'PHX' 'CLE' 'NJN' 'NYK' 'CHA'
'WAS' 'ORL' 'ATL' 'MEM' 'BOS' 'NOH' 'NOP' 'OKC' 'BKN']
# One-hot encode the remaining string-valued (categorical) features.
a = ['action_type', 'combined_shot_type', 'shot_type', 'opponent']
# pd.get_dummies with `columns=` replaces each listed column by its indicator
# columns in one call; the default prefix is the source column name, which
# matches the original per-column prefix=i behaviour.  This also avoids the
# positional axis argument used by the original loop (pd.concat([...], 1) /
# data.drop(i, 1)), which was removed in pandas 2.0 and now raises TypeError.
data = pd.get_dummies(data, columns=a)
data.head()
loc_x | loc_y | period | playoffs | season | shot_made_flag | remain_time | action_type_Alley Oop Dunk Shot | action_type_Alley Oop Layup shot | action_type_Cutting Layup Shot | ... | opponent_PHI | opponent_PHX | opponent_POR | opponent_SAC | opponent_SAS | opponent_SEA | opponent_TOR | opponent_UTA | opponent_VAN | opponent_WAS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | -157 | 0 | 1 | 0 | 1 | 0.0 | 622 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | -101 | 135 | 1 | 0 | 1 | 1.0 | 465 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 138 | 175 | 1 | 0 | 1 | 0.0 | 412 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 2 | 0 | 1 | 1.0 | 379 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | -145 | -11 | 3 | 0 | 1 | 0.0 | 572 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 103 columns
发现列数明显增加了很多,新加的列数即把那些计算机不认识的属性都当作特征处理。直到这里,数据才清洗完毕。
三、构造训练集、训练标签,测试集、测试标签
# Training features: everything except the label column.  Use the keyword
# form — the positional axis argument (drop('shot_made_flag', 1)) was removed
# in pandas 2.0.
train_data = data.drop(columns='shot_made_flag')
# Training labels.
train_label = data['shot_made_flag']
# NOTE(review): rows with a missing label were already filtered out above, so
# this "test set" is necessarily empty here — it is kept only to illustrate
# the construction.  For a real evaluation, split the labeled data instead,
# e.g. with sklearn.model_selection.train_test_split.
test_data = data[pd.isnull(data['shot_made_flag'])]
test_data = test_data.drop(columns='shot_made_flag')
四、选择模型
这里选择集成算法中的随机森林,首先选择用多少棵树:
# Use a random-forest classifier for prediction; first search for the best
# number of trees (next cell builds the candidate grid).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix,accuracy_score,log_loss
import time
# Candidate forest sizes: three powers of two, 2**3 .. 2**5 -> [8, 16, 32].
# (Equivalent to np.logspace(3, 5, 3, base=2).astype(int): logspace raises the
# base to each of the 3 evenly spaced exponents between 3 and 5; the default
# base is 10, so base=2 must be passed explicitly.)
range_n = (2 ** np.arange(3, 6)).astype(int)
range_n
array([ 8, 16, 32])