2021-03-24

Kobe shot prediction - Tang Yudi course notes 01

Reading the data

(1) Import the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

(2) Load the data

#import data
filename="C:\\Users\\admin\\Desktop\\Kobe\\data.csv"
raw=pd.read_csv(filename)
print (raw.shape)
raw.head()
# raw.tail()

Output: (screenshot omitted)

(3) Remove rows with missing values

# 5000 for test: the 5000 rows whose shot_made_flag is missing are treated as the test set
kobe=raw[pd.notnull(raw['shot_made_flag'])]
print(kobe.shape)

Output: (screenshot omitted)
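
As a quick sanity check (my addition, not part of the course), the rows with a missing shot_made_flag should be exactly the 5000 shots to be predicted:

print(raw['shot_made_flag'].isnull().sum())   # expected: 5000 missing labels
print(kobe.shape[0] == raw.shape[0] - 5000)   # the remaining rows are the labeled ones
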
(4) Plotting

#plt.subplot(211): the first digit is the number of rows, the second the number of columns
alpha=0.02  # transparency of the plotted points
plt.figure(figsize=(10,10))

#loc_x and loc_y
plt.subplot(121)
plt.scatter(kobe.loc_x,kobe.loc_y,color='blue',alpha=alpha)
plt.title('loc_x and loc_y')

#lat and lon
plt.subplot(122)
plt.scatter(kobe.lon,kobe.lat,color='green',alpha=alpha)
plt.title('lat and lon')

(Figure omitted: scatter plots of loc_x/loc_y and lon/lat)
(5) Polar-coordinate representation

raw['dist']=np.sqrt(raw['loc_x']**2+raw['loc_y']**2)  # Pythagorean theorem

loc_x_zero=raw['loc_x']==0
#print(loc_x_zero)
raw['angle']=np.array([0]*len(raw))
# use .loc for the assignment to avoid pandas' chained-indexing warning
raw.loc[~loc_x_zero,'angle']=np.arctan(raw['loc_y'][~loc_x_zero]/raw['loc_x'][~loc_x_zero])
raw.loc[loc_x_zero,'angle']=np.pi/2

Output: (screenshot omitted)
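
An alternative sketch (not the course code) uses np.arctan2, which needs no special case for loc_x == 0; note that it is quadrant-aware and returns angles in (-pi, pi], so its values differ from the arctan version above for shots with negative loc_x. The column name angle2 is just illustrative:

raw['angle2'] = np.arctan2(raw['loc_y'], raw['loc_x'])  # quadrant-aware angle, no zero-division special case
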

(6) Convert the remaining time (minutes + seconds) into seconds

raw['remaining_time']=raw['minutes_remaining']*60+raw['seconds_remaining']
print(kobe.action_type.unique())        # print the unique values of a column
print(kobe.combined_shot_type.unique())
print(kobe.shot_type.unique())
print(kobe.shot_type.value_counts())    # count how many times each value occurs

Output: (screenshot omitted)
(7) Extract the seasons Kobe played in

kobe['season'].unique()

Output: (screenshot omitted)
(8) Extract the last two digits of each season

raw['season']=raw['season'].apply(lambda x: int(x.split('-')[1]))
raw['season'].unique()

Output: (screenshot omitted)
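
To see what the lambda is doing, the season column holds strings such as '2000-01' (an illustrative value of the format used in this dataset); splitting on '-' and taking the second piece gives the integer 1:

example = '2000-01'                 # illustrative season string
print(int(example.split('-')[1]))   # -> 1
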
(9) Print Kobe's team information

print(kobe['team_id'].unique())
print(kobe['team_name'].unique())

Output: (screenshot omitted)
(10) Print Kobe's opponent information

pd.DataFrame({'matchup':kobe.matchup,'opponent':kobe.opponent})

Output: (screenshot omitted)
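
matchup carries essentially the same team information as opponent, which is one reason matchup is dropped later. The one extra signal in it is home vs. away; assuming the strings follow the usual 'LAL @ POR' / 'LAL vs. POR' format, a home-game flag could be derived like this (a sketch, not part of the course):

raw['home_game'] = (~raw['matchup'].str.contains('@')).astype(int)  # 1 = home ('vs.'), 0 = away ('@')
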
(11) Plot the computed dist against shot_distance

plt.figure(figsize=(5,5))
plt.scatter(raw.dist,raw.shot_distance,color='blue')
plt.title('dist and shot_distance')

Output: (screenshot omitted)
(12) Count shots by shot_zone_area

gs = kobe.groupby('shot_zone_area')
print (kobe['shot_zone_area'].value_counts())
print (len(gs))

Output: (screenshot omitted)
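For reference, each element of a pandas GroupBy is a (group name, sub-DataFrame) pair; the plotting function in the next step relies on exactly this. A minimal sketch of iterating the groups:

for name, group in gs:          # name is the shot_zone_area value, group the matching rows
    print(name, len(group))
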

(13) Plot shots colored by shot-zone category

import matplotlib.cm as cm
plt.figure(figsize=(20,10))
def scatter_plot_by_category(feat):
    alpha=0.1
    gs=kobe.groupby(feat)                     # one group per category value
    cs=cm.rainbow(np.linspace(0,1,len(gs)))   # one color per group
    for g,c in zip(gs,cs):                    # g is a (name, sub-DataFrame) pair
        plt.scatter(g[1].loc_x,g[1].loc_y,color=c,alpha=alpha)
    
#shot_zone_area
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')
#shot_zone_basic
plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')
#shot_zone_range
plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')

(Figure omitted: shots colored by shot_zone_area, shot_zone_basic, and shot_zone_range)
(14) Drop unneeded columns and preview one-hot encoding

drops=['shot_id','team_id','team_name','shot_zone_range','shot_zone_basic',
       'matchup','lon','lat','seconds_remaining','minutes_remaining',
       'shot_distance','loc_x','loc_y','game_event_id','game_id','game_date']
for drop in drops:
    raw=raw.drop(drop,axis=1)

print(raw['combined_shot_type'].value_counts())
pd.get_dummies(raw['combined_shot_type'],prefix='combined_shot_type')[0:2]

Output: (screenshot omitted)
(15) One-hot encode the categorical variables

categorical_vars=['action_type','combined_shot_type','shot_type','opponent','period','season']
for var in categorical_vars:
    raw=pd.concat([raw,pd.get_dummies(raw[var],prefix=var)],axis=1)
    raw=raw.drop(var,axis=1)
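
An equivalent, more compact alternative (a sketch, to be used instead of the loop above, not in addition to it): pd.get_dummies can encode a list of columns, prefix the dummies with the column names, and drop the originals in one call:

raw = pd.get_dummies(raw, columns=categorical_vars)  # encodes and drops the listed columns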

(16) Define the training set and the test set

train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)
#print(train_kobe)
#train_label = train_kobe['shot_made_flag']
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)
print(train_kobe)

Output: (screenshot omitted)
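A quick sanity check (my addition): the labeled and unlabeled rows together should account for every row of raw:

print(train_kobe.shape, train_label.shape, test_kobe.shape)
print(train_kobe.shape[0] + test_kobe.shape[0] == raw.shape[0])  # expect True
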

(17) Import the classifier, define the loss, and search for the best hyperparameters

from sklearn.ensemble import RandomForestClassifier    # random forest classifier
from sklearn.metrics import confusion_matrix,log_loss  # confusion matrix, log loss
import time
# find the best n_estimators for RandomForestClassifier
print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0,2,num=3).astype(int)
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()
    
    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    
    kf = KFold(n_splits=10, random_state=None, shuffle=True) ### ++
    
    #for train_k, test_k in KFold(len(train_kobe), n_folds=10, shuffle=True): ### --
    for train_k, test_k in kf.split(train_kobe):                             ### ++
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        #rfc_score += rfc.score(train.iloc[test_k], train_y.iloc[test_k])/10
        pred = rfc.predict(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
        
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2-t1))
print(best_n, min_score)


# find best max_depth for RandomForestClassifier
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0,2,num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()
    
    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    
    kf = KFold(n_splits=10, random_state=None, shuffle=True) ### ++
    
    #for train_k, test_k in KFold(len(train_kobe), n_folds=10, shuffle=True): ### --
    for train_k, test_k in kf.split(train_kobe):                              ### ++ 
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        #rfc_score += rfc.score(train.iloc[test_k], train_y.iloc[test_k])/10
        pred = rfc.predict(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    
    t2 = time.time()
    print('Done processing depth {0} ({1:.3f}sec)'.format(m, t2-t1))
print(best_m, min_score)

Output: (screenshot omitted)
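
As an alternative to the two nested loops (a sketch, not part of the course), sklearn's GridSearchCV searches both hyperparameters jointly and, with scoring='neg_log_loss', computes the log loss from predicted probabilities rather than from hard 0/1 predictions:

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': np.logspace(0, 2, num=3).astype(int),  # same candidates as range_n
    'max_depth':    np.logspace(0, 2, num=3).astype(int),  # same candidates as range_m
}
grid = GridSearchCV(RandomForestClassifier(), param_grid,
                    scoring='neg_log_loss', cv=10)
grid.fit(train_kobe, train_label)
print(grid.best_params_, -grid.best_score_)
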

(18) Plot the cross-validation scores for n_estimators and max_depth

plt.figure(figsize=(10,5))
plt.subplot(121)
plt.plot(range_n, scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')

plt.subplot(122)
plt.plot(range_m, scores_m)
plt.ylabel('score')
plt.xlabel('max depth')

Output: (screenshot omitted)

(19) Train the final model with the best hyperparameters

model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train_kobe, train_label)
# 474241623

Output: (screenshot omitted)
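
To actually use the fitted model on the 5000 unlabeled shots (a sketch, not shown in the course), predict_proba gives the probability that each shot was made. Note that shot_id was dropped in step (14), so it would have to be re-read from data.csv to build a Kaggle submission file:

pred_proba = model.predict_proba(test_kobe)[:, 1]  # probability of class 1 (shot made)
print(pred_proba[:10])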
