记录一下对kobe职业生涯数据的数据预处理过程,对kobe表示沉痛哀悼,永远的神…
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
# import data
# Load Kobe Bryant's shot-log dataset and take a first look at it.
data_path = "data.csv"
raw = pd.read_csv(data_path)
print(raw.shape)  # (rows, columns) of the full dataset
raw.head()        # preview the first few records
数据集里面有缺失值,这里用的方法是直接将此行删除
# The dataset contains ~5000 rows whose shot_made_flag is NaN (the
# competition's test portion). Keep only rows where the outcome is known.
kobe = raw[raw['shot_made_flag'].notnull()]
# Plot the two shot-location encodings side by side: court coordinates
# (loc_x/loc_y) on the left, geographic coordinates (lon/lat) on the right.
alpha = 0.03  # points overlap heavily, so draw them nearly transparent
plt.figure(figsize=(10, 10))

# Left panel: court coordinates.
plt.subplot(121)
plt.scatter(kobe.loc_x, kobe.loc_y, color='R', alpha=alpha)
plt.title('loc_x and loc_y')

# Right panel: longitude / latitude.
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, color='B', alpha=alpha)
plt.title('lat and lon')
x、y直角坐标转化为极坐标
# Convert the Cartesian shot coordinates (loc_x, loc_y) to polar form.
#   dist  : Euclidean distance from the basket (the origin)
#   angle : arctan(loc_y / loc_x); shots on the y-axis (loc_x == 0) get
#           pi/2, matching the original convention regardless of loc_y sign
raw['dist'] = np.sqrt(raw['loc_x'] ** 2 + raw['loc_y'] ** 2)

loc_x_zero = raw['loc_x'] == 0
# Use .loc assignment rather than chained indexing
# (raw['angle'][mask] = ...): chained indexing raises
# SettingWithCopyWarning and silently stops updating the frame under
# pandas copy-on-write (default in pandas 3.0).
raw['angle'] = np.pi / 2  # default already covers the loc_x == 0 rows
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)

# Merge the two clock columns into a single feature: seconds remaining.
raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']
# Show the distinct labels of each categorical shot-description column,
# then the per-category frequencies for shot_type.
for column in ('action_type', 'combined_shot_type', 'shot_type'):
    print(kobe[column].unique())
print(kobe['shot_type'].value_counts())
['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot'
'Running Hook Shot' 'Alley Oop Dunk Shot' 'Dunk Shot'
'Alley Oop Layup shot' 'Running Dunk Shot' 'Driving Finger Roll Shot'
'Running Layup Shot' 'Finger Roll Shot' 'Fadeaway Jump Shot'
'Follow Up Dunk Shot' 'Hook Shot' 'Turnaround Hook Shot' 'Jump Hook Shot'
'Running Finger Roll Shot' 'Jump Bank Shot' 'Turnaround Finger Roll Shot'
'Hook Bank Shot' 'Driving Hook Shot' 'Running Tip Shot'
'Running Reverse Layup Shot' 'Driving Finger Roll Layup Shot'
'Fadeaway Bank shot' 'Pullup Jump shot' 'Finger Roll Layup Shot'
'Turnaround Fadeaway shot' 'Driving Reverse Layup Shot'
'Driving Slam Dunk Shot' 'Step Back Jump shot' 'Turnaround Bank shot'
'Reverse Slam Dunk Shot' 'Floating Jump shot' 'Putback Slam Dunk Shot'
'Running Bank shot' 'Driving Bank shot' 'Driving Jump shot'
'Putback Layup Shot' 'Putback Dunk Shot' 'Running Finger Roll Layup Shot'
'Pullup Bank shot' 'Running Slam Dunk Shot' 'Cutting Layup Shot'
'Driving Floating Jump Shot' 'Running Pull-Up Jump Shot' 'Tip Layup Shot'
'Driving Floating Bank Jump Shot']
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
['2PT Field Goal' '3PT Field Goal']
2PT Field Goal 20285
3PT Field Goal 5412
Name: shot_type, dtype: int64
很明显数据集里面像时间2000-01这种数据python不能直接用,需要先将它转化成整形的数据
# 'season' is a string like "2000-01"; keep the part after the dash and
# cast it to an integer so it can be used as a numeric/categorical feature.
raw['season'] = raw['season'].str.split('-').str[1].astype(int)

# Check how the engineered 'dist' relates to the raw shot_distance column.
plt.figure(figsize=(5, 5))
plt.scatter(raw.dist, raw.shot_distance, color='blue')
plt.title('dist and shot_distance')
发现是呈强正相关的两个指标,只需选取其中一个分析即可
# Group shots by court zone; the number of groups should match the
# number of distinct labels shown by value_counts (six zones).
gs = kobe.groupby('shot_zone_area')
print(kobe['shot_zone_area'].value_counts())
print(len(gs))
Center( C) 11289
Right Side Center(RC) 3981
Right Side( R) 3859
Left Side Center(LC) 3364
Left Side(L) 3132
Back Court(BC) 72
Name: shot_zone_area, dtype: int64
6
一些列对分析没有用,要删掉
# Remove columns that carry no predictive signal: identifiers, constants,
# and fields made redundant by the engineered features (lon/lat duplicate
# loc_x/loc_y; shot_distance is strongly correlated with 'dist').
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area',
         'shot_zone_range', 'shot_zone_basic', 'matchup', 'lon', 'lat',
         'seconds_remaining', 'minutes_remaining', 'shot_distance',
         'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
# df.drop(label, 1) used the positional `axis` argument that pandas 2.0
# removed; dropping everything in one keyword call is future-proof and
# avoids rebuilding the DataFrame once per column.
raw = raw.drop(columns=drops)

print(raw['combined_shot_type'].value_counts())
# Models cannot consume raw string categories, so one-hot encode them
# with pd.get_dummies; prefix=var keeps the generated columns readable
# as <column>_<category>.
# Preview: first two encoded rows of combined_shot_type.
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')[0:2]

categorical_vars = ['action_type', 'combined_shot_type', 'shot_type',
                    'opponent', 'period', 'season']
for var in categorical_vars:
    # pd.concat([...], 1) and df.drop(var, 1) relied on the positional
    # `axis` argument removed in pandas 2.0 — pass keywords explicitly.
    # Append the dummy columns, then drop the original string column.
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(columns=var)
数据预处理完成