import numpy as np
import pandas as pd
# 展示
import matplotlib.pyplot as plt
import seaborn as sns
from pdpbox import pdp
# Sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from IPython.display import display
from sklearn import metrics
1、数据介绍
- DBNOs - Number of enemy players knocked.
- assists - Number of enemy players this player damaged that were killed by teammates.
- boosts - Number of boost items used.
- damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.
- headshotKills - Number of enemy players killed with headshots.
- heals - Number of healing items used.
- Id - Player’s Id
- killPlace - Ranking in match of number of enemy players killed.
- killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
- killStreaks - Max number of enemy players killed in a short amount of time.
- kills - Number of enemy players killed.
- longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
- matchDuration - Duration of match in seconds.
- matchId - ID to identify match. There are no matches that are in both the training and testing set.
- matchType - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.
- rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
- revives - Number of times this player revived teammates.
- rideDistance - Total distance traveled in vehicles measured in meters.
- roadKills - Number of kills while in a vehicle.
- swimDistance - Total distance traveled by swimming measured in meters.
- teamKills - Number of times this player killed a teammate.
- vehicleDestroys - Number of vehicles destroyed.
- walkDistance - Total distance traveled on foot measured in meters.
- weaponsAcquired - Number of weapons picked up.
- winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
- groupId - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
- numGroups - Number of groups we have data for in the match.
- maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
- winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.
# Read the training and test tables from the working directory.
train, test = (pd.read_csv(path) for path in ('train_V2.csv', 'test_V2.csv'))
# Preview the first rows of the training table.
train.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 0 | 0.0000 | 0 | 0.00 | 0 | 0 | 244.80 | 1 | 1466 | 0.4444 |
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0 | 0.0045 | 0 | 11.04 | 0 | 0 | 1434.00 | 5 | 0 | 0.6400 |
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0 | 0.0000 | 0 | 0.00 | 0 | 0 | 161.80 | 2 | 0 | 0.7755 |
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0 | 0.0000 | 0 | 0.00 | 0 | 0 | 202.70 | 3 | 0 | 0.1667 |
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0 | 0.0000 | 0 | 0.00 | 0 | 0 | 49.75 | 2 | 0 | 0.1875 |
5 rows × 29 columns
2、简单清洗数据
2.1 清洗空数据
# Rows whose target value (winPlacePerc) is missing.
train.loc[train['winPlacePerc'].isna()]
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2744604 | f70c74418bb064 | 12dfbede33f92b | 224a123c53e008 | 0 | 0 | 0.0 | 0 | 0 | 0 | 1 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | NaN |
1 rows × 29 columns
# Drop every row whose target is missing. Selecting by missing value instead
# of a hard-coded row label keeps this step valid for other data snapshots
# and makes it safe to run more than once.
train.dropna(subset=['winPlacePerc'], inplace=True)
# Re-check: this selection should now be empty.
train[train['winPlacePerc'].isnull()]
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc |
---|
0 rows × 29 columns
2.2 去除每组人数不同对战绩的影响
# Number of players in each match: count the rows that share a matchId.
train['playersJoined'] = train.groupby('matchId')['matchId'].transform('count')
plt.figure(figsize=(15,10))
# sns.countplot draws a count histogram for a categorical variable: it counts
# how often each category occurs and shows the counts as bars. The usual call
# form is sns.countplot(x='variable', data=data), where x names the
# categorical variable and data is the DataFrame (or similar) holding it.
# The Series is passed explicitly as x= so it is never mistaken for `data`.
sns.countplot(x=train[train['playersJoined']>=75]['playersJoined'])
plt.show()
train.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | playersJoined | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 244.80 | 1 | 1466 | 0.4444 | 96 |
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0.0045 | 0 | 11.04 | 0 | 0 | 1434.00 | 5 | 0 | 0.6400 | 91 |
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 161.80 | 2 | 0 | 0.7755 | 98 |
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 202.70 | 3 | 0 | 0.1667 | 91 |
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 49.75 | 2 | 0 | 0.1875 | 97 |
5 rows × 30 columns
# Scaling factor shared by all normalised columns; it equals 1 for a match
# with 100 players and grows as fewer players joined.
norm_factor = (100 - train['playersJoined']) / 100 + 1
train['killsNorm'] = train['kills'] * norm_factor
train['damageDealtNorm'] = train['damageDealt'] * norm_factor
train['matchDurationNorm'] = train['matchDuration'] * norm_factor
# Show raw and normalised values side by side for the first 11 rows.
to_show = [
    'Id',
    'kills', 'killsNorm',
    'damageDealt', 'damageDealtNorm',
    'matchDuration', 'matchDurationNorm',
]
train[to_show].head(11)
Id | kills | killsNorm | damageDealt | damageDealtNorm | matchDuration | matchDurationNorm | |
---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 0 | 0.00 | 0.000 | 0.00000 | 1306 | 1358.24 |
1 | eef90569b9d03c | 0 | 0.00 | 91.470 | 99.70230 | 1777 | 1936.93 |
2 | 1eaf90ac73de72 | 0 | 0.00 | 68.000 | 69.36000 | 1318 | 1344.36 |
3 | 4616d365dd2853 | 0 | 0.00 | 32.900 | 35.86100 | 1436 | 1565.24 |
4 | 315c96c26c9aac | 1 | 1.03 | 100.000 | 103.00000 | 1424 | 1466.72 |
5 | ff79c12f326506 | 1 | 1.05 | 100.000 | 105.00000 | 1395 | 1464.75 |
6 | 95959be0e21ca3 | 0 | 0.00 | 0.000 | 0.00000 | 1316 | 1355.48 |
7 | 311b84c6ff4390 | 0 | 0.00 | 8.538 | 8.87952 | 1967 | 2045.68 |
8 | 1a68204ccf9891 | 0 | 0.00 | 51.600 | 53.14800 | 1375 | 1416.25 |
9 | e5bb5a43587253 | 0 | 0.00 | 37.270 | 38.38810 | 1930 | 1987.90 |
10 | 2b574d43972813 | 0 | 0.00 | 28.380 | 28.66380 | 1811 | 1829.11 |
3、对于开挂的,应当给他们剔除掉
3.1 第一种外挂:没有步数便击杀人头的
# Total distance covered by vehicle, on foot and by swimming.
train['totalDistance'] = train['rideDistance'].add(train['walkDistance']).add(train['swimDistance'])
# Flag players who scored at least one kill without travelling any distance.
train['killWithoutMoving'] = train['kills'].gt(0) & train['totalDistance'].eq(0)
# (train[train['killWithoutMoving']].shape shows how many rows are flagged.)
# Remove the flagged rows from the training data.
train.drop(index=train.index[train['killWithoutMoving'] == True], inplace=True)
3.2 第二种外挂:坐在车上便能击杀人头的
# Size of the subset with more than 10 road kills.
train.loc[train['roadKills'].gt(10)].shape
(4, 35)
# Remove the rows with more than 10 road kills.
train.drop(index=train.index[train['roadKills'] > 10], inplace=True)
3.3 第三种外挂:击杀人头数高的离谱的
# Distribution of kill counts across all remaining rows.
plt.figure(figsize=(15, 8))
sns.countplot(x='kills', data=train).set_title('Kills')
plt.show()
# Size of the subset with more than 30 kills, followed by its removal.
train.loc[train['kills'].gt(30)].shape
train.drop(index=train.index[train['kills'] > 30], inplace=True)
3.4 第四种外挂:爆头击杀率很高的且击杀人头数多的
# headshotKills - number of enemy players killed with headshots.
# kills         - number of enemy players killed.
#
# Share of each player's kills that were headshots. Rows with zero kills are
# masked out of the denominator, so they become NaN (filled with 0 below)
# and can never produce an infinite rate.
train['headshot_rate'] = train['headshotKills'] / train['kills'].where(train['kills'] > 0)
train['headshot_rate'] = train['headshot_rate'].fillna(0)
plt.figure(figsize=(15,10))
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot once the minimum seaborn version is confirmed.
sns.distplot(train['headshot_rate'],bins=10)
plt.show()
# Size of the subset whose kills were all headshots.
train[train['headshot_rate']==1].shape
(253959, 36)
# Of the all-headshot rows, how many had exactly one kill.
train.loc[train['headshot_rate'].eq(1) & train['kills'].eq(1)].shape
(218433, 36)
# Remove rows with a 100% headshot rate and at least 5 kills.
train.drop(
    index=train.index[train['headshot_rate'].eq(1) & train['kills'].ge(5)],
    inplace=True,
)
4、使用categorical变量,减少系统占用内存
# Store the two identifier columns as pandas categoricals.
train['groupId'] = train['groupId'].astype('category')
train['matchId'] = train['matchId'].astype('category')
5、查看不同模式下击杀人头数与胜率关系
# Remove the per-player Id column from the training frame in place.
del train['Id']
5.1 单排,双排,四排
# Bucket rows by the number of groups in the match:
# more than 50 -> solos, 26 to 50 -> duos, 25 or fewer -> squads.
solos = train.loc[train['numGroups'].gt(50)]
duos = train.loc[train['numGroups'].gt(25) & train['numGroups'].le(50)]
squads = train.loc[train['numGroups'].le(25)]
# Fraction of all rows that fall in the solo bucket.
solos.shape[0] / train.shape[0]
0.15947449945499398
# Fraction of all rows that fall in the duo bucket.
duos.shape[0] / train.shape[0]
0.7412968331156684
# Mean winPlacePerc per kill count, one series per match bucket.
f,ax =plt.subplots(figsize=(20,10))
sns.pointplot(x='kills',y='winPlacePerc',data=solos,color='black',alpha=0.8)
sns.pointplot(x='kills',y='winPlacePerc',data=duos,color='red',alpha=0.8)
sns.pointplot(x='kills',y='winPlacePerc',data=squads,color='blue',alpha=0.8)
# Label every series in the colour it is drawn with (solos are black, duos
# red, squads blue) so the annotation matches the lines.
plt.text(25,0.5,'Solos',color='black')
plt.text(25,0.45,'Duos',color='red')
plt.text(25,0.4,'Squads',color='blue')
plt.grid()
plt.show()
6 、热力图
k = 5
f, ax = plt.subplots(figsize=(12, 12))
# Drop the ID and match-type columns before computing correlations.
temp_train = train.drop(columns=['groupId', 'matchId', 'matchType'])
# nlargest(k, 'winPlacePerc')['winPlacePerc'] is a Series (named
# 'winPlacePerc') holding the k largest correlation coefficients with the
# target; its index gives the matching column names, e.g.
#   a    0.98
#   b    0.09
#   c    0.01
cols = (
    temp_train.corr()
    .nlargest(k, 'winPlacePerc')['winPlacePerc']
    .index
)
# Pairwise correlation matrix of the selected columns (columns are variables).
cm = np.corrcoef(temp_train[cols].values, rowvar=False)
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt='.1f',
    linewidths=0.5,
    xticklabels=cols.values,
    yticklabels=cols.values,
)
plt.show()
7、建模
# Work on a random subset of the training rows.
sample = 500000
# Sample the rows, then leave out the ID and match-type columns.
df_sample = train.sample(sample).drop(columns=['groupId', 'matchId', 'matchType'])
# Target vector and feature frame (everything except the target).
y = df_sample['winPlacePerc']
df = df_sample.drop(columns=['winPlacePerc'])
7.1 将训练集分为训练部分与测试部分
# Split features and target into training and validation parts with a fixed seed.
(X_train, X_valid,
 y_train, y_valid) = train_test_split(df, y, random_state=1)
7.2 训练测试集
def print_score(m):
    """Print the mean absolute error of model ``m`` on both data splits.

    Reads the module-level ``X_train``/``y_train`` and ``X_valid``/``y_valid``
    at call time, so the score reflects whatever those names currently hold.

    Args:
        m: Fitted estimator exposing ``predict``.
    """
    # scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the
    # printed numbers are the same; the documented order just keeps this
    # helper correct if the metric is ever swapped for an asymmetric one.
    # ``metrics`` is the module imported at the top of the file, so the
    # helper does not depend on an import that appears after its definition.
    res = ['mae train', metrics.mean_absolute_error(y_train, m.predict(X_train)),
           'mae val', metrics.mean_absolute_error(y_valid, m.predict(X_valid))]
    print(res)
from sklearn.metrics import mean_absolute_error
# Baseline forest trained on every feature column; fit() returns the estimator.
m1 = RandomForestRegressor(n_jobs=-1, n_estimators=50).fit(X_train, y_train)
print_score(m1)
['mae train', 0.02192784867733333, 'mae val', 0.05833728120000001]
# Raw importance scores of the fitted forest, one value per training column.
m1.feature_importances_
array([1.39260048e-03, 5.35238158e-03, 3.30978196e-03, 2.28083086e-03,
4.01083836e-04, 2.68734056e-03, 1.81916824e-01, 2.36602622e-03,
3.11364071e-03, 2.75269723e-03, 6.13427821e-03, 9.33855372e-03,
5.53881534e-03, 1.20290463e-02, 4.37883164e-03, 8.08006977e-04,
1.91166905e-03, 4.32997862e-05, 7.26137113e-04, 2.31430733e-04,
8.58912251e-05, 6.76056200e-01, 3.81913691e-03, 2.60518373e-03,
1.84332720e-02, 8.68569187e-03, 3.87405597e-03, 1.14654252e-02,
2.75947316e-02, 0.00000000e+00, 6.67135806e-04])
def rf_feat_importance(m, df):
    """Return the feature importances of ``m`` as a sorted table.

    Args:
        m: Fitted model exposing ``feature_importances_``.
        df: DataFrame whose columns line up, in order, with the entries of
            ``m.feature_importances_``.

    Returns:
        DataFrame with columns ``cols`` (feature name) and ``imp``
        (importance), sorted by ``imp`` in descending order. The index keeps
        each feature's original column position.
    """
    importance = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importance.sort_values('imp', ascending=False)
# Rank every column of df by its importance in m1.
rf_feat_importance(m1,df)
cols | imp | |
---|---|---|
21 | walkDistance | 0.676056 |
6 | killPlace | 0.181917 |
28 | totalDistance | 0.027595 |
24 | playersJoined | 0.018433 |
13 | numGroups | 0.012029 |
27 | matchDurationNorm | 0.011465 |
11 | matchDuration | 0.009339 |
25 | killsNorm | 0.008686 |
10 | longestKill | 0.006134 |
12 | maxPlace | 0.005539 |
1 | boosts | 0.005352 |
14 | rankPoints | 0.004379 |
26 | damageDealtNorm | 0.003874 |
22 | weaponsAcquired | 0.003819 |
2 | damageDealt | 0.003310 |
8 | kills | 0.003114 |
9 | killStreaks | 0.002753 |
5 | heals | 0.002687 |
23 | winPoints | 0.002605 |
7 | killPoints | 0.002366 |
3 | DBNOs | 0.002281 |
16 | rideDistance | 0.001912 |
0 | assists | 0.001393 |
15 | revives | 0.000808 |
18 | swimDistance | 0.000726 |
30 | headshot_rate | 0.000667 |
4 | headshotKills | 0.000401 |
19 | teamKills | 0.000231 |
20 | vehicleDestroys | 0.000086 |
17 | roadKills | 0.000043 |
29 | killWithoutMoving | 0.000000 |
# Importance table for the baseline model, most important feature first.
fi = rf_feat_importance(m1, df)
# Horizontal bar chart of the ten highest-ranked features.
fi.head(10).plot(x='cols', y='imp', kind='barh', figsize=(14, 6))
plt.show()
# Keep only the features whose importance exceeds 0.01.
to_keep = fi.loc[fi['imp'] > 0.01, 'cols']
to_keep
21 walkDistance
6 killPlace
28 totalDistance
24 playersJoined
13 numGroups
27 matchDurationNorm
Name: cols, dtype: object
7.3 用取出的重要指标计算训练集误差
# Narrow both splits to the selected features. This rebinds X_train and
# X_valid, which print_score reads at call time.
X_train = X_train[to_keep]
X_valid = X_valid[to_keep]
# Same forest settings as m1, trained on the reduced feature set.
m2 = RandomForestRegressor(n_jobs=-1, n_estimators=50).fit(X_train, y_train)
print_score(m2)
['mae train', 0.02192277282133332, 'mae val', 0.05834604831999999]
8、预测测试集
# Work on a deep copy so the loaded test table itself is left unchanged.
temp_test = test.copy(deep=True)
temp_test.head(5)
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | rankPoints | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9329eb41e215eb | 676b23c24e70d6 | 45b576ab7daa7f | 0 | 0 | 51.46 | 0 | 0 | 0 | 73 | ... | 1500 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 588.0 | 1 | 0 |
1 | 639bd0dcd7bda8 | 430933124148dd | 42a9a0b906c928 | 0 | 4 | 179.10 | 0 | 0 | 2 | 11 | ... | 1503 | 2 | 4669.0 | 0 | 0.0 | 0 | 0 | 2017.0 | 6 | 0 |
2 | 63d5c8ef8dfe91 | 0b45f5db20ba99 | 87e7e4477a048e | 1 | 0 | 23.40 | 0 | 0 | 4 | 49 | ... | 1565 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 787.8 | 4 | 0 |
3 | cf5b81422591d1 | b7497dbdc77f4a | 1b9a94f1af67f1 | 0 | 0 | 65.52 | 0 | 0 | 0 | 54 | ... | 1465 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 1812.0 | 3 | 0 |
4 | ee6a295187ba21 | 6604ce20a1d230 | 40754a93016066 | 0 | 4 | 330.20 | 1 | 2 | 1 | 7 | ... | 1480 | 1 | 0.0 | 0 | 0.0 | 0 | 0 | 2963.0 | 4 | 0 |
5 rows × 28 columns
temp_test只有三列是自带的,其他列均需要处理下
21 walkDistance 25
6 killPlace 9
28 totalDistance
24 playersJoined
13 numGroups 17
27 matchDurationNorm
# Rebuild, on the test table, the engineered columns the model was trained on.
# Total distance covered by vehicle, on foot and by swimming.
temp_test['totalDistance'] = (
    temp_test['rideDistance']
    .add(temp_test['walkDistance'])
    .add(temp_test['swimDistance'])
)
# Number of rows sharing each matchId.
temp_test['playersJoined'] = temp_test.groupby('matchId')['matchId'].transform('count')
# Match duration scaled by the same player-count factor used for training.
temp_test['matchDurationNorm'] = temp_test['matchDuration'].mul(
    (100 - temp_test['playersJoined']) / 100 + 1
)
21 walkDistance 25
6 killPlace 9
28 totalDistance 28
24 playersJoined 29
13 numGroups 17
27 matchDurationNorm 30
# Train a model on just the selected features and the target, then use it to
# fill in winPlacePerc for the test table.
feature_cols = [
    'walkDistance',
    'killPlace',
    'totalDistance',
    'playersJoined',
    'numGroups',
    'matchDurationNorm',
]
# NOTE(review): this list is hard-coded; it presumably mirrors to_keep from
# the run shown above - confirm it still matches after re-training.
X = X_train[feature_cols]
fratures_model = RandomForestRegressor(n_jobs=-1, n_estimators=50).fit(X, y_train)
temp_test['winPlacePerc'] = fratures_model.predict(temp_test[feature_cols])
temp_test.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | totalDistance | playersJoined | matchDurationNorm | winPlacePerc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9329eb41e215eb | 676b23c24e70d6 | 45b576ab7daa7f | 0 | 0 | 51.46 | 0 | 0 | 0 | 73 | ... | 0.0 | 0 | 0 | 588.0 | 1 | 0 | 588.0 | 92 | 2034.72 | 0.183950 |
1 | 639bd0dcd7bda8 | 430933124148dd | 42a9a0b906c928 | 0 | 4 | 179.10 | 0 | 0 | 2 | 11 | ... | 0.0 | 0 | 0 | 2017.0 | 6 | 0 | 6686.0 | 96 | 1883.44 | 0.858322 |
2 | 63d5c8ef8dfe91 | 0b45f5db20ba99 | 87e7e4477a048e | 1 | 0 | 23.40 | 0 | 0 | 4 | 49 | ... | 0.0 | 0 | 0 | 787.8 | 4 | 0 | 787.8 | 94 | 1900.58 | 0.718864 |
3 | cf5b81422591d1 | b7497dbdc77f4a | 1b9a94f1af67f1 | 0 | 0 | 65.52 | 0 | 0 | 0 | 54 | ... | 0.0 | 0 | 0 | 1812.0 | 3 | 0 | 1812.0 | 89 | 2035.74 | 0.520244 |
4 | ee6a295187ba21 | 6604ce20a1d230 | 40754a93016066 | 0 | 4 | 330.20 | 1 | 2 | 1 | 7 | ... | 0.0 | 0 | 0 | 2963.0 | 4 | 0 | 2963.0 | 95 | 1392.30 | 0.881686 |
5 rows × 32 columns