项目实现
- 你必须创建一个模型,根据他们的最终统计数据预测玩家的排名,从1(第一名)到0(最后一名)。
- 最后结果通过平均绝对误差(MAE)进行评估,即预测的winPlacePerc和真实的winPlacePerc之间的平均绝对误差。
- 关于MAE: sklearn.metrics.mean_absolute_error
获取数据、基本数据信息查看
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
train = pd.read_csv("./data/train_V2.csv")
train.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 0 | 0.0000 | 0 | 0.00 | 0 | 0 | 244.80 | 1 | 1466 | 0.4444 |
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0 | 0.0045 | 0 | 11.04 | 0 | 0 | 1434.00 | 5 | 0 | 0.6400 |
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0 | 0.0000 | 0 | 0.00 | 0 | 0 | 161.80 | 2 | 0 | 0.7755 |
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0 | 0.0000 | 0 | 0.00 | 0 | 0 | 202.70 | 3 | 0 | 0.1667 |
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0 | 0.0000 | 0 | 0.00 | 0 | 0 | 49.75 | 2 | 0 | 0.1875 |
5 rows × 29 columns
train.tail()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4446961 | afff7f652dbc10 | d238e426f50de7 | 18492834ce5635 | 0 | 0 | 0.00 | 0 | 0 | 0 | 74 | ... | 0 | 1292.0 | 0 | 0.000 | 0 | 0 | 1019.0 | 3 | 1507 | 0.1786 |
4446962 | f4197cf374e6c0 | 408cdb5c46b2ac | ee854b837376d9 | 0 | 1 | 44.15 | 0 | 0 | 0 | 69 | ... | 0 | 0.0 | 0 | 0.000 | 0 | 0 | 81.7 | 6 | 0 | 0.2935 |
4446963 | e1948b1295c88a | e26ac84bdf7cef | 6d0cd12784f1ab | 0 | 0 | 59.06 | 0 | 0 | 0 | 66 | ... | 0 | 0.0 | 0 | 2.184 | 0 | 0 | 788.7 | 4 | 0 | 0.4815 |
4446964 | cc032cdd73b7ac | c2223f35411394 | c9c701d0ad758a | 0 | 4 | 180.40 | 1 | 1 | 2 | 11 | ... | 2 | 0.0 | 0 | 0.000 | 0 | 0 | 2748.0 | 8 | 0 | 0.8000 |
4446965 | 0d8e7ed728b6fd | 8c74f72fedf5ff | 62a16aabcc095c | 0 | 2 | 268.00 | 0 | 0 | 1 | 18 | ... | 0 | 1369.0 | 0 | 0.000 | 0 | 0 | 1244.0 | 5 | 0 | 0.5464 |
5 rows × 29 columns
train.describe()
assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | killPoints | kills | killStreaks | ... | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | ... | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446966e+06 | 4.446965e+06 |
mean | 2.338149e-01 | 1.106908e+00 | 1.307171e+02 | 6.578755e-01 | 2.268196e-01 | 1.370147e+00 | 4.759935e+01 | 5.050060e+02 | 9.247833e-01 | 5.439551e-01 | ... | 1.646590e-01 | 6.061157e+02 | 3.496091e-03 | 4.509322e+00 | 2.386841e-02 | 7.918208e-03 | 1.154218e+03 | 3.660488e+00 | 6.064601e+02 | 4.728216e-01 |
std | 5.885731e-01 | 1.715794e+00 | 1.707806e+02 | 1.145743e+00 | 6.021553e-01 | 2.679982e+00 | 2.746294e+01 | 6.275049e+02 | 1.558445e+00 | 7.109721e-01 | ... | 4.721671e-01 | 1.498344e+03 | 7.337297e-02 | 3.050220e+01 | 1.673935e-01 | 9.261157e-02 | 1.183497e+03 | 2.456544e+00 | 7.397004e+02 | 3.074050e-01 |
min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | ... | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
25% | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.400000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | ... | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.551000e+02 | 2.000000e+00 | 0.000000e+00 | 2.000000e-01 |
50% | 0.000000e+00 | 0.000000e+00 | 8.424000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 4.700000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | ... | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 6.856000e+02 | 3.000000e+00 | 0.000000e+00 | 4.583000e-01 |
75% | 0.000000e+00 | 2.000000e+00 | 1.860000e+02 | 1.000000e+00 | 0.000000e+00 | 2.000000e+00 | 7.100000e+01 | 1.172000e+03 | 1.000000e+00 | 1.000000e+00 | ... | 0.000000e+00 | 1.909750e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.976000e+03 | 5.000000e+00 | 1.495000e+03 | 7.407000e-01 |
max | 2.200000e+01 | 3.300000e+01 | 6.616000e+03 | 5.300000e+01 | 6.400000e+01 | 8.000000e+01 | 1.010000e+02 | 2.170000e+03 | 7.200000e+01 | 2.000000e+01 | ... | 3.900000e+01 | 4.071000e+04 | 1.800000e+01 | 3.823000e+03 | 1.200000e+01 | 5.000000e+00 | 2.578000e+04 | 2.360000e+02 | 2.013000e+03 | 1.000000e+00 |
8 rows × 25 columns
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4446966 entries, 0 to 4446965 Data columns (total 29 columns): Id object groupId object matchId object assists int64 boosts int64 damageDealt float64 DBNOs int64 headshotKills int64 heals int64 killPlace int64 killPoints int64 kills int64 killStreaks int64 longestKill float64 matchDuration int64 matchType object maxPlace int64 numGroups int64 rankPoints int64 revives int64 rideDistance float64 roadKills int64 swimDistance float64 teamKills int64 vehicleDestroys int64 walkDistance float64 weaponsAcquired int64 winPoints int64 winPlacePerc float64 dtypes: float64(6), int64(19), object(4) memory usage: 983.9+ MB
# 查看一共有多少条数据
train.shape
(4446966, 29)
# 有多少场比赛
np.unique(train["matchId"]).shape
(47965,)
# 有多少支队伍
np.unique(train["groupId"]).shape
(2026745,)
数据基本处理
数据缺失值处理
# Report, for every column, whether it contains at least one missing value
# (only winPlacePerc does).
# DataFrame.any() is called directly instead of np.any(): NumPy forwards
# axis=None to the DataFrame, which some pandas versions interpret as
# "reduce over all axes" and collapse to a single scalar instead of the
# per-column Series wanted here.
train.isnull().any()
Id False groupId False matchId False assists False boosts False damageDealt False DBNOs False headshotKills False heals False killPlace False killPoints False kills False killStreaks False longestKill False matchDuration False matchType False maxPlace False numGroups False rankPoints False revives False rideDistance False roadKills False swimDistance False teamKills False vehicleDestroys False walkDistance False weaponsAcquired False winPoints False winPlacePerc True dtype: bool
# 查找缺失值
train[train["winPlacePerc"].isnull()]
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | revives | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2744604 | f70c74418bb064 | 12dfbede33f92b | 224a123c53e008 | 0 | 0 | 0.0 | 0 | 0 | 0 | 1 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0 | NaN |
1 rows × 29 columns
# Drop every row whose target (winPlacePerc) is missing.
# Filtering on the column rather than on the hard-coded row label 2744604
# removes the same single row in this dataset, and keeps working if the CSV
# is re-exported, re-ordered or re-indexed.
train = train.dropna(subset=["winPlacePerc"])
train.shape
(4446965, 29)
特征数据规范化处理
查看每场比赛参加的人数
# Number of rows (players) sharing each row's matchId, broadcast back onto
# the original rows so it can be stored as a per-player feature.
rows_by_match = train.groupby("matchId")["matchId"]
count = rows_by_match.transform("count")
train["playersJoined"] = count
train.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | rideDistance | roadKills | swimDistance | teamKills | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | playersJoined | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 244.80 | 1 | 1466 | 0.4444 | 96 |
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0.0045 | 0 | 11.04 | 0 | 0 | 1434.00 | 5 | 0 | 0.6400 | 91 |
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 161.80 | 2 | 0 | 0.7755 | 98 |
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 202.70 | 3 | 0 | 0.1667 | 91 |
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0.0000 | 0 | 0.00 | 0 | 0 | 49.75 | 2 | 0 | 0.1875 | 97 |
5 rows × 30 columns
train["playersJoined"].sort_values().head()
1206365 2 2109739 2 3956552 5 3620228 5 696000 5 Name: playersJoined, dtype: int64
# Distribution of match sizes across all rows.
# The Series is passed as x= explicitly: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 on the only positional argument is `data`,
# so a positional Series would be misinterpreted.
plt.figure(figsize=(20, 8))
sns.countplot(x=train["playersJoined"])
plt.grid()
plt.show()

# Same plot restricted to matches with at least 75 players, where the bulk
# of the data lies, so the individual bars are readable.
plt.figure(figsize=(20, 8))
sns.countplot(x=train.loc[train["playersJoined"] >= 75, "playersJoined"])
plt.grid()
plt.show()
规范化输出部分数据
# Scale selected statistics by (100 - playersJoined) / 100 + 1, i.e. a
# factor of 1.0 for a 100-player match that grows as fewer players joined.
# Each scaled column is stored next to the original as "<name>Norm".
players_factor = (100 - train["playersJoined"]) / 100 + 1
for stat in ("kills", "damageDealt", "maxPlace", "matchDuration"):
    train[stat + "Norm"] = train[stat] * players_factor
train.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | vehicleDestroys | walkDistance | weaponsAcquired | winPoints | winPlacePerc | playersJoined | killsNorm | damageDealtNorm | maxPlaceNorm | matchDurationNorm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 0 | 244.80 | 1 | 1466 | 0.4444 | 96 | 0.00 | 0.0000 | 29.12 | 1358.24 |
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0 | 1434.00 | 5 | 0 | 0.6400 | 91 | 0.00 | 99.7023 | 28.34 | 1936.93 |
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0 | 161.80 | 2 | 0 | 0.7755 | 98 | 0.00 | 69.3600 | 51.00 | 1344.36 |
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0 | 202.70 | 3 | 0 | 0.1667 | 91 | 0.00 | 35.8610 | 33.79 | 1565.24 |
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0 | 49.75 | 2 | 0 | 0.1875 | 97 | 1.03 | 103.0000 | 99.91 | 1466.72 |
5 rows × 34 columns
# 比较经过规范化的特征值和原始特征值的值
to_show = ['Id', 'kills','killsNorm','damageDealt', 'damageDealtNorm', 'maxPlace', 'maxPlaceNorm', 'matchDuration', 'matchDurationNorm']
train[to_show][0:11]
Id | kills | killsNorm | damageDealt | damageDealtNorm | maxPlace | maxPlaceNorm | matchDuration | matchDurationNorm | |
---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 0 | 0.00 | 0.000 | 0.00000 | 28 | 29.12 | 1306 | 1358.24 |
1 | eef90569b9d03c | 0 | 0.00 | 91.470 | 99.70230 | 26 | 28.34 | 1777 | 1936.93 |
2 | 1eaf90ac73de72 | 0 | 0.00 | 68.000 | 69.36000 | 50 | 51.00 | 1318 | 1344.36 |
3 | 4616d365dd2853 | 0 | 0.00 | 32.900 | 35.86100 | 31 | 33.79 | 1436 | 1565.24 |
4 | 315c96c26c9aac | 1 | 1.03 | 100.000 | 103.00000 | 97 | 99.91 | 1424 | 1466.72 |
5 | ff79c12f326506 | 1 | 1.05 | 100.000 | 105.00000 | 28 | 29.40 | 1395 | 1464.75 |
6 | 95959be0e21ca3 | 0 | 0.00 | 0.000 | 0.00000 | 28 | 28.84 | 1316 | 1355.48 |
7 | 311b84c6ff4390 | 0 | 0.00 | 8.538 | 8.87952 | 96 | 99.84 | 1967 | 2045.68 |
8 | 1a68204ccf9891 | 0 | 0.00 | 51.600 | 53.14800 | 28 | 28.84 | 1375 | 1416.25 |
9 | e5bb5a43587253 | 0 | 0.00 | 37.270 | 38.38810 | 29 | 29.87 | 1930 | 1987.90 |
10 | 2b574d43972813 | 0 | 0.00 | 28.380 | 28.66380 | 29 | 29.29 | 1811 | 1829.11 |
部分变量合成
train["healsandboosts"] = train["heals"] + train["boosts"]
train[["heals", "boosts", "healsandboosts"]].tail(10)
heals | boosts | healsandboosts | |
---|---|---|---|
4446956 | 1 | 0 | 1 |
4446957 | 0 | 1 | 1 |
4446958 | 0 | 0 | 0 |
4446959 | 0 | 0 | 0 |
4446960 | 0 | 0 | 0 |
4446961 | 0 | 0 | 0 |
4446962 | 0 | 1 | 1 |
4446963 | 0 | 0 | 0 |
4446964 | 2 | 4 | 6 |
4446965 | 1 | 2 | 3 |
异常值处理
异常值处理:删除有击杀,但是完全没有移动的玩家
train["totalDistance"] = train["rideDistance"] + train["walkDistance"] + train["swimDistance"]
train.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | winPoints | winPlacePerc | playersJoined | killsNorm | damageDealtNorm | maxPlaceNorm | matchDurationNorm | healsandboosts | totalDistance | killwithoutMoving | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 1466 | 0.4444 | 96 | 0.00 | 0.0000 | 29.12 | 1358.24 | 0 | 244.8000 | False |
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0 | 0.6400 | 91 | 0.00 | 99.7023 | 28.34 | 1936.93 | 0 | 1445.0445 | False |
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0 | 0.7755 | 98 | 0.00 | 69.3600 | 51.00 | 1344.36 | 0 | 161.8000 | False |
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0 | 0.1667 | 91 | 0.00 | 35.8610 | 33.79 | 1565.24 | 0 | 202.7000 | False |
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0 | 0.1875 | 97 | 1.03 | 103.0000 | 99.91 | 1466.72 | 0 | 49.7500 | False |
5 rows × 37 columns
# (train["kills"] > 0) & (train["totalDistance"] == 0)
train["killwithoutMoving"] = (train["kills"] > 0) & (train["totalDistance"] == 0)
train[train["killwithoutMoving"] == True].head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | winPoints | winPlacePerc | playersJoined | killsNorm | damageDealtNorm | maxPlaceNorm | matchDurationNorm | healsandboosts | totalDistance | killwithoutMoving | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1824 | b538d514ef2476 | 0eb2ce2f43f9d6 | 35e7d750e442e2 | 0 | 0 | 593.0 | 0 | 0 | 3 | 18 | ... | 0 | 0.8571 | 58 | 8.52 | 842.060 | 21.30 | 842.06 | 3 | 0.0 | True |
6673 | 6d3a61da07b7cb | 2d8119b1544f87 | 904cecf36217df | 2 | 0 | 346.6 | 0 | 0 | 6 | 33 | ... | 0 | 0.6000 | 42 | 4.74 | 547.628 | 17.38 | 2834.52 | 6 | 0.0 | True |
11892 | 550398a8f33db7 | c3fd0e2abab0af | db6f6d1f0d4904 | 2 | 0 | 1750.0 | 0 | 4 | 5 | 3 | ... | 0 | 0.8947 | 21 | 35.80 | 3132.500 | 35.80 | 1607.42 | 5 | 0.0 | True |
14631 | 58d690ee461e9d | ea5b6630b33d67 | dbf34301df5e53 | 0 | 0 | 157.8 | 0 | 0 | 0 | 69 | ... | 1500 | 0.0000 | 73 | 1.27 | 200.406 | 24.13 | 1014.73 | 0 | 0.0 | True |
15591 | 49b61fc963d632 | 0f5c5f19d9cc21 | 904cecf36217df | 0 | 0 | 100.0 | 0 | 1 | 0 | 37 | ... | 0 | 0.3000 | 42 | 1.58 | 158.000 | 17.38 | 2834.52 | 0 | 0.0 | True |
5 rows × 37 columns
train[train["killwithoutMoving"] == True].shape
(1535, 37)
train[train["killwithoutMoving"] == True].index
Int64Index([ 1824, 6673, 11892, 14631, 15591, 20881, 23298, 24640, 25659, 30079, ... 4426500, 4429697, 4432954, 4436511, 4437516, 4440232, 4440898, 4440927, 4441511, 4446682], dtype='int64', length=1535)
# Remove, in place, every row flagged as "has kills but never moved".
# killwithoutMoving is already boolean, so it is used directly as the mask.
train.drop(train.index[train["killwithoutMoving"]], inplace=True)
train.shape
(4445430, 37)
异常值处理:删除驾车杀敌数异常的数据
# train["roadKills"] > 10
train.drop(train[train["roadKills"] > 10].index, inplace=True)
train.shape
(4445426, 37)
异常值处理:删除玩家在一局中杀敌数超过30人的数据
train[train["kills"] > 30].head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | winPoints | winPlacePerc | playersJoined | killsNorm | damageDealtNorm | maxPlaceNorm | matchDurationNorm | healsandboosts | totalDistance | killwithoutMoving | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
57978 | 9d8253e21ccbbd | ef7135ed856cd8 | 37f05e2a01015f | 9 | 0 | 3725.0 | 0 | 7 | 0 | 2 | ... | 1500 | 0.8571 | 16 | 64.40 | 6854.00 | 14.72 | 3308.32 | 0 | 48.82 | False |
87793 | 45f76442384931 | b3627758941d34 | 37f05e2a01015f | 8 | 0 | 3087.0 | 0 | 8 | 27 | 3 | ... | 1500 | 1.0000 | 16 | 57.04 | 5680.08 | 14.72 | 3308.32 | 27 | 780.70 | False |
156599 | 746aa7eabf7c86 | 5723e7d8250da3 | f900de1ec39fa5 | 21 | 0 | 5479.0 | 0 | 12 | 7 | 4 | ... | 0 | 0.7000 | 11 | 90.72 | 10355.31 | 20.79 | 3398.22 | 7 | 23.71 | False |
160254 | 15622257cb44e2 | 1a513eeecfe724 | db413c7c48292c | 1 | 0 | 4033.0 | 0 | 40 | 0 | 1 | ... | 1500 | 1.0000 | 62 | 57.96 | 5565.54 | 11.04 | 1164.72 | 0 | 718.30 | False |
180189 | 1355613d43e2d0 | f863cd38c61dbf | 39c442628f5df5 | 5 | 0 | 3171.0 | 0 | 6 | 15 | 1 | ... | 0 | 1.0000 | 11 | 66.15 | 5993.19 | 17.01 | 3394.44 | 15 | 71.51 | False |
5 rows × 37 columns
train.drop(train[train["kills"] > 30].index, inplace=True)
train.shape
(4445331, 37)
异常值处理:删除爆头率异常数据
train["headshot_rate"] = train["headshotKills"]/train["kills"]
train["headshot_rate"].head()
0 NaN 1 NaN 2 NaN 3 NaN 4 0.0 Name: headshot_rate, dtype: float64
train["headshot_rate"] = train["headshot_rate"].fillna(0)
train.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | winPlacePerc | playersJoined | killsNorm | damageDealtNorm | maxPlaceNorm | matchDurationNorm | healsandboosts | totalDistance | killwithoutMoving | headshot_rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 0.4444 | 96 | 0.00 | 0.0000 | 29.12 | 1358.24 | 0 | 244.8000 | False | 0.0 |
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0.6400 | 91 | 0.00 | 99.7023 | 28.34 | 1936.93 | 0 | 1445.0445 | False | 0.0 |
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0.7755 | 98 | 0.00 | 69.3600 | 51.00 | 1344.36 | 0 | 161.8000 | False | 0.0 |
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0.1667 | 91 | 0.00 | 35.8610 | 33.79 | 1565.24 | 0 | 202.7000 | False | 0.0 |
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0.1875 | 97 | 1.03 | 103.0000 | 99.91 | 1466.72 | 0 | 49.7500 | False | 0.0 |
5 rows × 38 columns
train[(train["headshot_rate"] == 1) & (train["kills"] > 9)].head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | winPlacePerc | playersJoined | killsNorm | damageDealtNorm | maxPlaceNorm | matchDurationNorm | healsandboosts | totalDistance | killwithoutMoving | headshot_rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
281570 | ab9d7168570927 | add05ebde0214c | e016a873339c7b | 2 | 3 | 1212.0 | 8 | 10 | 0 | 1 | ... | 0.8462 | 93 | 10.70 | 1296.84 | 28.89 | 1522.61 | 3 | 2939.0 | False | 1.0 |
346124 | 044d18fc42fc75 | fc1dbc2df6a887 | 628107d4c41084 | 3 | 5 | 1620.0 | 13 | 11 | 3 | 1 | ... | 1.0000 | 96 | 11.44 | 1684.80 | 28.08 | 1796.08 | 8 | 8142.0 | False | 1.0 |
871244 | e668a25f5488e3 | 5ba8feabfb2a23 | f6e6581e03ba4f | 0 | 4 | 1365.0 | 9 | 13 | 0 | 1 | ... | 1.0000 | 98 | 13.26 | 1392.30 | 27.54 | 1280.10 | 4 | 2105.0 | False | 1.0 |
908815 | 566d8218b705aa | a9b056478d71b2 | 3a41552d553583 | 2 | 5 | 1535.0 | 10 | 10 | 3 | 1 | ... | 0.9630 | 95 | 10.50 | 1611.75 | 29.40 | 1929.90 | 8 | 7948.0 | False | 1.0 |
963463 | 1bd6fd288df4f0 | 90584ffa22fe15 | ba2de992ec7bb8 | 2 | 6 | 1355.0 | 12 | 10 | 2 | 1 | ... | 1.0000 | 96 | 10.40 | 1409.20 | 28.08 | 1473.68 | 8 | 3476.0 | False | 1.0 |
5 rows × 38 columns
train[(train["headshot_rate"] == 1) & (train["kills"] > 9)].index
Int64Index([ 281570, 346124, 871244, 908815, 963463, 1079403, 1167959, 1348164, 1380385, 1483199, 1581850, 1622232, 1753322, 2256755, 2375749, 2647056, 2825200, 3288424, 3594399, 3926325, 4036281, 4351048, 4387092, 4428741], dtype='int64')
train.drop(train[(train["headshot_rate"] == 1) & (train["kills"] > 9)].index, inplace=True)
train.shape
(4445307, 38)
异常值处理:删除最远杀敌距离异常数据
train[train["longestKill"] >=1000]
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | winPlacePerc | playersJoined | killsNorm | damageDealtNorm | maxPlaceNorm | matchDurationNorm | healsandboosts | totalDistance | killwithoutMoving | headshot_rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
202281 | 88e2af7d78af5a | 34ddeede52c042 | 4346bc63bc67fa | 0 | 3 | 783.9 | 5 | 1 | 1 | 5 | ... | 0.9231 | 88 | 4.48 | 877.968 | 30.24 | 2087.68 | 4 | 3775.20 | False | 0.250000 |
240005 | 41c2f5c0699807 | 9faecf87ab4275 | 634edab75860b3 | 5 | 0 | 1284.0 | 8 | 5 | 7 | 18 | ... | 0.5385 | 29 | 18.81 | 2195.640 | 23.94 | 2236.68 | 7 | 48.87 | False | 0.454545 |
324313 | ef390c152bcc3d | 30fd444be3bbc1 | 4f7f8d6cf558b4 | 2 | 0 | 1028.0 | 0 | 0 | 0 | 9 | ... | 1.0000 | 51 | 14.90 | 1531.720 | 19.37 | 1040.02 | 0 | 2981.00 | False | 0.000000 |
656553 | 9948b058562163 | c8cb8491112bf6 | 0104eeb664494d | 6 | 0 | 1410.0 | 17 | 5 | 0 | 3 | ... | 0.6000 | 41 | 25.44 | 2241.900 | 9.54 | 1734.69 | 0 | 29.21 | False | 0.312500 |
803632 | 4e7e6c74e3c57d | 94698690918933 | da91b0c3d875f8 | 0 | 0 | 196.8 | 0 | 0 | 0 | 51 | ... | 0.0000 | 61 | 1.39 | 273.552 | 11.12 | 654.69 | 0 | 3159.00 | False | 0.000000 |
895411 | 1f5ba6e0cfb968 | 512ea24b831be3 | 5fb0d8b1fc16cf | 4 | 0 | 1012.0 | 11 | 5 | 0 | 5 | ... | 0.9091 | 86 | 11.40 | 1153.680 | 13.68 | 1163.94 | 0 | 569.50 | False | 0.500000 |
1172437 | 303a93cfa1f46c | 8795d39fd0df86 | 9c8962b58bb3e3 | 2 | 1 | 329.3 | 0 | 0 | 2 | 45 | ... | 0.2857 | 58 | 4.26 | 467.606 | 11.36 | 825.02 | 3 | 832.50 | False | 0.000000 |
1209416 | 528659ff1c1aec | 7d1ba83423551d | ea9386587d5888 | 0 | 6 | 1640.0 | 0 | 7 | 0 | 1 | ... | 0.9412 | 52 | 22.20 | 2427.200 | 76.96 | 1827.80 | 6 | 2848.00 | False | 0.466667 |
1642712 | 91966848e08e2f | 0ee4fbd27657c9 | 17dea22cefe62a | 3 | 2 | 2103.0 | 0 | 4 | 11 | 11 | ... | 0.5000 | 28 | 39.56 | 3617.160 | 25.80 | 3092.56 | 13 | 235.30 | False | 0.173913 |
2015559 | 5ff0c1a9fab2ba | 2d8119b1544f87 | 904cecf36217df | 3 | 3 | 1302.0 | 0 | 6 | 5 | 15 | ... | 0.6000 | 42 | 17.38 | 2057.160 | 17.38 | 2834.52 | 8 | 133.20 | False | 0.545455 |
2122128 | 42df3102cb540b | 7d9b2be15b355b | 610d78f3affd2e | 5 | 0 | 2500.0 | 0 | 7 | 1 | 2 | ... | 0.0000 | 10 | 41.80 | 4750.000 | 3.80 | 3416.20 | 1 | 464.50 | False | 0.318182 |
2152425 | 4b9f61bac5eb0a | bc717b964f3bbe | 838cb9a3c94598 | 3 | 0 | 945.4 | 0 | 0 | 0 | 11 | ... | 0.5714 | 60 | 18.20 | 1323.560 | 11.20 | 1673.00 | 0 | 844.70 | False | 0.000000 |
2592718 | 24e0fec84c18e9 | 8404855ca02e48 | e886a8ebb702cf | 7 | 0 | 1684.0 | 0 | 4 | 7 | 11 | ... | 0.5714 | 26 | 22.62 | 2930.160 | 38.28 | 3118.08 | 7 | 4851.00 | False | 0.307692 |
2981715 | 7f77051c7cef52 | d6579a630399b5 | 4784f7d9a06b51 | 3 | 5 | 1025.0 | 5 | 2 | 5 | 2 | ... | 1.0000 | 93 | 6.42 | 1096.750 | 50.29 | 1453.06 | 10 | 4085.96 | False | 0.333333 |
3081503 | f19a76e8d7ac52 | 624d65c529f87c | de19b70121c40f | 3 | 0 | 1038.0 | 0 | 0 | 0 | 32 | ... | 0.8571 | 57 | 8.58 | 1484.340 | 11.44 | 945.23 | 0 | 270.00 | False | 0.000000 |
3255171 | 5524c154448425 | 674195558ad41b | db6f6d1f0d4904 | 1 | 0 | 1355.0 | 0 | 2 | 0 | 9 | ... | 0.5789 | 21 | 25.06 | 2425.450 | 35.80 | 1607.42 | 0 | 1039.00 | False | 0.142857 |
3304284 | d0c286ce498e17 | 17fdd45e612bab | 3eaaa2f7a360fe | 7 | 0 | 2330.0 | 0 | 2 | 0 | 2 | ... | 1.0000 | 53 | 29.40 | 3425.100 | 26.46 | 1321.53 | 0 | 68.02 | False | 0.100000 |
3320960 | 0040e53dfe7b5d | 650661c2351eb7 | 2daabf3a7852e6 | 0 | 0 | 399.0 | 2 | 0 | 6 | 14 | ... | 0.0000 | 15 | 7.40 | 738.150 | 14.80 | 2763.90 | 6 | 5481.00 | False | 0.000000 |
3552532 | db638834c62f6f | 0614b611d6a935 | ff80300f8262f5 | 2 | 0 | 517.0 | 0 | 0 | 0 | 10 | ... | 0.0000 | 30 | 8.50 | 878.900 | 6.80 | 612.00 | 0 | 1344.88 | False | 0.000000 |
4332473 | d8857d3d7e31b6 | 085de7a36897e6 | 42f997c16d8a0e | 5 | 0 | 1685.0 | 11 | 3 | 18 | 8 | ... | 0.9091 | 26 | 27.84 | 2931.900 | 20.88 | 3119.82 | 18 | 523.30 | False | 0.187500 |
20 rows × 38 columns
train[train["longestKill"] >=1000].index
Int64Index([ 202281, 240005, 324313, 656553, 803632, 895411, 1172437, 1209416, 1642712, 2015559, 2122128, 2152425, 2592718, 2981715, 3081503, 3255171, 3304284, 3320960, 3552532, 4332473], dtype='int64')
train.drop(train[train["longestKill"] >=1000].index, inplace=True)
train.shape
(4445287, 38)
异常值处理:删除关于运动距离的异常值
# 行走
train[train["walkDistance"] >=10000].index
Int64Index([ 23026, 34344, 49312, 68590, 94400, 125103, 136421, 136476, 154080, 154128, ... 4181311, 4230073, 4259976, 4284974, 4288445, 4306598, 4370543, 4380785, 4405009, 4415088], dtype='int64', length=219)
train.drop(train[train["walkDistance"] >=10000].index, inplace=True)
train.shape
(4445068, 38)
# 载具
train[train["rideDistance"] >=20000].index
Int64Index([ 28588, 63015, 70507, 72763, 95276, 140097, 297186, 371098, 403647, 426708, ... 4154459, 4191491, 4239725, 4248221, 4256764, 4270943, 4301013, 4386384, 4404738, 4440261], dtype='int64', length=150)
train.drop(train[train["rideDistance"] >=20000].index, inplace=True)
train.shape
(4444918, 38)
# 游泳
train[train["swimDistance"] >=2000].index
Int64Index([ 177973, 274258, 1005337, 1195818, 1227362, 1889163, 2065940, 2327586, 2784855, 3359439, 3513522, 4132225], dtype='int64')
# Remove players whose swimDistance is >= 2000, the same cut-off used to list
# the swim-distance outliers immediately before this statement.
# The previous threshold of 20000 had an extra zero and matched no rows, so
# the 12 listed outliers were never dropped (the recorded shape did not change).
# NOTE: with this fix the row counts recorded after this point shrink by 12.
train.drop(train[train["swimDistance"] >= 2000].index, inplace=True)
train.shape
(4444918, 38)
异常值处理:武器收集异常值处理
train[train["weaponsAcquired"] >=80].index
Int64Index([ 233643, 588387, 1437471, 1449293, 1592744, 1834515, 2373240, 2442962, 2743408, 2749693, 2797867, 2973445, 2977084, 2982525, 3230315, 3405716, 3951710, 4022031, 4288517], dtype='int64')
train.drop(train[train["weaponsAcquired"] >=80].index, inplace=True)
train.shape
(4444899, 38)
异常值处理:删除使用治疗药品数量异常值
train[train["heals"] >=80].index
Int64Index([4262662], dtype='int64')
train.drop(train[train["heals"] >=80].index, inplace=True)
train.shape
(4444898, 38)
类别型数据处理
比赛类型one-hot处理
train["matchType"].unique()
array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo', 'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp', 'flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad', 'crashtpp', 'normal-solo'], dtype=object)
train = pd.get_dummies(train, columns=["matchType"])
train.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | matchType_normal-duo | matchType_normal-duo-fpp | matchType_normal-solo | matchType_normal-solo-fpp | matchType_normal-squad | matchType_normal-squad-fpp | matchType_solo | matchType_solo-fpp | matchType_squad | matchType_squad-fpp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 53 columns
matchType_encoding = train.filter(regex="matchType")
matchType_encoding.head()
matchType_crashfpp | matchType_crashtpp | matchType_duo | matchType_duo-fpp | matchType_flarefpp | matchType_flaretpp | matchType_normal-duo | matchType_normal-duo-fpp | matchType_normal-solo | matchType_normal-solo-fpp | matchType_normal-squad | matchType_normal-squad-fpp | matchType_solo | matchType_solo-fpp | matchType_squad | matchType_squad-fpp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
对groupId,matchId等数据进行处理
train["groupId"].head()
0 4d4b580de459be 1 684d5656442f9e 2 6a4a42c3245a74 3 a930a9c79cd721 4 de04010b3458dd Name: groupId, dtype: object
# Convert groupId to a pandas categorical and keep its integer category
# codes in groupId_cat, giving the string identifier a numeric representation.
group_categorical = train["groupId"].astype("category")
train["groupId"] = group_categorical
train["groupId_cat"] = group_categorical.cat.codes
train["groupId_cat"].head()
0 613619 1 827616 2 843307 3 1340122 4 1757411 Name: groupId_cat, dtype: int32
# Convert matchId to a pandas categorical and keep its integer category
# codes in matchId_cat, mirroring the groupId encoding.
match_categorical = train["matchId"].astype("category")
train["matchId"] = match_categorical
train["matchId_cat"] = match_categorical.cat.codes
train["matchId_cat"].head()
0 30085 1 32751 2 3143 3 45260 4 20531 Name: matchId_cat, dtype: int32
train.head()
Id | groupId | matchId | assists | boosts | damageDealt | DBNOs | headshotKills | heals | killPlace | ... | matchType_normal-solo | matchType_normal-solo-fpp | matchType_normal-squad | matchType_normal-squad-fpp | matchType_solo | matchType_solo-fpp | matchType_squad | matchType_squad-fpp | groupId_cat | matchId_cat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7f96b2f878858a | 4d4b580de459be | a10357fd1a4a91 | 0 | 0 | 0.00 | 0 | 0 | 0 | 60 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 613619 | 30085 |
1 | eef90569b9d03c | 684d5656442f9e | aeb375fc57110c | 0 | 0 | 91.47 | 0 | 0 | 0 | 57 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 827616 | 32751 |
2 | 1eaf90ac73de72 | 6a4a42c3245a74 | 110163d8bb94ae | 1 | 0 | 68.00 | 0 | 0 | 0 | 47 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 843307 | 3143 |
3 | 4616d365dd2853 | a930a9c79cd721 | f1f1f4ef412d7e | 0 | 0 | 32.90 | 0 | 0 | 0 | 75 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1340122 | 45260 |
4 | 315c96c26c9aac | de04010b3458dd | 6dc8ff871e21e6 | 0 | 0 | 100.00 | 0 | 0 | 0 | 45 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1757411 | 20531 |
5 rows × 55 columns
train.drop(["groupId", "matchId"], axis=1, inplace=True)
数据截取
取部分数据进行使用(100000)
# Work on a random subset of 100000 rows to keep training time manageable.
# A fixed random_state makes the subset identical on every run, so results
# computed from it can be compared between runs.
df_sample = train.sample(100000, random_state=42)
df_sample.shape
(100000, 53)
确定特征值和目标值
# Target: the placement percentile to predict.
y = df_sample["winPlacePerc"]
# Features: every remaining column except the target itself and the
# per-row identifier "Id".
df = df_sample.drop(columns=["winPlacePerc", "Id"])
df.shape
(100000, 51)
y.shape
(100000,)
分割训练集和测试集
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. random_state pins the shuffle so
# the same rows land in the validation set on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    df, y, test_size=0.2, random_state=42
)
X_train.shape
(80000, 51)
y_train.shape
(80000,)
机器学习(模型训练)和评估
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
使用随机森林对模型进行训练
初步使用随机森林进行模型训练
# Baseline random forest trained on all features.
# random_state is fixed so the fitted trees, and therefore the
# feature_importances_ used later for feature selection, are reproducible.
m1 = RandomForestRegressor(
    n_estimators=40,
    min_samples_leaf=3,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
)
m1.fit(X_train, y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=3, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# Predict on the validation split; score() on a regressor reports R^2.
y_pre = m1.predict(X_valid)
m1.score(X_valid, y_valid)
0.907159951456783
# Validation MAE of the baseline forest (the project's evaluation metric).
mean_absolute_error(y_valid, y_pre)
0.06647584387091089
再次使用随机森林进行模型训练(仅保留特征重要性大于 0.005 的特征后重新训练)
# Per-feature importance scores of the fitted forest, in the column order of df.
m1.feature_importances_
array([1.87658119e-03, 8.63746064e-02, 2.57685962e-02, 2.27532062e-03, 8.80290888e-04, 2.81013428e-02, 2.35573150e-01, 2.07083462e-03, 1.22714874e-02, 1.09702183e-02, 2.53353780e-02, 1.03767737e-02, 6.77020063e-03, 7.45172312e-03, 4.28638708e-03, 3.28563034e-03, 2.12925123e-02, 1.99967979e-05, 3.98400033e-03, 1.36372746e-04, 1.32592980e-04, 1.71940999e-01, 4.20790087e-02, 2.52040557e-03, 6.33213818e-03, 7.30402941e-03, 1.11477263e-02, 7.52171932e-03, 1.19465432e-02, 5.30382552e-02, 1.81348675e-01, 0.00000000e+00, 2.22747761e-03, 3.44612223e-05, 0.00000000e+00, 2.02507233e-04, 5.97533128e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.54286394e-05, 0.00000000e+00, 4.94182811e-07, 0.00000000e+00, 3.11300574e-04, 1.93840925e-04, 9.02659739e-04, 1.04955041e-03, 9.87217660e-04, 4.53628891e-03, 4.50774311e-03])
# Pair every feature name with its importance so the two can be sorted and filtered together.
imp_df = pd.DataFrame({"cols":df.columns, "imp":m1.feature_importances_})
imp_df.head()
cols | imp | |
---|---|---|
0 | assists | 0.001877 |
1 | boosts | 0.086375 |
2 | damageDealt | 0.025769 |
3 | DBNOs | 0.002275 |
4 | headshotKills | 0.000880 |
# Order features from most to least important.
imp_df = imp_df.sort_values("imp", ascending=False)
imp_df.head()
cols | imp | |
---|---|---|
6 | killPlace | 0.235573 |
30 | totalDistance | 0.181349 |
21 | walkDistance | 0.171941 |
1 | boosts | 0.086375 |
29 | healsandboosts | 0.053038 |
# Horizontal bar chart of the first 20 rows of the importance table.
imp_df.head(20).plot(x="cols", y="imp", kind="barh", figsize=(20, 8))
# Keep only the names of features whose importance exceeds 0.005.
important_mask = imp_df["imp"] > 0.005
to_keep = imp_df.loc[important_mask, "cols"]
to_keep.shape
(20,)
# Restrict the feature matrix to the selected columns and split again.
# The seed is fixed so the split is reproducible: train_test_split picks rows
# from the row count and the seed only, so a seeded split on the full feature
# matrix with the same seed selects the same validation rows, which keeps the
# two models' scores comparable.
df_keep = df[to_keep]
X_train, X_valid, y_train, y_valid = train_test_split(
    df_keep, y, test_size=0.2, random_state=42
)
X_train.shape
(80000, 20)
# Same forest configuration as before, now trained on the reduced feature set.
# random_state is fixed so the fit, and the MAE derived from it, is
# repeatable between runs.
m2 = RandomForestRegressor(
    n_estimators=40,
    min_samples_leaf=3,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
)
m2.fit(X_train, y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=3, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# Predict on the validation split; score() on a regressor reports R^2.
y_pre = m2.predict(X_valid)
m2.score(X_valid, y_valid)
0.9125654968172906
# Validation MAE of the forest trained on the selected features.
mean_absolute_error(y_valid, y_pre)
0.06408683094647326
使用lightGBM对模型进行训练
# Go back to the full feature matrix for the LightGBM experiments.
# The seed is fixed so every LightGBM variant below is trained and scored on
# the same rows from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    df, y, test_size=0.2, random_state=42
)
X_train.shape
(80000, 51)
模型初次尝试
import lightgbm as lgb
# First LightGBM attempt: 20 boosting rounds at a small learning rate.
gbm = lgb.LGBMRegressor(
    objective="regression",
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=20,
)
# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of fit() was removed in LightGBM 4.0, while the callback works on
# both older and newer releases.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="l1",
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)
[1] valid_0's l1: 0.255801 valid_0's l2: 0.0863836 Training until validation scores don't improve for 5 rounds [2] valid_0's l1: 0.244604 valid_0's l2: 0.0792314 [3] valid_0's l1: 0.234038 valid_0's l2: 0.072761 [4] valid_0's l1: 0.224123 valid_0's l2: 0.0669453 [5] valid_0's l1: 0.214716 valid_0's l2: 0.0616647 [6] valid_0's l1: 0.205802 valid_0's l2: 0.0568409 [7] valid_0's l1: 0.197424 valid_0's l2: 0.0525102 [8] valid_0's l1: 0.189497 valid_0's l2: 0.0485595 [9] valid_0's l1: 0.18208 valid_0's l2: 0.0450087 [10] valid_0's l1: 0.175038 valid_0's l2: 0.0417809 [11] valid_0's l1: 0.168411 valid_0's l2: 0.0388494 [12] valid_0's l1: 0.162014 valid_0's l2: 0.0361473 [13] valid_0's l1: 0.156139 valid_0's l2: 0.0337388 [14] valid_0's l1: 0.150548 valid_0's l2: 0.031546 [15] valid_0's l1: 0.145259 valid_0's l2: 0.0295381 [16] valid_0's l1: 0.140261 valid_0's l2: 0.0277049 [17] valid_0's l1: 0.135596 valid_0's l2: 0.0260668 [18] valid_0's l1: 0.131269 valid_0's l2: 0.0245903 [19] valid_0's l1: 0.127159 valid_0's l2: 0.0232428 [20] valid_0's l1: 0.123315 valid_0's l2: 0.0220185 Did not meet early stopping. Best iteration is: [20] valid_0's l1: 0.123315 valid_0's l2: 0.0220185 LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.05, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=20, n_jobs=-1, num_leaves=31, objective='regression', random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
# Predict with the trees up to the best iteration recorded during fitting,
# then report the validation MAE.
y_pre = gbm.predict(X_valid, num_iteration=gbm.best_iteration_)
mean_absolute_error(y_valid, y_pre)
0.12331524150224461
模型二次调优
from sklearn.model_selection import GridSearchCV
# Grid-search learning_rate and n_estimators with 5-fold cross-validation.
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    "learning_rate": [0.01, 0.1, 1],
    "n_estimators": [40, 60, 80, 100, 200, 300],
}
# Rank candidates by (negated) MAE, the metric this project is evaluated on.
# Without `scoring`, GridSearchCV falls back to the estimator's own score(),
# which is R^2 for a regressor and can prefer a different candidate.
gbm = GridSearchCV(
    estimator,
    param_grid,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
)
gbm.fit(X_train, y_train)
GridSearchCV(cv=5, error_score=nan, estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0), iid='deprecated', n_jobs=-1, param_grid={'learning_rate': [0.01, 0.1, 1], 'n_estimators': [40, 60, 80, 100, 200, 300]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)
# Predict with the search object (it uses the refitted best estimator) and
# report the MAE on the held-out validation split.
y_pre = gbm.predict(X_valid)
mean_absolute_error(y_valid, y_pre)
0.05685004010605751
# Parameter combination that achieved the best cross-validated score.
gbm.best_params_
{'learning_rate': 0.1, 'n_estimators': 300}
模型三次调优
# n_estimators
# Sweep the cap on boosting rounds and record the validation MAE per value.
scores = []
n_estimators = [100, 300, 500, 800]
for nes in n_estimators:
    lgbm = lgb.LGBMRegressor(
        boosting_type="gbdt",
        num_leaves=31,
        max_depth=5,
        learning_rate=0.1,
        n_estimators=nes,
        min_child_samples=20,
        n_jobs=-1,
    )
    # Early stopping is requested through a callback: the
    # `early_stopping_rounds` keyword of fit() was removed in LightGBM 4.0.
    # NOTE(review): the same validation split drives early stopping and the
    # reported MAE, so the scores are somewhat optimistic.
    lgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="l1",
        callbacks=[lgb.early_stopping(stopping_rounds=5)],
    )
    y_pre = lgbm.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pre)
    scores.append(mae)
    print("本次结果输出的mae值是:\n", mae)
本次结果输出的mae值是:
0.0566209902947507
# Plot validation MAE against each n_estimators candidate.
plt.plot(n_estimators, scores, "o-")
plt.xlabel("n_estimator")
plt.ylabel("mae")
# Report the candidate with the lowest MAE (first one on ties).
best_idx = np.argmin(scores)
print("best n_estimator {}".format(n_estimators[best_idx]))
best n_estimator 500
# max_depth
# Sweep the tree depth limit with n_estimators held at 500 and record the
# validation MAE per value.
scores = []
max_depth = [3, 5, 7, 9, 11]
for md in max_depth:
    lgbm = lgb.LGBMRegressor(
        boosting_type="gbdt",
        num_leaves=31,
        max_depth=md,
        learning_rate=0.1,
        n_estimators=500,
        min_child_samples=20,
        n_jobs=-1,
    )
    # Early stopping is requested through a callback: the
    # `early_stopping_rounds` keyword of fit() was removed in LightGBM 4.0.
    # NOTE(review): the same validation split drives early stopping and the
    # reported MAE, so the scores are somewhat optimistic.
    lgbm.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="l1",
        callbacks=[lgb.early_stopping(stopping_rounds=5)],
    )
    y_pre = lgbm.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pre)
    scores.append(mae)
    print("本次结果输出的mae值是:\n", mae)
本次结果输出的mae值是:
0.0571923061736829
# Plot validation MAE against each max_depth candidate.
plt.plot(max_depth, scores, "o-")
plt.xlabel("max_depths")
plt.ylabel("mae")
# Report the candidate with the lowest MAE (first one on ties).
best_idx = np.argmin(scores)
print("best max_depths {}".format(max_depth[best_idx]))
best max_depths 5
# Validation MAE recorded for each max_depth candidate, in sweep order.
scores
[0.058867698663447106,
0.0566209902947507,
0.05695850296967709,
0.057414793402343275,
0.0571923061736829]